Coverage for build_validation_dataset.py: 55%
288 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-21 23:06 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-21 23:06 +0000
1# Copyright 2026 venim1103
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
15"""Build a portable held-out validation dataset for pretraining/eval runs.
17The default preset expands the original 3-pillar idea into 5 evaluation pillars:
18math, logic, code, tool use, and web text. This is intentionally evaluation-side
19granularity: the current pretraining recipe in train.py has 4 top-level buckets,
20with math and logic both feeding the formal_logic domain.
22Design goals:
23- Run the same way locally, in Colab, and in Kaggle.
24- Avoid Colab-only secret handling.
25- Keep the source plan explicit and configurable.
26- Emit a manifest so downstream validation runs know exactly what was built.
28Example:
29 python build_validation_dataset.py --output-dir val_data
30 python build_validation_dataset.py --upload-kaggle \
31 --kaggle-dataset-id your-name/mini-mamba-1b58-validation
32"""
34from __future__ import annotations
36import argparse
37import importlib
38import itertools
39import json
40import os
41import subprocess
42from collections import deque
43from datetime import datetime, timezone
44from pathlib import Path
46from datasets import Dataset, load_dataset, load_dataset_builder
47from huggingface_hub import HfFileSystem
# Name of the built-in source plan used when --preset is not given.
DEFAULT_PRESET = "balanced_5pillar"
# Default row budget per evaluation pillar (may be lowered by rebalancing).
DEFAULT_ROWS_PER_PILLAR = 900
# Primary HfFileSystem glob for FineWeb-Edu parquet shards.
DEFAULT_FINEWEB_GLOB = "datasets/HuggingFaceFW/fineweb-edu/data/*.parquet"
# Fallback globs tried in order by _load_fineweb_shard when the configured
# pattern matches no files (alternate repository layouts).
DEFAULT_FINEWEB_GLOB_CANDIDATES = (
    "datasets/HuggingFaceFW/fineweb-edu/data/*.parquet",
    "datasets/HuggingFaceFW/fineweb-edu/*/*/*.parquet",
    "datasets/HuggingFaceFW/fineweb-edu/**/**/*.parquet",
)
def _build_default_specs(rows_per_pillar=DEFAULT_ROWS_PER_PILLAR, fineweb_shard_index=500):
    """Return the built-in 5-pillar source plan (math, logic, code, tools, web).

    Each pillar spec is a dict consumed by _validate_specs/_materialize_pillar.
    """
    math_pillar = {
        "name": "math",
        "output_subdir": "math",
        "output_file": "gsm8k_test.parquet",
        "source_type": "hf_dataset",
        "dataset": "gsm8k",
        "config": "main",
        "split": "test",
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": "External math reasoning probe.",
    }
    logic_pillar = {
        "name": "logic",
        "output_subdir": "logic",
        "output_file": "sciq_validation.parquet",
        "source_type": "hf_dataset",
        "dataset": "allenai/sciq",
        "split": "validation",
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": (
            "Reasoning/science proxy for the repo's broader formal_logic bucket. "
            "Override via --config-file if you want a stricter logic benchmark."
        ),
    }
    code_pillar = {
        "name": "code",
        "output_subdir": "code",
        "output_file": "mbpp_test.parquet",
        "source_type": "hf_dataset",
        "dataset": "mbpp",
        "split": "test",
        "selection": "head",
        "rows": rows_per_pillar,
        "max_rows_hint": 500,
        "notes": "Held-out code generation benchmark.",
    }
    tools_pillar = {
        "name": "tools",
        "output_subdir": "tools",
        "output_file": "glaive_function_calling_tail.parquet",
        "source_type": "hf_dataset",
        "dataset": "glaiveai/glaive-function-calling-v2",
        "split": "train",
        "selection": "tail",
        "rows": rows_per_pillar,
        "notes": "Tail slice from a tool-calling corpus external to the repo's Toolformer source.",
    }
    web_pillar = {
        "name": "web",
        "output_subdir": "web",
        "output_file": f"fineweb_edu_shard_{fineweb_shard_index:04d}.parquet",
        "source_type": "fineweb_glob",
        "file_glob": DEFAULT_FINEWEB_GLOB,
        "shard_index": fineweb_shard_index,
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": (
            "General web-text probe sourced from a deep full-dataset shard. "
            "This reduces overlap risk with sample-10BT training data but does not prove zero overlap."
        ),
    }
    return [math_pillar, logic_pillar, code_pillar, tools_pillar, web_pillar]
# Registry of built-in source plans, keyed by preset name. Each entry pairs a
# human-readable description with a builder callable that returns pillar specs.
PRESET_REGISTRY = {
    DEFAULT_PRESET: {
        "description": (
            "Balanced 5-pillar validation set spanning math, logic, code, tool use, and web text."
        ),
        "builder": _build_default_specs,
    }
}
138def _resolve_selection_indices(total_rows, requested_rows, selection):
139 if requested_rows <= 0: 139 ↛ 140line 139 didn't jump to line 140 because the condition on line 139 was never true
140 raise ValueError("requested_rows must be > 0")
141 if total_rows <= 0: 141 ↛ 142line 141 didn't jump to line 142 because the condition on line 141 was never true
142 raise ValueError("total_rows must be > 0")
143 if requested_rows > total_rows:
144 raise ValueError(
145 f"Requested {requested_rows} rows from a dataset with only {total_rows} rows."
146 )
147 if selection == "head":
148 return list(range(requested_rows))
149 if selection == "tail": 149 ↛ 151line 149 didn't jump to line 151 because the condition on line 149 was always true
150 return list(range(total_rows - requested_rows, total_rows))
151 raise ValueError(f"Unsupported selection strategy: {selection!r}")
154def _load_specs_from_config(config_file, default_rows_per_pillar):
155 with open(config_file, "r", encoding="utf-8") as handle:
156 raw = json.load(handle)
158 if isinstance(raw, list):
159 specs = raw
160 elif isinstance(raw, dict): 160 ↛ 163line 160 didn't jump to line 163 because the condition on line 160 was always true
161 specs = raw.get("pillars", raw)
162 else:
163 raise ValueError("Config file must contain a JSON list or object.")
165 if not isinstance(specs, list): 165 ↛ 166line 165 didn't jump to line 166 because the condition on line 165 was never true
166 raise ValueError("Config file must be a JSON list or an object with a 'pillars' list.")
168 normalized = []
169 for spec in specs:
170 if not isinstance(spec, dict): 170 ↛ 171line 170 didn't jump to line 171 because the condition on line 170 was never true
171 raise ValueError("Each pillar spec must be a JSON object.")
172 updated = dict(spec)
173 updated.setdefault("rows", default_rows_per_pillar)
174 updated.setdefault("selection", "head")
175 normalized.append(updated)
176 return normalized
179def _validate_specs(specs):
180 common_required = ("name", "output_subdir", "output_file", "source_type", "rows", "selection")
182 for idx, spec in enumerate(specs): 182 ↛ exitline 182 didn't return from function '_validate_specs' because the loop on line 182 didn't complete
183 missing_common = [key for key in common_required if key not in spec]
184 if missing_common:
185 raise ValueError(f"Pillar spec #{idx} missing required keys: {', '.join(sorted(missing_common))}")
187 try:
188 rows = int(spec["rows"])
189 except (TypeError, ValueError) as exc:
190 raise ValueError(f"Pillar spec #{idx} has non-integer rows={spec['rows']!r}") from exc
191 if rows <= 0: 191 ↛ 192line 191 didn't jump to line 192 because the condition on line 191 was never true
192 raise ValueError(f"Pillar spec #{idx} must have rows > 0 (got {rows}).")
194 source_type = spec["source_type"]
195 if source_type == "hf_dataset": 195 ↛ 196line 195 didn't jump to line 196 because the condition on line 195 was never true
196 required = ("dataset", "split")
197 elif source_type == "fineweb_glob": 197 ↛ 198line 197 didn't jump to line 198 because the condition on line 197 was never true
198 required = ("file_glob",)
199 else:
200 raise ValueError(f"Pillar spec #{idx} has unsupported source_type={source_type!r}")
202 missing_source = [key for key in required if key not in spec]
203 if missing_source:
204 raise ValueError(
205 f"Pillar spec #{idx} ({spec['name']!r}) missing source keys: {', '.join(sorted(missing_source))}"
206 )
def _rebalance_rows_to_min_capacity(specs):
    """Deprecated placeholder; use _rebalance_rows_to_min_capacity_with_probe."""
    raise NotImplementedError("Use _rebalance_rows_to_min_capacity_with_probe instead.")
def _probe_hf_dataset_capacity(spec, hf_token):
    """Return split capacity (num_examples) without materializing the dataset.

    Probing is strictly best-effort: any failure (hub/network error, missing
    split metadata, non-numeric size) yields None rather than raising.
    """
    dataset_name = spec["dataset"]
    config_name = spec.get("config")
    split_name = spec["split"]

    try:
        if config_name:
            builder = load_dataset_builder(dataset_name, config_name, token=hf_token)
        else:
            builder = load_dataset_builder(dataset_name, token=hf_token)
    except Exception:
        # Broad on purpose: a failed probe must never abort the build.
        return None

    # builder.info.splits can be None (metadata not populated) as well as
    # absent; `or {}` covers both so .get() below cannot raise AttributeError.
    splits = getattr(builder.info, "splits", None) or {}
    split_info = splits.get(split_name)
    if split_info is None:
        return None
    num_examples = getattr(split_info, "num_examples", None)
    if num_examples is None:
        return None
    try:
        return int(num_examples)
    except (TypeError, ValueError):
        return None
239def _rebalance_rows_to_min_capacity_with_probe(specs, hf_token=None, probe_capacities=True):
240 """Balance row budgets using requested rows, hints, and optional split probing.
242 Returns:
243 balanced_rows: int
244 limiting: list[dict] entries for pillars that determined the minimum
245 details: list[dict] per-pillar effective limits and reasons
246 """
247 details = []
248 for spec in specs:
249 requested_rows = int(spec["rows"])
250 effective_rows = requested_rows
251 limiters = [f"requested={requested_rows}"]
253 hint = spec.get("max_rows_hint")
254 if hint is not None:
255 hint_int = int(hint)
256 if hint_int < effective_rows: 256 ↛ 258line 256 didn't jump to line 258 because the condition on line 256 was always true
257 effective_rows = hint_int
258 limiters.append(f"max_rows_hint={hint_int}")
260 if probe_capacities and spec.get("source_type") == "hf_dataset":
261 probed = _probe_hf_dataset_capacity(spec, hf_token)
262 if probed is not None: 262 ↛ 267line 262 didn't jump to line 267 because the condition on line 262 was always true
263 if probed < effective_rows: 263 ↛ 265line 263 didn't jump to line 265 because the condition on line 263 was always true
264 effective_rows = probed
265 limiters.append(f"probed_split_size={probed}")
267 details.append(
268 {
269 "name": spec.get("name", "unknown"),
270 "effective_rows": effective_rows,
271 "limiters": limiters,
272 }
273 )
275 balanced_rows = min(d["effective_rows"] for d in details)
276 limiting = [d for d in details if d["effective_rows"] == balanced_rows]
278 for spec in specs:
279 spec["rows"] = balanced_rows
281 return balanced_rows, limiting, details
284def _maybe_load_kaggle_secrets(secret_names=("HF_TOKEN", "KAGGLE_USERNAME", "KAGGLE_KEY")):
285 try:
286 kaggle_secrets = importlib.import_module("kaggle_secrets")
287 except ImportError:
288 return {}
290 client = kaggle_secrets.UserSecretsClient()
291 loaded = {}
292 for secret_name in secret_names:
293 if os.environ.get(secret_name):
294 continue
295 try:
296 value = client.get_secret(secret_name)
297 except Exception:
298 value = None
299 if value: 299 ↛ 292line 299 didn't jump to line 292 because the condition on line 299 was always true
300 os.environ[secret_name] = value
301 loaded[secret_name] = "loaded"
302 return loaded
305def _resolve_hf_token(env_var_name):
306 return os.environ.get(env_var_name) or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
309def _streaming_select_rows(stream, rows, selection):
310 if selection == "head":
311 records = list(itertools.islice(stream, rows))
312 return records, len(records)
314 if selection == "tail": 314 ↛ 322line 314 didn't jump to line 322 because the condition on line 314 was always true
315 buffer = deque(maxlen=rows)
316 total_rows = 0
317 for item in stream:
318 buffer.append(item)
319 total_rows += 1
320 return list(buffer), total_rows
322 raise ValueError(f"Unsupported selection strategy: {selection!r}")
def _load_remote_dataset(spec, hf_token, allow_partial=False):
    """Stream a pillar's Hugging Face dataset split and select its rows.

    Returns a (Dataset, source_meta) pair; raises ValueError when the split
    yields fewer rows than requested (unless allow_partial) or zero rows.
    """
    dataset_name = spec["dataset"]
    config_name = spec.get("config")
    split_name = spec["split"]
    rows = int(spec["rows"])
    selection = spec.get("selection", "head")

    # Optional config name is passed positionally, matching load_dataset's API.
    dataset_args = [dataset_name]
    if config_name:
        dataset_args.append(config_name)
    stream = load_dataset(*dataset_args, split=split_name, token=hf_token, streaming=True)

    records, total_rows_seen = _streaming_select_rows(stream, rows, selection)
    actual_rows = len(records)
    if actual_rows < rows and not allow_partial:
        raise ValueError(
            f"Pillar {spec['name']!r} requested {rows} rows, but {dataset_name}:{split_name} only yielded {actual_rows}."
        )
    if not records:
        raise ValueError(
            f"Pillar {spec['name']!r} yielded zero rows from {dataset_name}:{split_name}."
        )

    return Dataset.from_list(records), {
        "resolved_split": split_name,
        "selection": selection,
        "requested_rows": int(spec["rows"]),
        "actual_rows": actual_rows,
        "total_rows_seen": total_rows_seen,
        "streaming": True,
    }
def _load_fineweb_shard(spec, hf_token, allow_partial=False):
    """Pick one FineWeb parquet shard via HfFileSystem globbing and head-read rows.

    Tries the spec's file_glob first, then the DEFAULT_FINEWEB_GLOB_CANDIDATES
    fallbacks in order; the first pattern with matches wins. Returns a
    (Dataset, source_meta) pair describing exactly which file/glob was used.
    """
    rows = int(spec["rows"])
    shard_index = int(spec.get("shard_index", 0))
    fs = HfFileSystem(token=hf_token)

    parquet_files = []
    tried_globs = []
    # Configured glob first, then fallbacks not already in the list.
    candidate_globs = [spec.get("file_glob", DEFAULT_FINEWEB_GLOB)]
    for fallback in DEFAULT_FINEWEB_GLOB_CANDIDATES:
        if fallback not in candidate_globs:
            candidate_globs.append(fallback)

    # Sorted so shard_index maps to a stable file across runs; for/else sets
    # matched_glob to None only when no pattern matched anything.
    for pattern in candidate_globs:
        tried_globs.append(pattern)
        parquet_files = sorted(fs.glob(pattern))
        if parquet_files:
            matched_glob = pattern
            break
    else:
        matched_glob = None

    if not parquet_files:
        raise RuntimeError(
            "No FineWeb parquet files matched any known glob pattern. "
            f"Tried: {tried_globs}"
        )

    # Negative indices count from the end, like Python sequence indexing.
    if shard_index < 0:
        shard_index += len(parquet_files)
    if shard_index < 0 or shard_index >= len(parquet_files):
        raise ValueError(
            f"Shard index {shard_index} is out of range for {len(parquet_files)} files."
        )

    resolved_file = "hf://" + parquet_files[shard_index]
    # Stream the single parquet file so only the first `rows` records are pulled.
    stream = load_dataset(
        "parquet",
        data_files=resolved_file,
        split="train",
        streaming=True,
        token=hf_token,
    )
    records = list(itertools.islice(stream, rows))
    if not records:
        raise ValueError(
            f"Pillar {spec['name']!r} produced zero rows from shard {shard_index}. "
            "Pick a different shard or reduce --rows-per-pillar."
        )
    if len(records) < rows and not allow_partial:
        raise ValueError(
            f"Pillar {spec['name']!r} requested {rows} rows, but shard only produced {len(records)}."
        )
    return Dataset.from_list(records), {
        "resolved_file": resolved_file,
        "selection": "head",
        "requested_rows": rows,
        "actual_rows": len(records),
        "shard_count": len(parquet_files),
        "shard_index": shard_index,
        "matched_glob": matched_glob,
        "tried_globs": tried_globs,
    }
def _materialize_pillar(spec, output_dir, hf_token, allow_partial=False):
    """Fetch one pillar, write it to parquet, and return its manifest entry."""
    source_type = spec["source_type"]
    loaders = {
        "hf_dataset": _load_remote_dataset,
        "fineweb_glob": _load_fineweb_shard,
    }
    loader = loaders.get(source_type)
    if loader is None:
        raise ValueError(f"Unsupported source_type: {source_type!r}")
    dataset, source_meta = loader(spec, hf_token, allow_partial=allow_partial)

    pillar_dir = Path(output_dir) / spec["output_subdir"]
    pillar_dir.mkdir(parents=True, exist_ok=True)
    out_path = pillar_dir / spec["output_file"]
    dataset.to_parquet(str(out_path))

    # Output-layout keys are dropped from "source" since "path" records them.
    excluded_keys = {"output_subdir", "output_file"}
    return {
        "name": spec["name"],
        "path": str(out_path),
        "rows": int(dataset.num_rows),
        "source_type": source_type,
        "source": {key: value for key, value in spec.items() if key not in excluded_keys},
        "source_meta": source_meta,
        "notes": spec.get("notes", ""),
    }
452def _write_manifest(output_dir, manifest):
453 manifest_path = Path(output_dir) / "validation_manifest.json"
454 with open(manifest_path, "w", encoding="utf-8") as handle:
455 json.dump(manifest, handle, indent=2)
456 return manifest_path
459def _write_kaggle_metadata(output_dir, dataset_id, title, license_name="other"):
460 metadata = {
461 "title": title,
462 "id": dataset_id,
463 "licenses": [{"name": license_name}],
464 }
465 metadata_path = Path(output_dir) / "dataset-metadata.json"
466 with open(metadata_path, "w", encoding="utf-8") as handle:
467 json.dump(metadata, handle, indent=2)
468 return metadata_path
def _run_kaggle_cli(output_dir, *, update_existing, dir_mode, message=None):
    """Create or version the Kaggle dataset via the CLI; raise on failure.

    update_existing selects `kaggle datasets version` (with an optional -m
    message) over `kaggle datasets create`.
    """
    if update_existing:
        cmd = ["kaggle", "datasets", "version", "-p", str(output_dir), "--dir-mode", dir_mode]
        if message:
            cmd.extend(["-m", message])
    else:
        cmd = ["kaggle", "datasets", "create", "-p", str(output_dir), "--dir-mode", dir_mode]
    # check=False so we can surface the exit code in a single clear error.
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        raise RuntimeError(f"Kaggle CLI command failed with exit code {result.returncode}: {' '.join(cmd)}")
def parse_args():
    """Parse CLI options for building (and optionally uploading) the bundle."""
    parser = argparse.ArgumentParser(description="Build a portable validation dataset bundle.")
    parser.add_argument("--output-dir", default="val_data", help="Output directory for parquet files and manifest.")
    parser.add_argument(
        "--preset",
        default=DEFAULT_PRESET,
        choices=sorted(PRESET_REGISTRY),
        help="Built-in source plan to use when --config-file is not provided.",
    )
    parser.add_argument(
        "--config-file",
        help="Optional JSON config overriding the built-in preset. Accepts a list or {'pillars': [...]}.",
    )
    parser.add_argument(
        "--rows-per-pillar",
        type=int,
        default=DEFAULT_ROWS_PER_PILLAR,
        help="Default number of rows per pillar for built-in presets.",
    )
    # BooleanOptionalAction gives paired --flag / --no-flag switches.
    parser.add_argument(
        "--rebalance-to-min",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Keep pillar sizes balanced by lowering all row counts to the smallest "
            "feasible target across selected sources."
        ),
    )
    parser.add_argument(
        "--probe-capacities",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Probe HF dataset split capacities (num_examples) to choose a safe "
            "balanced row target before building pillars."
        ),
    )
    parser.add_argument(
        "--fineweb-shard-index",
        type=int,
        default=500,
        help="Shard index used by the built-in FineWeb pillar.",
    )
    parser.add_argument(
        "--hf-token-env",
        default="HF_TOKEN",
        help="Environment variable name that stores the Hugging Face token.",
    )
    parser.add_argument(
        "--allow-partial",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Allow a pillar to emit fewer rows than requested if the source is smaller.",
    )
    parser.add_argument(
        "--load-kaggle-secrets",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="On Kaggle notebooks, populate missing HF/Kaggle env vars from kaggle_secrets.",
    )
    # Kaggle upload options (only consulted when --upload-kaggle is set).
    parser.add_argument(
        "--upload-kaggle",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Create or update a Kaggle dataset after building the files.",
    )
    parser.add_argument("--kaggle-dataset-id", help="Kaggle dataset id in the form owner/slug.")
    parser.add_argument(
        "--kaggle-title",
        default="Mini Mamba 1.58b Validation Set",
        help="Human-readable Kaggle dataset title.",
    )
    parser.add_argument(
        "--kaggle-license",
        default="other",
        help="Kaggle metadata license code (for mixed external sources use 'other').",
    )
    parser.add_argument(
        "--kaggle-update",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Use 'kaggle datasets version' instead of 'kaggle datasets create'.",
    )
    parser.add_argument(
        "--kaggle-dir-mode",
        choices=["skip", "zip", "tar"],
        default="zip",
        help="Packaging mode forwarded to the Kaggle CLI.",
    )
    parser.add_argument(
        "--kaggle-message",
        default="Refresh validation bundle",
        help="Version message used when --kaggle-update is enabled.",
    )
    return parser.parse_args()
def main():
    """Build every pillar, write the manifest, and optionally upload to Kaggle."""
    args = parse_args()

    if args.load_kaggle_secrets:
        loaded = _maybe_load_kaggle_secrets()
        if loaded:
            print(f"Loaded missing Kaggle secrets: {', '.join(sorted(loaded))}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Source plan: an explicit JSON config wins over the built-in preset.
    if args.config_file:
        specs = _load_specs_from_config(args.config_file, args.rows_per_pillar)
        preset_name = "custom"
        preset_description = f"Custom config loaded from {args.config_file}"
    else:
        preset = PRESET_REGISTRY[args.preset]
        specs = preset["builder"](
            rows_per_pillar=args.rows_per_pillar,
            fineweb_shard_index=args.fineweb_shard_index,
        )
        preset_name = args.preset
        preset_description = preset["description"]

    _validate_specs(specs)

    # Optionally lower every pillar to the smallest feasible row budget so the
    # bundle stays balanced (mutates specs in place).
    if args.rebalance_to_min:
        balanced_rows, limiting, _details = _rebalance_rows_to_min_capacity_with_probe(
            specs,
            hf_token=_resolve_hf_token(args.hf_token_env),
            probe_capacities=args.probe_capacities,
        )
        print(f"Using balanced row target per pillar: {balanced_rows}")
        for entry in limiting:
            print(f" constrained by {entry['name']}: {', '.join(entry['limiters'])}")

    hf_token = _resolve_hf_token(args.hf_token_env)
    results = []
    for spec in specs:
        print(f"Building pillar: {spec['name']} -> {spec['output_subdir']}/{spec['output_file']}")
        result = _materialize_pillar(spec, output_dir, hf_token, allow_partial=args.allow_partial)
        print(f" wrote {result['rows']} rows to {result['path']}")
        results.append(result)

    # The manifest records provenance so downstream validation runs know
    # exactly what was built and from where.
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "preset": preset_name,
        "preset_description": preset_description,
        "rows_per_pillar_default": args.rows_per_pillar,
        "training_alignment_notes": [
            "Pretraining mix is defined in train.py, not context_config.py.",
            "The current pretraining recipe has 4 top-level buckets; this validation plan intentionally splits formal_logic into separate math and logic probes.",
            "Balanced 1:1:1:1:1 pillar weighting is an evaluation choice, not a training-mixture requirement.",
            "Deep-shard FineWeb sampling lowers overlap risk with sample-10BT training data but is not a formal zero-overlap proof.",
        ],
        "pillars": results,
    }
    manifest_path = _write_manifest(output_dir, manifest)
    print(f"Wrote manifest: {manifest_path}")

    if args.upload_kaggle:
        if not args.kaggle_dataset_id:
            raise ValueError("--kaggle-dataset-id is required when --upload-kaggle is enabled.")
        metadata_path = _write_kaggle_metadata(
            output_dir,
            args.kaggle_dataset_id,
            args.kaggle_title,
            license_name=args.kaggle_license,
        )
        print(f"Wrote Kaggle metadata: {metadata_path}")
        _run_kaggle_cli(
            output_dir,
            update_existing=args.kaggle_update,
            dir_mode=args.kaggle_dir_mode,
            message=args.kaggle_message,
        )
        print("Kaggle upload completed.")
# Script entry point.
if __name__ == "__main__":
    main()