Coverage for build_validation_dataset.py: 55%

288 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-21 23:06 +0000

1# Copyright 2026 venim1103 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15"""Build a portable held-out validation dataset for pretraining/eval runs. 

16 

17The default preset expands the original 3-pillar idea into 5 evaluation pillars: 

18math, logic, code, tool use, and web text. This is intentionally evaluation-side 

19granularity: the current pretraining recipe in train.py has 4 top-level buckets, 

20with math and logic both feeding the formal_logic domain. 

21 

22Design goals: 

23- Run the same way locally, in Colab, and in Kaggle. 

24- Avoid Colab-only secret handling. 

25- Keep the source plan explicit and configurable. 

26- Emit a manifest so downstream validation runs know exactly what was built. 

27 

28Example: 

29 python build_validation_dataset.py --output-dir val_data 

30 python build_validation_dataset.py --upload-kaggle \ 

31 --kaggle-dataset-id your-name/mini-mamba-1b58-validation 

32""" 

33 

34from __future__ import annotations 

35 

36import argparse 

37import importlib 

38import itertools 

39import json 

40import os 

41import subprocess 

42from collections import deque 

43from datetime import datetime, timezone 

44from pathlib import Path 

45 

46from datasets import Dataset, load_dataset, load_dataset_builder 

47from huggingface_hub import HfFileSystem 

48 

49 

# Name of the built-in source plan used when --preset is not overridden.
DEFAULT_PRESET = "balanced_5pillar"
# Default per-pillar row budget; rebalancing may lower it to the smallest feasible source.
DEFAULT_ROWS_PER_PILLAR = 900
# Primary HfFileSystem glob for FineWeb-Edu parquet shards.
DEFAULT_FINEWEB_GLOB = "datasets/HuggingFaceFW/fineweb-edu/data/*.parquet"
# Fallback globs tried in order by _load_fineweb_shard if the upstream repo layout changes.
DEFAULT_FINEWEB_GLOB_CANDIDATES = (
    "datasets/HuggingFaceFW/fineweb-edu/data/*.parquet",
    "datasets/HuggingFaceFW/fineweb-edu/*/*/*.parquet",
    "datasets/HuggingFaceFW/fineweb-edu/**/**/*.parquet",
)

58 

59 

def _build_default_specs(rows_per_pillar=DEFAULT_ROWS_PER_PILLAR, fineweb_shard_index=500):
    """Return the built-in 5-pillar source plan (math, logic, code, tools, web).

    Args:
        rows_per_pillar: default row budget applied to every pillar.
        fineweb_shard_index: which full-dataset FineWeb shard feeds the web pillar.

    Returns:
        A list of pillar spec dicts understood by _validate_specs/_materialize_pillar.
    """
    # Pillar 1: external math reasoning probe.
    math_pillar = {
        "name": "math",
        "output_subdir": "math",
        "output_file": "gsm8k_test.parquet",
        "source_type": "hf_dataset",
        "dataset": "gsm8k",
        "config": "main",
        "split": "test",
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": "External math reasoning probe.",
    }
    # Pillar 2: science/reasoning proxy standing in for a strict logic benchmark.
    logic_pillar = {
        "name": "logic",
        "output_subdir": "logic",
        "output_file": "sciq_validation.parquet",
        "source_type": "hf_dataset",
        "dataset": "allenai/sciq",
        "split": "validation",
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": (
            "Reasoning/science proxy for the repo's broader formal_logic bucket. "
            "Override via --config-file if you want a stricter logic benchmark."
        ),
    }
    # Pillar 3: held-out code generation (MBPP test split is small; hint caps it).
    code_pillar = {
        "name": "code",
        "output_subdir": "code",
        "output_file": "mbpp_test.parquet",
        "source_type": "hf_dataset",
        "dataset": "mbpp",
        "split": "test",
        "selection": "head",
        "rows": rows_per_pillar,
        "max_rows_hint": 500,
        "notes": "Held-out code generation benchmark.",
    }
    # Pillar 4: tool use, taken from the *tail* of the corpus to avoid head overlap.
    tools_pillar = {
        "name": "tools",
        "output_subdir": "tools",
        "output_file": "glaive_function_calling_tail.parquet",
        "source_type": "hf_dataset",
        "dataset": "glaiveai/glaive-function-calling-v2",
        "split": "train",
        "selection": "tail",
        "rows": rows_per_pillar,
        "notes": "Tail slice from a tool-calling corpus external to the repo's Toolformer source.",
    }
    # Pillar 5: general web text from a deep FineWeb shard.
    web_pillar = {
        "name": "web",
        "output_subdir": "web",
        "output_file": f"fineweb_edu_shard_{fineweb_shard_index:04d}.parquet",
        "source_type": "fineweb_glob",
        "file_glob": DEFAULT_FINEWEB_GLOB,
        "shard_index": fineweb_shard_index,
        "selection": "head",
        "rows": rows_per_pillar,
        "notes": (
            "General web-text probe sourced from a deep full-dataset shard. "
            "This reduces overlap risk with sample-10BT training data but does not prove zero overlap."
        ),
    }
    return [math_pillar, logic_pillar, code_pillar, tools_pillar, web_pillar]

126 

127 

# Registry of built-in source plans, keyed by preset name.  parse_args() exposes
# the keys as --preset choices; main() reads "description" and calls "builder"
# with rows_per_pillar and fineweb_shard_index keyword arguments.
PRESET_REGISTRY = {
    DEFAULT_PRESET: {
        "description": (
            "Balanced 5-pillar validation set spanning math, logic, code, tool use, and web text."
        ),
        "builder": _build_default_specs,
    }
}

136 

137 

138def _resolve_selection_indices(total_rows, requested_rows, selection): 

139 if requested_rows <= 0: 139 ↛ 140line 139 didn't jump to line 140 because the condition on line 139 was never true

140 raise ValueError("requested_rows must be > 0") 

141 if total_rows <= 0: 141 ↛ 142line 141 didn't jump to line 142 because the condition on line 141 was never true

142 raise ValueError("total_rows must be > 0") 

143 if requested_rows > total_rows: 

144 raise ValueError( 

145 f"Requested {requested_rows} rows from a dataset with only {total_rows} rows." 

146 ) 

147 if selection == "head": 

148 return list(range(requested_rows)) 

149 if selection == "tail": 149 ↛ 151line 149 didn't jump to line 151 because the condition on line 149 was always true

150 return list(range(total_rows - requested_rows, total_rows)) 

151 raise ValueError(f"Unsupported selection strategy: {selection!r}") 

152 

153 

154def _load_specs_from_config(config_file, default_rows_per_pillar): 

155 with open(config_file, "r", encoding="utf-8") as handle: 

156 raw = json.load(handle) 

157 

158 if isinstance(raw, list): 

159 specs = raw 

160 elif isinstance(raw, dict): 160 ↛ 163line 160 didn't jump to line 163 because the condition on line 160 was always true

161 specs = raw.get("pillars", raw) 

162 else: 

163 raise ValueError("Config file must contain a JSON list or object.") 

164 

165 if not isinstance(specs, list): 165 ↛ 166line 165 didn't jump to line 166 because the condition on line 165 was never true

166 raise ValueError("Config file must be a JSON list or an object with a 'pillars' list.") 

167 

168 normalized = [] 

169 for spec in specs: 

170 if not isinstance(spec, dict): 170 ↛ 171line 170 didn't jump to line 171 because the condition on line 170 was never true

171 raise ValueError("Each pillar spec must be a JSON object.") 

172 updated = dict(spec) 

173 updated.setdefault("rows", default_rows_per_pillar) 

174 updated.setdefault("selection", "head") 

175 normalized.append(updated) 

176 return normalized 

177 

178 

179def _validate_specs(specs): 

180 common_required = ("name", "output_subdir", "output_file", "source_type", "rows", "selection") 

181 

182 for idx, spec in enumerate(specs): 182 ↛ exitline 182 didn't return from function '_validate_specs' because the loop on line 182 didn't complete

183 missing_common = [key for key in common_required if key not in spec] 

184 if missing_common: 

185 raise ValueError(f"Pillar spec #{idx} missing required keys: {', '.join(sorted(missing_common))}") 

186 

187 try: 

188 rows = int(spec["rows"]) 

189 except (TypeError, ValueError) as exc: 

190 raise ValueError(f"Pillar spec #{idx} has non-integer rows={spec['rows']!r}") from exc 

191 if rows <= 0: 191 ↛ 192line 191 didn't jump to line 192 because the condition on line 191 was never true

192 raise ValueError(f"Pillar spec #{idx} must have rows > 0 (got {rows}).") 

193 

194 source_type = spec["source_type"] 

195 if source_type == "hf_dataset": 195 ↛ 196line 195 didn't jump to line 196 because the condition on line 195 was never true

196 required = ("dataset", "split") 

197 elif source_type == "fineweb_glob": 197 ↛ 198line 197 didn't jump to line 198 because the condition on line 197 was never true

198 required = ("file_glob",) 

199 else: 

200 raise ValueError(f"Pillar spec #{idx} has unsupported source_type={source_type!r}") 

201 

202 missing_source = [key for key in required if key not in spec] 

203 if missing_source: 

204 raise ValueError( 

205 f"Pillar spec #{idx} ({spec['name']!r}) missing source keys: {', '.join(sorted(missing_source))}" 

206 ) 

207 

208 

209def _rebalance_rows_to_min_capacity(specs): 

210 raise NotImplementedError("Use _rebalance_rows_to_min_capacity_with_probe instead.") 

211 

212 

def _probe_hf_dataset_capacity(spec, hf_token):
    """Return split capacity (num_examples) without materializing the dataset.

    Args:
        spec: pillar spec with "dataset", optional "config", and "split" keys.
        hf_token: Hugging Face token forwarded to the hub; may be None.

    Returns:
        The split's example count as an int, or None whenever the builder
        cannot be loaded or does not expose usable split metadata.  Returning
        None instead of raising lets rebalancing degrade gracefully.
    """
    dataset_name = spec["dataset"]
    config_name = spec.get("config")
    split_name = spec["split"]

    try:
        if config_name:
            builder = load_dataset_builder(dataset_name, config_name, token=hf_token)
        else:
            builder = load_dataset_builder(dataset_name, token=hf_token)
    except Exception:
        # Best-effort probe: any hub/network/builder failure means "capacity unknown".
        return None

    # builder.info.splits may be None when split metadata is not populated.
    # The previous getattr(..., {}) default only covered a *missing* attribute,
    # so a present-but-None value crashed with AttributeError on .get().
    splits = getattr(builder.info, "splits", None) or {}
    split_info = splits.get(split_name)
    if split_info is None:
        return None
    num_examples = getattr(split_info, "num_examples", None)
    if num_examples is None:
        return None
    try:
        return int(num_examples)
    except (TypeError, ValueError):
        return None

237 

238 

239def _rebalance_rows_to_min_capacity_with_probe(specs, hf_token=None, probe_capacities=True): 

240 """Balance row budgets using requested rows, hints, and optional split probing. 

241 

242 Returns: 

243 balanced_rows: int 

244 limiting: list[dict] entries for pillars that determined the minimum 

245 details: list[dict] per-pillar effective limits and reasons 

246 """ 

247 details = [] 

248 for spec in specs: 

249 requested_rows = int(spec["rows"]) 

250 effective_rows = requested_rows 

251 limiters = [f"requested={requested_rows}"] 

252 

253 hint = spec.get("max_rows_hint") 

254 if hint is not None: 

255 hint_int = int(hint) 

256 if hint_int < effective_rows: 256 ↛ 258line 256 didn't jump to line 258 because the condition on line 256 was always true

257 effective_rows = hint_int 

258 limiters.append(f"max_rows_hint={hint_int}") 

259 

260 if probe_capacities and spec.get("source_type") == "hf_dataset": 

261 probed = _probe_hf_dataset_capacity(spec, hf_token) 

262 if probed is not None: 262 ↛ 267line 262 didn't jump to line 267 because the condition on line 262 was always true

263 if probed < effective_rows: 263 ↛ 265line 263 didn't jump to line 265 because the condition on line 263 was always true

264 effective_rows = probed 

265 limiters.append(f"probed_split_size={probed}") 

266 

267 details.append( 

268 { 

269 "name": spec.get("name", "unknown"), 

270 "effective_rows": effective_rows, 

271 "limiters": limiters, 

272 } 

273 ) 

274 

275 balanced_rows = min(d["effective_rows"] for d in details) 

276 limiting = [d for d in details if d["effective_rows"] == balanced_rows] 

277 

278 for spec in specs: 

279 spec["rows"] = balanced_rows 

280 

281 return balanced_rows, limiting, details 

282 

283 

284def _maybe_load_kaggle_secrets(secret_names=("HF_TOKEN", "KAGGLE_USERNAME", "KAGGLE_KEY")): 

285 try: 

286 kaggle_secrets = importlib.import_module("kaggle_secrets") 

287 except ImportError: 

288 return {} 

289 

290 client = kaggle_secrets.UserSecretsClient() 

291 loaded = {} 

292 for secret_name in secret_names: 

293 if os.environ.get(secret_name): 

294 continue 

295 try: 

296 value = client.get_secret(secret_name) 

297 except Exception: 

298 value = None 

299 if value: 299 ↛ 292line 299 didn't jump to line 292 because the condition on line 299 was always true

300 os.environ[secret_name] = value 

301 loaded[secret_name] = "loaded" 

302 return loaded 

303 

304 

305def _resolve_hf_token(env_var_name): 

306 return os.environ.get(env_var_name) or os.environ.get("HUGGINGFACEHUB_API_TOKEN") 

307 

308 

309def _streaming_select_rows(stream, rows, selection): 

310 if selection == "head": 

311 records = list(itertools.islice(stream, rows)) 

312 return records, len(records) 

313 

314 if selection == "tail": 314 ↛ 322line 314 didn't jump to line 322 because the condition on line 314 was always true

315 buffer = deque(maxlen=rows) 

316 total_rows = 0 

317 for item in stream: 

318 buffer.append(item) 

319 total_rows += 1 

320 return list(buffer), total_rows 

321 

322 raise ValueError(f"Unsupported selection strategy: {selection!r}") 

323 

324 

def _load_remote_dataset(spec, hf_token, allow_partial=False):
    """Stream a head/tail slice of a Hub dataset and return it as an in-memory Dataset.

    Args:
        spec: validated pillar spec ("dataset", optional "config", "split", "rows", "selection").
        hf_token: Hugging Face token, may be None for public datasets.
        allow_partial: when True, accept fewer rows than requested.

    Returns:
        (Dataset, source_meta) where source_meta records what was actually fetched.

    Raises:
        ValueError: on zero rows, or a short yield while allow_partial is False.
    """
    dataset_name = spec["dataset"]
    config_name = spec.get("config")
    split_name = spec["split"]
    rows = int(spec["rows"])
    selection = spec.get("selection", "head")

    # Streaming avoids downloading whole splits just to keep a small slice.
    positional = [dataset_name, config_name] if config_name else [dataset_name]
    stream = load_dataset(*positional, split=split_name, token=hf_token, streaming=True)

    records, total_rows_seen = _streaming_select_rows(stream, rows, selection)
    actual_rows = len(records)
    if not allow_partial and actual_rows < rows:
        raise ValueError(
            f"Pillar {spec['name']!r} requested {rows} rows, but {dataset_name}:{split_name} only yielded {actual_rows}."
        )
    if not records:
        raise ValueError(
            f"Pillar {spec['name']!r} yielded zero rows from {dataset_name}:{split_name}."
        )

    source_meta = {
        "resolved_split": split_name,
        "selection": selection,
        "requested_rows": int(spec["rows"]),
        "actual_rows": actual_rows,
        "total_rows_seen": total_rows_seen,
        "streaming": True,
    }
    return Dataset.from_list(records), source_meta

361 

362 

def _load_fineweb_shard(spec, hf_token, allow_partial=False):
    """Stream the head of one deep FineWeb-Edu parquet shard as a Dataset.

    Tries the spec's own glob first, then the known fallback layouts, so the
    pillar keeps working if the upstream repo reorganizes its files.

    Args:
        spec: validated pillar spec with "rows" and optional "file_glob"/"shard_index".
        hf_token: Hugging Face token forwarded to the filesystem and loader; may be None.
        allow_partial: when True, accept fewer rows than requested.

    Returns:
        (Dataset, source_meta) where source_meta records the resolved file,
        shard bookkeeping, and which glob patterns were tried/matched.

    Raises:
        RuntimeError: when no glob pattern matches any parquet file.
        ValueError: for an out-of-range shard index, zero rows, or a short
            yield while allow_partial is False.
    """
    rows = int(spec["rows"])
    shard_index = int(spec.get("shard_index", 0))
    fs = HfFileSystem(token=hf_token)

    parquet_files = []
    tried_globs = []
    # Start with the spec's own glob, then append default candidates not already present.
    candidate_globs = [spec.get("file_glob", DEFAULT_FINEWEB_GLOB)]
    for fallback in DEFAULT_FINEWEB_GLOB_CANDIDATES:
        if fallback not in candidate_globs:
            candidate_globs.append(fallback)

    for pattern in candidate_globs:
        tried_globs.append(pattern)
        # Sorting makes shard_index deterministic across runs.
        parquet_files = sorted(fs.glob(pattern))
        if parquet_files:
            matched_glob = pattern
            break
    else:
        # for/else: the loop exhausted every candidate without a match.
        matched_glob = None

    if not parquet_files:
        raise RuntimeError(
            "No FineWeb parquet files matched any known glob pattern. "
            f"Tried: {tried_globs}"
        )

    # Negative indices count from the end, mirroring Python sequence semantics.
    if shard_index < 0:
        shard_index += len(parquet_files)
    if shard_index < 0 or shard_index >= len(parquet_files):
        raise ValueError(
            f"Shard index {shard_index} is out of range for {len(parquet_files)} files."
        )

    resolved_file = "hf://" + parquet_files[shard_index]
    stream = load_dataset(
        "parquet",
        data_files=resolved_file,
        split="train",
        streaming=True,
        token=hf_token,
    )
    # Head-only selection: take the first `rows` records of the chosen shard.
    records = list(itertools.islice(stream, rows))
    if not records:
        raise ValueError(
            f"Pillar {spec['name']!r} produced zero rows from shard {shard_index}. "
            "Pick a different shard or reduce --rows-per-pillar."
        )
    if len(records) < rows and not allow_partial:
        raise ValueError(
            f"Pillar {spec['name']!r} requested {rows} rows, but shard only produced {len(records)}."
        )
    # source_meta mirrors the hf_dataset loader's shape so the manifest stays uniform.
    return Dataset.from_list(records), {
        "resolved_file": resolved_file,
        "selection": "head",
        "requested_rows": rows,
        "actual_rows": len(records),
        "shard_count": len(parquet_files),
        "shard_index": shard_index,
        "matched_glob": matched_glob,
        "tried_globs": tried_globs,
    }

425 

426 

def _materialize_pillar(spec, output_dir, hf_token, allow_partial=False):
    """Build one pillar: fetch its rows, write a parquet file, return result metadata.

    Raises:
        ValueError: for an unsupported source_type.
    """
    source_type = spec["source_type"]
    loaders = {
        "hf_dataset": _load_remote_dataset,
        "fineweb_glob": _load_fineweb_shard,
    }
    loader = loaders.get(source_type)
    if loader is None:
        raise ValueError(f"Unsupported source_type: {source_type!r}")
    dataset, source_meta = loader(spec, hf_token, allow_partial=allow_partial)

    pillar_dir = Path(output_dir) / spec["output_subdir"]
    pillar_dir.mkdir(parents=True, exist_ok=True)
    out_path = pillar_dir / spec["output_file"]
    dataset.to_parquet(str(out_path))

    # Drop only the output-layout keys; the rest is provenance for the manifest.
    provenance = {
        key: value for key, value in spec.items() if key not in {"output_subdir", "output_file"}
    }
    return {
        "name": spec["name"],
        "path": str(out_path),
        "rows": int(dataset.num_rows),
        "source_type": source_type,
        "source": provenance,
        "source_meta": source_meta,
        "notes": spec.get("notes", ""),
    }

450 

451 

452def _write_manifest(output_dir, manifest): 

453 manifest_path = Path(output_dir) / "validation_manifest.json" 

454 with open(manifest_path, "w", encoding="utf-8") as handle: 

455 json.dump(manifest, handle, indent=2) 

456 return manifest_path 

457 

458 

459def _write_kaggle_metadata(output_dir, dataset_id, title, license_name="other"): 

460 metadata = { 

461 "title": title, 

462 "id": dataset_id, 

463 "licenses": [{"name": license_name}], 

464 } 

465 metadata_path = Path(output_dir) / "dataset-metadata.json" 

466 with open(metadata_path, "w", encoding="utf-8") as handle: 

467 json.dump(metadata, handle, indent=2) 

468 return metadata_path 

469 

470 

def _run_kaggle_cli(output_dir, *, update_existing, dir_mode, message=None):
    """Invoke the Kaggle CLI to create (or version) the dataset rooted at output_dir.

    Raises:
        RuntimeError: if the CLI exits non-zero.
    """
    if update_existing:
        cmd = ["kaggle", "datasets", "version", "-p", str(output_dir), "--dir-mode", dir_mode]
        if message:
            cmd.extend(["-m", message])
    else:
        cmd = ["kaggle", "datasets", "create", "-p", str(output_dir), "--dir-mode", dir_mode]
    # check=False so we can surface a single uniform error message ourselves.
    result = subprocess.run(cmd, check=False)
    if result.returncode:
        raise RuntimeError(f"Kaggle CLI command failed with exit code {result.returncode}: {' '.join(cmd)}")

482 

483 

def parse_args():
    """Parse CLI arguments for building and optionally uploading the bundle.

    Returns:
        argparse.Namespace with build options (output dir, preset/config,
        row budgets, rebalancing/probing toggles) and Kaggle upload options.
    """
    parser = argparse.ArgumentParser(description="Build a portable validation dataset bundle.")
    # --- Build options -------------------------------------------------------
    parser.add_argument("--output-dir", default="val_data", help="Output directory for parquet files and manifest.")
    parser.add_argument(
        "--preset",
        default=DEFAULT_PRESET,
        choices=sorted(PRESET_REGISTRY),
        help="Built-in source plan to use when --config-file is not provided.",
    )
    parser.add_argument(
        "--config-file",
        help="Optional JSON config overriding the built-in preset. Accepts a list or {'pillars': [...]}.",
    )
    parser.add_argument(
        "--rows-per-pillar",
        type=int,
        default=DEFAULT_ROWS_PER_PILLAR,
        help="Default number of rows per pillar for built-in presets.",
    )
    # BooleanOptionalAction (3.9+) gives paired --flag / --no-flag switches.
    parser.add_argument(
        "--rebalance-to-min",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Keep pillar sizes balanced by lowering all row counts to the smallest "
            "feasible target across selected sources."
        ),
    )
    parser.add_argument(
        "--probe-capacities",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Probe HF dataset split capacities (num_examples) to choose a safe "
            "balanced row target before building pillars."
        ),
    )
    parser.add_argument(
        "--fineweb-shard-index",
        type=int,
        default=500,
        help="Shard index used by the built-in FineWeb pillar.",
    )
    parser.add_argument(
        "--hf-token-env",
        default="HF_TOKEN",
        help="Environment variable name that stores the Hugging Face token.",
    )
    parser.add_argument(
        "--allow-partial",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Allow a pillar to emit fewer rows than requested if the source is smaller.",
    )
    parser.add_argument(
        "--load-kaggle-secrets",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="On Kaggle notebooks, populate missing HF/Kaggle env vars from kaggle_secrets.",
    )
    # --- Kaggle upload options -----------------------------------------------
    parser.add_argument(
        "--upload-kaggle",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Create or update a Kaggle dataset after building the files.",
    )
    parser.add_argument("--kaggle-dataset-id", help="Kaggle dataset id in the form owner/slug.")
    parser.add_argument(
        "--kaggle-title",
        default="Mini Mamba 1.58b Validation Set",
        help="Human-readable Kaggle dataset title.",
    )
    parser.add_argument(
        "--kaggle-license",
        default="other",
        help="Kaggle metadata license code (for mixed external sources use 'other').",
    )
    parser.add_argument(
        "--kaggle-update",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Use 'kaggle datasets version' instead of 'kaggle datasets create'.",
    )
    parser.add_argument(
        "--kaggle-dir-mode",
        choices=["skip", "zip", "tar"],
        default="zip",
        help="Packaging mode forwarded to the Kaggle CLI.",
    )
    parser.add_argument(
        "--kaggle-message",
        default="Refresh validation bundle",
        help="Version message used when --kaggle-update is enabled.",
    )
    return parser.parse_args()

579 

580 

def main():
    """CLI entry point: build pillar files, write the manifest, optionally upload.

    Flow: (optionally) load Kaggle secrets, resolve the pillar specs from a
    preset or config file, validate and (optionally) rebalance them, build
    each pillar to parquet, write the manifest, then (optionally) push the
    output directory to Kaggle.
    """
    args = parse_args()

    if args.load_kaggle_secrets:
        loaded = _maybe_load_kaggle_secrets()
        if loaded:
            print(f"Loaded missing Kaggle secrets: {', '.join(sorted(loaded))}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.config_file:
        specs = _load_specs_from_config(args.config_file, args.rows_per_pillar)
        preset_name = "custom"
        preset_description = f"Custom config loaded from {args.config_file}"
    else:
        preset = PRESET_REGISTRY[args.preset]
        specs = preset["builder"](
            rows_per_pillar=args.rows_per_pillar,
            fineweb_shard_index=args.fineweb_shard_index,
        )
        preset_name = args.preset
        preset_description = preset["description"]

    _validate_specs(specs)

    # Resolve the token once; previously it was resolved separately for the
    # rebalance probe and again for the build loop.
    hf_token = _resolve_hf_token(args.hf_token_env)

    if args.rebalance_to_min:
        balanced_rows, limiting, _details = _rebalance_rows_to_min_capacity_with_probe(
            specs,
            hf_token=hf_token,
            probe_capacities=args.probe_capacities,
        )
        print(f"Using balanced row target per pillar: {balanced_rows}")
        for entry in limiting:
            print(f"  constrained by {entry['name']}: {', '.join(entry['limiters'])}")

    results = []
    for spec in specs:
        print(f"Building pillar: {spec['name']} -> {spec['output_subdir']}/{spec['output_file']}")
        result = _materialize_pillar(spec, output_dir, hf_token, allow_partial=args.allow_partial)
        print(f"  wrote {result['rows']} rows to {result['path']}")
        results.append(result)

    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "preset": preset_name,
        "preset_description": preset_description,
        "rows_per_pillar_default": args.rows_per_pillar,
        "training_alignment_notes": [
            "Pretraining mix is defined in train.py, not context_config.py.",
            "The current pretraining recipe has 4 top-level buckets; this validation plan intentionally splits formal_logic into separate math and logic probes.",
            "Balanced 1:1:1:1:1 pillar weighting is an evaluation choice, not a training-mixture requirement.",
            "Deep-shard FineWeb sampling lowers overlap risk with sample-10BT training data but is not a formal zero-overlap proof.",
        ],
        "pillars": results,
    }
    manifest_path = _write_manifest(output_dir, manifest)
    print(f"Wrote manifest: {manifest_path}")

    if args.upload_kaggle:
        if not args.kaggle_dataset_id:
            raise ValueError("--kaggle-dataset-id is required when --upload-kaggle is enabled.")
        metadata_path = _write_kaggle_metadata(
            output_dir,
            args.kaggle_dataset_id,
            args.kaggle_title,
            license_name=args.kaggle_license,
        )
        print(f"Wrote Kaggle metadata: {metadata_path}")
        _run_kaggle_cli(
            output_dir,
            update_existing=args.kaggle_update,
            dir_mode=args.kaggle_dir_mode,
            message=args.kaggle_message,
        )
        print("Kaggle upload completed.")

658 

659 

# Script entry point; keeps the module importable without side effects.
if __name__ == "__main__":
    main()