From 3acc3c20cb6c44a561c5b91096ce42af873e7994 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 22:20:40 +0300 Subject: [PATCH 1/3] PR 5.2: HuggingFace release packager + load_dataset smoke test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `scripts/package_hf_release.py` to generate `release/huggingface/README.md` with G12.1-compliant YAML frontmatter (pretty_name, license, language, task_categories, size_categories, tags, three configs with `default: true` on intermediate per G12.2), inlining the rewritten `release/README.md` body with HF-specific link rewrites. `--variant=instructor` packages the companion repo (G12.4) from `release/intermediate_instructor/` into a separate `release/huggingface-instructor/` upload tree. G12.3 covered by a parametrised `load_dataset()` smoke test gated on the optional `datasets` SDK. Extract shared release-packaging primitives (link rewriter, dir-safety guard, cover-image validator) into `scripts/_release_common.py`; refactor the Kaggle packager to import them. `release/kaggle/dataset-metadata.json` is byte-stable across the refactor. Delete the legacy `release/HF_DATASET_CARD.md` stub — superseded by the generated card. Gitignore `release/huggingface{,-instructor}/*` except the committed README. Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 7 +- .gitignore | 9 + release/HF_DATASET_CARD.md | 110 --- release/huggingface-instructor/README.md | 261 ++++++ release/huggingface/README.md | 277 ++++++ scripts/_release_common.py | 133 +++ scripts/package_hf_release.py | 852 +++++++++++++++++++ scripts/package_kaggle_release.py | 107 +-- tests/scripts/test_package_hf_release.py | 557 ++++++++++++ tests/scripts/test_package_kaggle_release.py | 2 +- 10 files changed, 2133 insertions(+), 182 deletions(-) delete mode 100644 release/HF_DATASET_CARD.md create mode 100644 release/huggingface-instructor/README.md create mode 100644 release/huggingface/README.md create mode 100644 scripts/_release_common.py create mode 100644 scripts/package_hf_release.py create mode 100644 tests/scripts/test_package_hf_release.py diff --git a/.agent-plan.md b/.agent-plan.md index 89a3dfe..874e7be 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -50,8 +50,11 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family - [x] PR 5.1: `scripts/generate_cover_image.py` (new) — deterministic Pillow + DejaVu Sans (bundled with matplotlib) renderer producing `release/dataset-cover-image.png` at 1280×640 (well above the 560×280 minimum, 2:1 aspect for Kaggle's header crop). Three-tier card design surfacing the cross-seed median conversion rate + LR AUC for each tier, pinned from `release/validation/validation_report.md`. Byte-identical re-runs guarded by `tests/scripts/test_generate_cover_image.py`. - [x] PR 5.1: Upload-dir assembly under `release/kaggle/` uses relative symlinks for the heavy bundle directories + cover image + LICENSE, plus a real file copy for `README.md` (rewritten on the way in so its `../` links and tree diagram render correctly on the Kaggle dataset page). `_validate_kaggle_dir_safe` refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor. `release/kaggle/*` is gitignored except for `dataset-metadata.json` itself — only the metadata is committed; the upload tree is regenerated on demand. - [x] PR 5.1: 19 new tests (`tests/scripts/test_package_kaggle_release.py` × 15, `tests/scripts/test_generate_cover_image.py` × 4): every Kaggle field constraint, schema field order parity for CSV + parquet, README rewriting (tree + `../` + validation report links), unsafe-kaggle-dir rejection, CLI rc=2 on missing release dir, byte-determinism (audit-artifact-sync), and committed-metadata-matches-fresh-regeneration sync check. 1194/1194 tests pass; ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape). -- [ ] PR 5.2: `scripts/package_hf_release.py` → `release/huggingface/README.md` with YAML configs/default/pretty_name/tags -- [ ] PR 5.2: Local `load_dataset()` smoke test; Kaggle dry-run package validation +- [x] PR 5.2: `scripts/package_hf_release.py` (new) — Hugging Face release packager. Reads each public tier's `manifest.json` under `release/`, emits `release/huggingface/README.md` with YAML frontmatter satisfying G12.1 (`pretty_name`, `license: mit`, `language: [en]`, `task_categories: [tabular-classification]`, `size_categories: [1K intermediate > advanced). + +## Intended uses + +- Teaching baseline lead-scoring on a flat snapshot. +- Teaching relational feature engineering against snapshot-safe tables. +- Teaching leakage detection (the `total_touches_all` trap is + designed to be discoverable). +- Teaching calibration, lift, P@K, value-aware ranking + (`expected_acv × P(convert)`), and cohort-shift evaluation. +- Comparing model families under a controlled DGP. + +## Out-of-scope uses + +- **Production lead scoring.** The company, product, and customers are + fictional. +- **Vendor benchmarking / paper baselines.** Difficulty tiers are + calibrated for pedagogy, not cross-paper comparability. +- **Causal-inference research that requires recovery of the true DGP.** + The instructor companion exposes the hidden graph for teaching, not + designed counterfactuals. +- **Demographic / fairness research.** v1 does not model protected + attributes. + +## Known limitations + +- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across + every tier. Difficulty is visible in AP, P@K, Brier, and value + capture. Treat AUC as a sanity check, not a difficulty signal. +- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta + is slightly negative in every tier (intro −0.0045, intermediate + −0.0072, advanced −0.0133); v1's snapshot is dominated by linear + features. v2 will inject non-linear interactions in the simulator. +- **Channel signal is weak.** Per + [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md), + out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across + all tiers and the per-channel rate spread is ≤0.05. The simulator + does not encode channel-conditional probabilities; channel-conditional + encoding is post-v1 work. +- **Cohort-shift degradation is small.** v1 has no time-of-year drift + baked in; the cohort-shift gate (G6.4) is informational and will + bite in v2. + +## Composition + +- **Entities.** Accounts, contacts, leads, touches, sessions, + sales_activities, opportunities (public); plus customers and + subscriptions (instructor only). Per-row counts per bundle live in + `manifest.json`. +- **Features.** 32 public columns grouped by analytical role in + [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md); + the per-bundle `feature_dictionary.csv` is the authoritative + machine-readable spec. +- **Label.** `converted_within_90_days` (boolean), event-derived from + the simulator. Never sampled directly. +- **Splits.** 70/15/15 train/valid/test, deterministic given seed; + recorded in `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package + version stamped in `manifest.json`. + +## Maintenance, adversarial framing, license + +We *want* the dataset to be broken. Issue templates ship under +`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as +`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships, +`docs/release/v2_decision_log.md` will track every accepted finding +and the design call that came from it. File issues at +[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge); +PRs welcome. + +| Field | Value | +|---|---| +| Generator | leadforge `1.0.0+` | +| Recipe | `b2b_saas_procurement_v1` | +| Canonical seed | 42 (cross-seed sweep: 42–46) | +| Bundle schema version | 5 | +| Format | Parquet (canonical) + CSV (convenience) | +| License | MIT — see [LICENSE](LICENSE) | + +Verify integrity with `leadforge validate `; every file +is hashed in `manifest.json`. diff --git a/release/huggingface/README.md b/release/huggingface/README.md new file mode 100644 index 0000000..cdbcf05 --- /dev/null +++ b/release/huggingface/README.md @@ -0,0 +1,277 @@ +--- +pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1)" +license: mit +language: + - en +task_categories: + - tabular-classification +size_categories: + - 1K intermediate > advanced). + +## Intended uses + +- Teaching baseline lead-scoring on a flat snapshot. +- Teaching relational feature engineering against snapshot-safe tables. +- Teaching leakage detection (the `total_touches_all` trap is + designed to be discoverable). +- Teaching calibration, lift, P@K, value-aware ranking + (`expected_acv × P(convert)`), and cohort-shift evaluation. +- Comparing model families under a controlled DGP. + +## Out-of-scope uses + +- **Production lead scoring.** The company, product, and customers are + fictional. +- **Vendor benchmarking / paper baselines.** Difficulty tiers are + calibrated for pedagogy, not cross-paper comparability. +- **Causal-inference research that requires recovery of the true DGP.** + The instructor companion exposes the hidden graph for teaching, not + designed counterfactuals. +- **Demographic / fairness research.** v1 does not model protected + attributes. + +## Known limitations + +- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across + every tier. Difficulty is visible in AP, P@K, Brier, and value + capture. Treat AUC as a sanity check, not a difficulty signal. +- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta + is slightly negative in every tier (intro −0.0045, intermediate + −0.0072, advanced −0.0133); v1's snapshot is dominated by linear + features. v2 will inject non-linear interactions in the simulator. +- **Channel signal is weak.** Per + [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md), + out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across + all tiers and the per-channel rate spread is ≤0.05. The simulator + does not encode channel-conditional probabilities; channel-conditional + encoding is post-v1 work. +- **Cohort-shift degradation is small.** v1 has no time-of-year drift + baked in; the cohort-shift gate (G6.4) is informational and will + bite in v2. + +## Composition + +- **Entities.** Accounts, contacts, leads, touches, sessions, + sales_activities, opportunities (public); plus customers and + subscriptions (instructor only). Per-row counts per bundle live in + `manifest.json`. +- **Features.** 32 public columns grouped by analytical role in + [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md); + the per-bundle `feature_dictionary.csv` is the authoritative + machine-readable spec. +- **Label.** `converted_within_90_days` (boolean), event-derived from + the simulator. Never sampled directly. +- **Splits.** 70/15/15 train/valid/test, deterministic given seed; + recorded in `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package + version stamped in `manifest.json`. + +## Maintenance, adversarial framing, license + +We *want* the dataset to be broken. Issue templates ship under +`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as +`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships, +`docs/release/v2_decision_log.md` will track every accepted finding +and the design call that came from it. File issues at +[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge); +PRs welcome. + +| Field | Value | +|---|---| +| Generator | leadforge `1.0.0+` | +| Recipe | `b2b_saas_procurement_v1` | +| Canonical seed | 42 (cross-seed sweep: 42–46) | +| Bundle schema version | 5 | +| Format | Parquet (canonical) + CSV (convenience) | +| License | MIT — see [LICENSE](LICENSE) | + +Verify integrity with `leadforge validate `; every file +is hashed in `manifest.json`. diff --git a/scripts/_release_common.py b/scripts/_release_common.py new file mode 100644 index 0000000..b1b61f5 --- /dev/null +++ b/scripts/_release_common.py @@ -0,0 +1,133 @@ +"""Shared primitives for the Phase 5 release packagers. + +Both ``scripts/package_kaggle_release.py`` (PR 5.1) and +``scripts/package_hf_release.py`` (PR 5.2) need to: + +* rewrite the ``](../foo)`` markdown links in ``release/README.md`` to + GitHub blob URLs — the README lives one level above each upload tree, + so the relative links break on Kaggle / HF and have to point at the + source repo; +* rewrite the ``](validation/validation_report.md)`` link to a GitHub + blob URL — the validation report does not ship with either upload + tree; +* refuse to assemble into ``cwd`` / the release dir / its parent / the + filesystem anchor (the assembler rmtrees children of the upload dir); +* validate the dataset cover image's dimensions against Kaggle's floor + (HF reuses the same PNG for its thumbnail). + +Lifting the four to one module keeps the rules in one place — if Kaggle +ever extends the dimension floor, both packagers pick it up. +``FieldDescriptor`` / ``ResourceSchema`` / dtype maps are deliberately +NOT extracted: HF infers schema from parquet via ``load_dataset`` and +does not need a Frictionless-shaped declaration. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Final + +from PIL import Image + +# --------------------------------------------------------------------------- +# README link rewriting +# --------------------------------------------------------------------------- + +GITHUB_BLOB_BASE: Final[str] = "https://github.com/leadforge-dev/leadforge/blob/main" + +#: Inline relative link ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` +#: for any markdown link that escapes the bundle root. +PARENT_RELATIVE_LINK_RE: Final[re.Pattern[str]] = re.compile(r"\]\(\.\./([^)]+)\)") + +#: The README points at ``validation/validation_report.md`` (a path +#: that lives under ``release/`` but never under either upload tree). +#: Rewrite to a GitHub blob URL so the link works on both platforms. +VALIDATION_REPORT_LINK: Final[str] = "](validation/validation_report.md)" +VALIDATION_REPORT_URL: Final[str] = f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)" + + +def rewrite_release_links(text: str) -> str: + """Apply the platform-agnostic README rewrites. + + Both Kaggle and HF use these. Tree-block substitution is + platform-specific (each upload tree looks different) and lives in + each packager's own module. + """ + + text = PARENT_RELATIVE_LINK_RE.sub(rf"]({GITHUB_BLOB_BASE}/\1)", text) + text = text.replace(VALIDATION_REPORT_LINK, VALIDATION_REPORT_URL) + return text + + +# --------------------------------------------------------------------------- +# Cover-image validation (G11.2; HF reuses the same asset) +# --------------------------------------------------------------------------- + +#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with +#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable +#: dimension for thumbnails so the Kaggle floor is the binding one. +COVER_IMAGE_MIN_WIDTH: Final[int] = 560 +COVER_IMAGE_MIN_HEIGHT: Final[int] = 280 + + +@dataclass(frozen=True) +class ValidationError: + """One field-level validation failure.""" + + field: str + message: str + + +def validate_cover_image(path: Path) -> list[ValidationError]: + """Validate that ``path`` exists and meets the dimension floor.""" + + errors: list[ValidationError] = [] + if not path.exists(): + errors.append( + ValidationError( + field="cover_image", + message=f"cover image not found at {path}", + ) + ) + return errors + with Image.open(path) as img: + width, height = img.size + if width < COVER_IMAGE_MIN_WIDTH or height < COVER_IMAGE_MIN_HEIGHT: + errors.append( + ValidationError( + field="cover_image", + message=( + f"cover image {width}x{height} below minimum " + f"{COVER_IMAGE_MIN_WIDTH}x{COVER_IMAGE_MIN_HEIGHT}" + ), + ) + ) + return errors + + +# --------------------------------------------------------------------------- +# Upload-dir safety +# --------------------------------------------------------------------------- + + +def validate_upload_dir_safe(upload_dir: Path, release_dir: Path, *, kind: str) -> None: + """Refuse to assemble into a path that aliases something dangerous. + + The packagers replace children of ``upload_dir`` (rmtree + recopy) + so pointing it at ``cwd`` / ``release_dir`` / their parents / the + filesystem anchor would clobber unrelated content. ``kind`` is + folded into the error message so the trace says which packager + refused (``kaggle`` / ``huggingface`` / ``huggingface-instructor``). + """ + + resolved = upload_dir.resolve() + blocked = { + Path(resolved.anchor), + Path.cwd().resolve(), + release_dir.resolve(), + release_dir.resolve().parent, + } + if resolved in blocked: + raise ValueError(f"refusing to assemble into unsafe --{kind}-dir: {upload_dir}") diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py new file mode 100644 index 0000000..17a72db --- /dev/null +++ b/scripts/package_hf_release.py @@ -0,0 +1,852 @@ +#!/usr/bin/env python3 +"""Package the ``leadforge-lead-scoring-v1`` family for Hugging Face. + +PR 5.2 — second of two PRs in Phase 5 (Platform packaging) of the v1 +release roadmap. This script: + +1. Reads each public tier's ``manifest.json`` + flat CSV header under + ``release/`` and assembles a Hugging Face dataset card + (``release/huggingface/README.md``) whose YAML frontmatter satisfies + G12.1 of ``docs/release/v1_acceptance_gates.md``: ``pretty_name``, + ``license: mit``, ``language: en``, ``task_categories: + [tabular-classification]``, ``size_categories``, ``tags``, and one + ``configs`` block per public tier with ``data_files`` pointing at the + parquet task splits in the upload tree. Exactly one config carries + ``default: true`` (G12.2). +2. Reuses ``release/dataset-cover-image.png`` as the dataset thumbnail + (HF datasets accept any reasonable PNG; the Kaggle floor of + 560×280 is the binding constraint and the validator lives in + ``scripts/_release_common.py`` so both packagers share it). +3. Optionally assembles a HF-loadable upload directory under + ``release/huggingface/`` as real-file copies of the per-tier + bundles plus the rewritten README. Same lesson as PR 5.1: don't + symlink heavy bundle dirs; HF's ``datasets`` library walks the + upload tree and silently skips broken symlinks in some versions. +4. Supports ``--variant=instructor`` to package the + ``leadforge-lead-scoring-v1-instructor`` companion repo (G12.4) + from ``release/intermediate_instructor/`` into a separate + ``release/huggingface-instructor/`` tree with one config + (``intermediate``). Same code path; different defaults. + +The actual ``huggingface_hub.HfApi().upload_folder()`` call lives in +PR 7.2; this script is intentionally publish-free. ``--dry-run`` +writes the README only and skips upload-tree assembly, useful for +README iteration. + +Failed validation exits with rc=1; pre-flight errors (missing release +dir, missing tier, unsafe ``--huggingface-dir``) exit with rc=2. +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from collections.abc import Sequence +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Final + +# Make ``scripts/`` importable regardless of how this file is loaded. +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +# ``GITHUB_BLOB_BASE`` is re-exported for tests and downstream callers +# (mirrors the symbol surface of ``scripts/package_kaggle_release.py``). +from _release_common import ( # noqa: E402,F401 — must follow sys.path insert + GITHUB_BLOB_BASE, + ValidationError, + rewrite_release_links, + validate_cover_image, + validate_upload_dir_safe, +) + +# --------------------------------------------------------------------------- +# Hugging Face dataset-card YAML schema (chatgpt v2 §20, verified from +# https://huggingface.co/docs/hub/datasets-cards) +# --------------------------------------------------------------------------- + +#: Allowed ``size_categories`` tokens (HF taxonomy). Each public tier +#: has 5,000 leads (3500 train / 750 valid / 750 test) so ``1K str: + """Apply the HF-specific rewrites to a copy of the release README. + + Rewrites: + + 1. Source-repo tree diagram → upload-tree diagram (the published + README should describe what the *user* sees on HF, not the + source repo layout). + 2. ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` and + ``](validation/...)`` → blob URL — both via + ``rewrite_release_links`` from ``_release_common``. + + The instructor variant calls this with + ``tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK``; otherwise the + public upload tree is used. + """ + + text = readme.replace(HF_TREE_BLOCK_SOURCE, tree_block) + text = rewrite_release_links(text) + return text + + +def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]: + """Guard against silent drift between the README's tree diagram and + ``HF_TREE_BLOCK_SOURCE`` (mirrors PR 5.1's Kaggle-side guard). + + Plain string ``replace()`` silently no-ops when the substring is + not found, which would publish the source-repo tree diagram on HF. + """ + + readme = release_dir / "README.md" + if not readme.exists(): + return [] + if HF_TREE_BLOCK_SOURCE not in readme.read_text(encoding="utf-8"): + return [ + ValidationError( + field="release/README.md", + message=( + "HF_TREE_BLOCK_SOURCE not found verbatim in release/README.md; " + "the source-repo tree diagram in the README has drifted from " + "the constant in scripts/package_hf_release.py — the " + "HF README rewrite will silently no-op until the README and " + "HF_TREE_BLOCK_SOURCE are reconciled." + ), + ) + ] + return [] + + +# --------------------------------------------------------------------------- +# Dataclasses — one per top-level YAML block +# +# These are typed records, not invariants. Construction is unchecked; +# callers MUST run :func:`validate_card` before relying on the metadata +# being well-formed (mirrors the discipline in PR 5.1). +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class DataFileEntry: + """One ``configs[*].data_files[]`` entry.""" + + split: str + path: str + + +@dataclass(frozen=True) +class ConfigEntry: + """One ``configs[]`` entry inside the YAML frontmatter.""" + + config_name: str + data_files: tuple[DataFileEntry, ...] + default: bool = False + + +@dataclass(frozen=True) +class HuggingFaceCard: + """Top-level HF dataset-card payload. + + ``body`` is the markdown that follows the YAML frontmatter — by + default the rewritten ``release/README.md`` (G12.1 expects the + README to BE the dataset card; the YAML lives at the top). + """ + + pretty_name: str + license: str + language: tuple[str, ...] + task_categories: tuple[str, ...] + size_categories: tuple[str, ...] + tags: tuple[str, ...] + configs: tuple[ConfigEntry, ...] + body: str + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + + +def validate_card(card: HuggingFaceCard) -> list[ValidationError]: + """Run every HF-side check against a built ``HuggingFaceCard``. + + Mirrors ``validate_metadata`` in the Kaggle packager: collects all + field-level errors at once rather than fail-fasting. G12.1 + + G12.2 are surfaced here; G12.3 (load_dataset round-trip) is a + separate test that hits the actual HF library. + """ + + errors: list[ValidationError] = [] + + if not card.pretty_name.strip(): + errors.append(ValidationError(field="pretty_name", message="must be non-empty")) + + if card.license != DEFAULT_LICENSE: + errors.append( + ValidationError( + field="license", + message=f"expected {DEFAULT_LICENSE!r}, got {card.license!r}", + ) + ) + + if not card.language: + errors.append(ValidationError(field="language", message="must contain at least one entry")) + + if HF_TASK_CATEGORY not in card.task_categories: + errors.append( + ValidationError( + field="task_categories", + message=f"must contain {HF_TASK_CATEGORY!r}", + ) + ) + + if not card.size_categories: + errors.append( + ValidationError(field="size_categories", message="must contain at least one entry") + ) + + if not card.tags: + errors.append(ValidationError(field="tags", message="must contain at least one entry")) + + if not card.configs: + errors.append(ValidationError(field="configs", message="must contain at least one config")) + + # G12.2 — exactly one config has default: true. + defaults = [c for c in card.configs if c.default] + if len(defaults) != 1: + errors.append( + ValidationError( + field="configs", + message=( + f"exactly one config must have default: true, got {len(defaults)} " + f"({[c.config_name for c in defaults]!r})" + ), + ) + ) + + # Per-config integrity: non-empty data_files, every entry has a + # split + a path. + for i, config in enumerate(card.configs): + if not config.config_name.strip(): + errors.append( + ValidationError(field=f"configs[{i}].config_name", message="must be non-empty") + ) + if not config.data_files: + errors.append( + ValidationError( + field=f"configs[{i}].data_files", + message="must contain at least one entry", + ) + ) + continue + for j, df in enumerate(config.data_files): + if not df.split or not df.path: + errors.append( + ValidationError( + field=f"configs[{i}].data_files[{j}]", + message="each entry must declare both split and path", + ) + ) + + return errors + + +# --------------------------------------------------------------------------- +# Bundle reading + config building +# --------------------------------------------------------------------------- + + +def _load_manifest(path: Path) -> dict[str, Any]: + payload = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError(f"manifest.json at {path} is not a JSON object") + return payload + + +def build_config_for_tier( + release_dir: Path, + *, + tier_dir: str, + config_name: str, + task: str = DEFAULT_TASK, + default: bool = False, +) -> ConfigEntry: + """Build a single ``configs[]`` entry for one tier. + + ``tier_dir`` is the directory name under the upload tree + (``intro`` / ``intermediate`` / ``advanced`` for the public + variant; ``intermediate`` for the instructor companion which + flattens ``intermediate_instructor/`` → ``intermediate/`` in the + upload tree). ``config_name`` is what users pass to + ``load_dataset(..., name=...)``. + + The function reads ``manifest.json`` to confirm the tier exists + and the task splits are declared; it does NOT fail when files are + missing on disk because the upload tree is reassembled by + :func:`assemble_upload_dir` and the manifest is the single source + of truth for what should be there. + """ + + manifest_path = release_dir / tier_dir / "manifest.json" + if not manifest_path.exists(): + raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest_path}") + manifest = _load_manifest(manifest_path) + if task not in manifest.get("tasks", {}): + raise ValueError( + f"task {task!r} not declared in manifest at {manifest_path}; " + f"got {sorted(manifest.get('tasks', {}))!r}" + ) + + data_files = tuple( + DataFileEntry( + split=hf_split, + path=f"{config_name}/tasks/{task}/{file_split}.parquet", + ) + for hf_split, file_split in HF_SPLIT_NAMES + ) + return ConfigEntry(config_name=config_name, data_files=data_files, default=default) + + +def build_card( + release_dir: Path, + *, + variant: str = "public", + pretty_name: str | None = None, + license_id: str = DEFAULT_LICENSE, + language: Sequence[str] = (DEFAULT_LANGUAGE,), + task_categories: Sequence[str] = (HF_TASK_CATEGORY,), + size_categories: Sequence[str] = (HF_SIZE_BUCKET_5K,), + tags: Sequence[str] = DEFAULT_TAGS, + default_config: str = DEFAULT_DEFAULT_CONFIG, + task: str = DEFAULT_TASK, + body: str | None = None, +) -> HuggingFaceCard: + """Assemble a ``HuggingFaceCard`` from the release tree. + + ``variant="public"`` builds the three-tier card pointing at + ``release/{intro,intermediate,advanced}/``; + ``variant="instructor"`` builds a single-config card pointing at + ``release/intermediate_instructor/`` (flattened to + ``intermediate/`` in the upload tree). + + When ``body`` is ``None`` (the default) we lift the contents of + ``release/README.md`` and apply the HF-specific rewrites — HF + renders the body as the dataset card so a full README there is + more useful than a curated blurb (mirrors the Kaggle packager's + description handling). + """ + + if variant == "public": + public_tiers = ( + ("intro", "intro"), + ("intermediate", "intermediate"), + ("advanced", "advanced"), + ) + configs = tuple( + build_config_for_tier( + release_dir, + tier_dir=tier_dir, + config_name=config_name, + task=task, + default=(config_name == default_config), + ) + for tier_dir, config_name in public_tiers + ) + if pretty_name is None: + pretty_name = DEFAULT_PRETTY_NAME + if body is None: + body = _hf_readme_text( + (release_dir / "README.md").read_text(encoding="utf-8"), + tree_block=HF_UPLOAD_TREE_BLOCK, + ) + elif variant == "instructor": + # Companion repo flattens ``intermediate_instructor`` → + # ``intermediate`` in the upload tree so the HF dataset slug + # ``leadforge-lead-scoring-v1-instructor`` exposes the + # familiar config name. + configs = ( + build_config_for_tier( + release_dir, + tier_dir="intermediate_instructor", + config_name="intermediate", + task=task, + default=True, + ), + ) + if pretty_name is None: + pretty_name = DEFAULT_INSTRUCTOR_PRETTY_NAME + if body is None: + body = _hf_readme_text( + (release_dir / "README.md").read_text(encoding="utf-8"), + tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK, + ) + else: + raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')") + + return HuggingFaceCard( + pretty_name=pretty_name, + license=license_id, + language=tuple(language), + task_categories=tuple(task_categories), + size_categories=tuple(size_categories), + tags=tuple(tags), + configs=configs, + body=body, + ) + + +# --------------------------------------------------------------------------- +# Rendering +# +# YAML is hand-rolled rather than dumped by PyYAML because the HF +# dataset-card YAML has a precise key order and indentation style that +# the viewer is fussy about (and PyYAML's default flow style collapses +# the configs list into a single line). Hand-rolling also makes the +# determinism contract obvious in the source. +# --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + + +def _yaml_string(value: str) -> str: + """Render a YAML string scalar. + + HF parses the frontmatter with PyYAML. We use double-quoted + strings only when the value contains characters that would change + YAML semantics (``:``, leading ``- ``, leading whitespace). Plain + scalars are used otherwise so the YAML stays readable. + """ + + if value == "": + return '""' + needs_quoting = any(c in value for c in ":#&*!|>'\"%@`") + needs_quoting = needs_quoting or value.startswith(("- ", " ", "?", "[", "{", "*", "&")) + needs_quoting = needs_quoting or value.endswith(" ") + if not needs_quoting: + return value + escaped = value.replace("\\", "\\\\").replace('"', '\\"') + return f'"{escaped}"' + + +def _yaml_list(values: Sequence[str], *, indent: int) -> list[str]: + """Render a YAML block-style list.""" + + pad = " " * indent + return [f"{pad}- {_yaml_string(v)}" for v in values] + + +def render_yaml_frontmatter(card: HuggingFaceCard) -> str: + """Render the YAML frontmatter as a deterministic string. + + Order: ``pretty_name``, ``license``, ``language``, + ``task_categories``, ``size_categories``, ``tags``, ``configs``. + Tags are sorted at render time (mirrors the keyword sort in the + Kaggle packager) so order on the dataclass doesn't leak into the + rendered file. + """ + + lines: list[str] = ["---"] + lines.append(f"pretty_name: {_yaml_string(card.pretty_name)}") + lines.append(f"license: {_yaml_string(card.license)}") + lines.append("language:") + lines.extend(_yaml_list(card.language, indent=2)) + lines.append("task_categories:") + lines.extend(_yaml_list(card.task_categories, indent=2)) + lines.append("size_categories:") + lines.extend(_yaml_list(card.size_categories, indent=2)) + lines.append("tags:") + lines.extend(_yaml_list(sorted(card.tags), indent=2)) + lines.append("configs:") + for config in card.configs: + lines.append(f" - config_name: {_yaml_string(config.config_name)}") + if config.default: + lines.append(" default: true") + lines.append(" data_files:") + for df in config.data_files: + lines.append(f" - split: {_yaml_string(df.split)}") + lines.append(f" path: {_yaml_string(df.path)}") + lines.append("---") + return "\n".join(lines) + "\n" + + +def render_card(card: HuggingFaceCard) -> str: + """Render the full README (YAML frontmatter + markdown body).""" + + body = card.body + if not body.endswith("\n"): + body += "\n" + return render_yaml_frontmatter(card) + "\n" + body + + +# --------------------------------------------------------------------------- +# Upload-directory assembly +# --------------------------------------------------------------------------- + + +def _replace_file(src: Path, dst: Path) -> None: + """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``.""" + + if dst.is_symlink() or dst.is_file(): + dst.unlink() + elif dst.exists() and dst.is_dir(): + shutil.rmtree(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + +def _replace_dir(src: Path, dst: Path) -> None: + """Copy directory ``src`` → ``dst``, replacing any existing entry.""" + + if dst.is_symlink() or dst.is_file(): + dst.unlink() + elif dst.exists() and dst.is_dir(): + shutil.rmtree(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(src, dst) + + +def assemble_upload_dir( + release_dir: Path, + upload_dir: Path, + *, + variant: str = "public", + cover_image: Path = DEFAULT_COVER_IMAGE, +) -> None: + """Assemble ``upload_dir`` for ``huggingface_hub.upload_folder()``. + + Mirrors PR 5.1's Kaggle assembler: real-file copies of the per- + tier bundles + cover image + LICENSE + the rewritten README. No + symlinks (the ``datasets`` library walks the upload tree and silent + skips broken symlinks in some versions). + + For ``variant="instructor"``, the source directory + ``intermediate_instructor/`` is flattened to ``intermediate/`` in + the upload tree so ``load_dataset(..., name="intermediate")`` + works against the companion repo without naming friction. + """ + + kind = "huggingface" if variant == "public" else "huggingface-instructor" + validate_upload_dir_safe(upload_dir, release_dir, kind=kind) + upload_dir.mkdir(parents=True, exist_ok=True) + + # Cover image — reuse the one ``release/`` already ships. + cover_src = release_dir / cover_image.name + if not cover_src.exists(): + cover_src = cover_image + if cover_src.exists(): + _replace_file(cover_src, upload_dir / COVER_IMAGE_FILENAME) + + # LICENSE — straight copy, no rewriting. + license_src = release_dir / "LICENSE" + if license_src.exists(): + _replace_file(license_src, upload_dir / "LICENSE") + + # Per-tier bundles — full directory copies. The instructor variant + # flattens its source dir name. + if variant == "public": + for tier in DEFAULT_PUBLIC_TIERS: + _replace_dir(release_dir / tier, upload_dir / tier) + elif variant == "instructor": + _replace_dir( + release_dir / "intermediate_instructor", + upload_dir / "intermediate", + ) + else: + raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')") + + +# --------------------------------------------------------------------------- +# Driver +# --------------------------------------------------------------------------- + + +@dataclass(frozen=True) +class PackagerOutcome: + """Return value from :func:`run_packager` — used by tests + CLI.""" + + card: HuggingFaceCard + readme_path: Path + errors: tuple[ValidationError, ...] + assembled: bool + + +def run_packager( + release_dir: Path, + *, + huggingface_dir: Path = DEFAULT_HUGGINGFACE_DIR, + variant: str = "public", + pretty_name: str | None = None, + default_config: str = DEFAULT_DEFAULT_CONFIG, + task: str = DEFAULT_TASK, + cover_image: Path = DEFAULT_COVER_IMAGE, + dry_run: bool = False, +) -> PackagerOutcome: + """Build, validate, and write the HF dataset card. + + With ``dry_run=False`` (the default) the packager additionally + assembles the HF-loadable upload directory under + ``huggingface_dir`` (real-file copies of the per-tier bundles + + cover image + LICENSE). ``dry_run=True`` skips the assembly step + entirely — useful for fast README iteration. + """ + + if not release_dir.exists(): + raise FileNotFoundError(f"release directory not found: {release_dir}") + + card = build_card( + release_dir, + variant=variant, + pretty_name=pretty_name, + default_config=default_config, + task=task, + ) + + errors: list[ValidationError] = [] + errors.extend(validate_card(card)) + errors.extend(validate_cover_image(cover_image)) + errors.extend(_validate_readme_substitution(release_dir)) + + readme_path = huggingface_dir / "README.md" + readme_path.parent.mkdir(parents=True, exist_ok=True) + readme_path.write_text(render_card(card), encoding="utf-8") + + if not dry_run: + assemble_upload_dir( + release_dir, + huggingface_dir, + variant=variant, + cover_image=cover_image, + ) + + return PackagerOutcome( + card=card, + readme_path=readme_path, + errors=tuple(errors), + assembled=not dry_run, + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _parse_args(argv: Sequence[str] | None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate and validate Hugging Face README.md for leadforge-lead-scoring-v1.", + ) + parser.add_argument( + "--release-dir", + type=Path, + default=DEFAULT_RELEASE_DIR, + help="release bundle root containing one subdirectory per tier (default: %(default)s)", + ) + parser.add_argument( + "--huggingface-dir", + type=Path, + default=None, + help=( + "output directory for README.md + assembled upload tree " + "(default: release/huggingface for variant=public, " + "release/huggingface-instructor for variant=instructor)" + ), + ) + parser.add_argument( + "--variant", + choices=("public", "instructor"), + default="public", + help="public (3-tier) or instructor (companion repo); default: %(default)s", + ) + parser.add_argument( + "--default-config", + default=DEFAULT_DEFAULT_CONFIG, + help=( + "config that carries default: true (G12.2; " + "default: %(default)s for public; ignored for instructor)" + ), + ) + parser.add_argument( + "--owner", + default=DEFAULT_OWNER, + help=( + "HF dataset owner — currently informational; PR 7.2 will " + "consume it via ``huggingface_hub.HfApi().upload_folder``. " + "default: %(default)s" + ), + ) + parser.add_argument( + "--dataset-slug", + default=None, + help=( + "HF dataset slug — currently informational; PR 7.2 will " + "consume it. defaults: leadforge-lead-scoring-v1 (public) / " + "leadforge-lead-scoring-v1-instructor (instructor)" + ), + ) + parser.add_argument( + "--cover-image", + type=Path, + default=DEFAULT_COVER_IMAGE, + help="path to the dataset cover image (default: %(default)s)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="validate + write README only; skip assembling the upload directory", + ) + return parser.parse_args(argv) + + +def main(argv: Sequence[str] | None = None) -> int: + args = _parse_args(argv) + variant: str = args.variant + huggingface_dir: Path = args.huggingface_dir or ( + DEFAULT_HUGGINGFACE_DIR if variant == "public" else DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR + ) + + try: + outcome = run_packager( + args.release_dir, + huggingface_dir=huggingface_dir, + variant=variant, + default_config=args.default_config, + cover_image=args.cover_image, + dry_run=args.dry_run, + ) + except FileNotFoundError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + except ValueError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + if outcome.errors: + print("validation failed:", file=sys.stderr) + for err in outcome.errors: + print(f" - {err.field}: {err.message}", file=sys.stderr) + return 1 + + print(f"wrote {outcome.readme_path}", file=sys.stderr) + if outcome.assembled: + print(f"assembled upload tree under {huggingface_dir}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py index 96065a5..b8221c0 100644 --- a/scripts/package_kaggle_release.py +++ b/scripts/package_kaggle_release.py @@ -53,7 +53,30 @@ import pandas as pd import pyarrow as pa import pyarrow.parquet as pq -from PIL import Image + +# Make ``scripts/`` importable regardless of how this file is loaded +# (CLI ``python scripts/package_kaggle_release.py``, or +# ``importlib.util.spec_from_file_location`` from the test suite). +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +# Shared release-packaging primitives extracted in PR 5.2 so the HF +# packager can reuse the link-rewriter, the dir-safety guard, and the +# cover-image validator without duplicating logic. +from _release_common import ( # noqa: E402 — must follow sys.path insert + GITHUB_BLOB_BASE, + ValidationError, + validate_cover_image, + validate_upload_dir_safe, +) +from _release_common import ( + PARENT_RELATIVE_LINK_RE as _PARENT_RELATIVE_LINK, +) +from _release_common import ( + VALIDATION_REPORT_LINK as _VALIDATION_REPORT_LINK, +) +from _release_common import ( + VALIDATION_REPORT_URL as _VALIDATION_REPORT_URL, +) # --------------------------------------------------------------------------- # Kaggle field constraints (chatgpt v2 §19, verified from official docs) @@ -75,11 +98,6 @@ "hourly", ) -#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with -#: 2:1 header / 1:1 thumbnail crops in mind. -COVER_IMAGE_MIN_WIDTH: Final[int] = 560 -COVER_IMAGE_MIN_HEIGHT: Final[int] = 280 - #: Allowed cover-image extensions per Kaggle docs. ALLOWED_COVER_IMAGE_SUFFIXES: Final[tuple[str, ...]] = ( ".png", @@ -190,8 +208,12 @@ class UserSource: # --------------------------------------------------------------------------- # Description / README rewriting # --------------------------------------------------------------------------- - -GITHUB_BLOB_BASE: Final[str] = "https://github.com/leadforge-dev/leadforge/blob/main" +# +# ``GITHUB_BLOB_BASE``, the ``](../foo)`` regex, and the validation- +# report link rewrite live in ``_release_common`` (PR 5.2). Tree-block +# substitution stays local because the upload tree differs per +# platform (Kaggle ships ``dataset-metadata.json`` + cover image at +# the top; HF does not). #: The "What's inside" tree diagram in ``release/README.md``. The #: published README on Kaggle should describe the *upload* layout @@ -227,19 +249,6 @@ class UserSource: └── LICENSE ```""" -#: Inline relative link ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` -#: for any markdown link that escapes the bundle root. -_PARENT_RELATIVE_LINK: Final[re.Pattern[str]] = re.compile(r"\]\(\.\./([^)]+)\)") - -#: The README points at ``validation/validation_report.md`` (a path -#: that lives under ``release/`` but not under the Kaggle upload -#: directory). Rewrite to a GitHub blob URL so the link works on -#: Kaggle. -_VALIDATION_REPORT_LINK: Final[str] = "](validation/validation_report.md)" -_VALIDATION_REPORT_URL: Final[str] = ( - f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)" -) - def _kaggle_readme_text(readme: str) -> str: """Apply the Kaggle-specific rewrites to a copy of the release README. @@ -338,17 +347,12 @@ class DatasetMetadata: # --------------------------------------------------------------------------- # Validation +# +# ``ValidationError`` is imported from ``_release_common`` (PR 5.2); +# the dataclass is identical to what lived here in PR 5.1. # --------------------------------------------------------------------------- -@dataclass(frozen=True) -class ValidationError: - """One field-level validation failure.""" - - field: str - message: str - - def _validate_length(name: str, value: str, lo: int, hi: int) -> ValidationError | None: n = len(value) if n < lo or n > hi: @@ -470,33 +474,6 @@ def validate_metadata(metadata: DatasetMetadata) -> list[ValidationError]: return errors -def validate_cover_image(path: Path) -> list[ValidationError]: - """Validate that ``path`` exists and meets Kaggle's dimension floor.""" - - errors: list[ValidationError] = [] - if not path.exists(): - errors.append( - ValidationError( - field="cover_image", - message=f"cover image not found at {path}", - ) - ) - return errors - with Image.open(path) as img: - width, height = img.size - if width < COVER_IMAGE_MIN_WIDTH or height < COVER_IMAGE_MIN_HEIGHT: - errors.append( - ValidationError( - field="cover_image", - message=( - f"cover image {width}x{height} below Kaggle minimum " - f"{COVER_IMAGE_MIN_WIDTH}x{COVER_IMAGE_MIN_HEIGHT}" - ), - ) - ) - return errors - - # --------------------------------------------------------------------------- # Bundle reading + resource building # --------------------------------------------------------------------------- @@ -811,21 +788,13 @@ def render_metadata_json(metadata: DatasetMetadata) -> str: def _validate_kaggle_dir_safe(kaggle_dir: Path, release_dir: Path) -> None: """Refuse to assemble into a path that aliases something dangerous. - The packager replaces children of ``kaggle_dir`` (rmtree + recopy) - so pointing it at ``cwd`` / ``release_dir`` / their parents / the - filesystem anchor would clobber unrelated content. This guard - fires before any disk write. + Thin wrapper that forwards to ``_release_common.validate_upload_dir_safe`` + with ``kind="kaggle"`` so the error message identifies which packager + refused. The standalone function is kept for the test in + ``tests/scripts/test_package_kaggle_release.py`` that imports it directly. """ - resolved = kaggle_dir.resolve() - blocked = { - Path(resolved.anchor), - Path.cwd().resolve(), - release_dir.resolve(), - release_dir.resolve().parent, - } - if resolved in blocked: - raise ValueError(f"refusing to assemble into unsafe --kaggle-dir: {kaggle_dir}") + validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle") def _replace_file(src: Path, dst: Path) -> None: diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py new file mode 100644 index 0000000..1328c8d --- /dev/null +++ b/tests/scripts/test_package_hf_release.py @@ -0,0 +1,557 @@ +"""Tests for ``scripts/package_hf_release.py``. + +Locks the Phase 5 Hugging Face packaging contract: + +* every YAML field surfaced in chatgpt v2 §20 + G12.1 (pretty_name, + license, language, task_categories, size_categories, tags, configs + with data_files) +* G12.2 — exactly one config has ``default: true`` +* G12.3 — local ``load_dataset()`` succeeds for every config and + returns the expected row counts (gated on the optional ``datasets`` + package; ``pytest.importorskip`` if absent) +* G12.4 — instructor companion variant packages independently and + loads via ``load_dataset()`` +* the README link-rewriting that lets the published dataset card on + HF keep working ``../`` links (rewritten to GitHub blob URLs) and a + directory diagram that reflects the upload layout, plus a guard + that the source ``HF_TREE_BLOCK_SOURCE`` is still present verbatim + in the README (silent-failure trap; mirrors PR 5.1's KAGGLE block) +* the assembled upload tree resolves every declared resource path +* the safety net that refuses to assemble into ``cwd`` / + ``release_dir`` / its parent +* byte-equality between the committed + ``release/huggingface/README.md`` and a fresh regeneration + (audit-artifact-sync pattern from PR 4.1 / PR 5.1) +""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + +import pytest +import yaml + +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "package_hf_release.py" +_REPO_ROOT = Path(__file__).resolve().parents[2] +_spec = importlib.util.spec_from_file_location("package_hf_release", _SCRIPT_PATH) +assert _spec is not None +assert _spec.loader is not None +packager = importlib.util.module_from_spec(_spec) +sys.modules["package_hf_release"] = packager +_spec.loader.exec_module(packager) + + +_RELEASE_DIR = _REPO_ROOT / "release" +_RELEASE_BUNDLES_PRESENT = (_RELEASE_DIR / "intro" / "manifest.json").exists() +_INSTRUCTOR_BUNDLE_PRESENT = (_RELEASE_DIR / "intermediate_instructor" / "manifest.json").exists() +_COMMITTED_README = _REPO_ROOT / "release" / "huggingface" / "README.md" +_COMMITTED_COVER = _REPO_ROOT / "release" / "dataset-cover-image.png" + + +# Canonical task-split row counts per public tier (3500 / 750 / 750) +# — pinned so the load_dataset() smoke test fails loud rather than +# silently accept an empty / truncated parquet. +_EXPECTED_ROWS = {"train": 3500, "validation": 750, "test": 750} + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _minimal_card() -> packager.HuggingFaceCard: + """A minimum-viable ``HuggingFaceCard`` that should validate cleanly.""" + + return packager.HuggingFaceCard( + pretty_name=packager.DEFAULT_PRETTY_NAME, + license=packager.DEFAULT_LICENSE, + language=(packager.DEFAULT_LANGUAGE,), + task_categories=(packager.HF_TASK_CATEGORY,), + size_categories=(packager.HF_SIZE_BUCKET_5K,), + tags=packager.DEFAULT_TAGS, + configs=( + packager.ConfigEntry( + config_name="intermediate", + data_files=( + packager.DataFileEntry( + split="train", + path="intermediate/tasks/converted_within_90_days/train.parquet", + ), + ), + default=True, + ), + ), + body="# LeadForge\n", + ) + + +# --------------------------------------------------------------------------- +# Field-constraint validation (G12.1) +# --------------------------------------------------------------------------- + + +def test_validate_card_accepts_minimal_canonical_card() -> None: + assert packager.validate_card(_minimal_card()) == [] + + +def test_validate_card_reports_every_field_violation() -> None: + """One bad card triggers every field check at once.""" + + bad = packager.HuggingFaceCard( + pretty_name="", + license="apache-2.0", # not 'mit' + language=(), + task_categories=("image-classification",), # missing tabular-classification + size_categories=(), + tags=(), + configs=(), + body="", + ) + errors = packager.validate_card(bad) + fields = {e.field for e in errors} + assert "pretty_name" in fields + assert "license" in fields + assert "language" in fields + assert "task_categories" in fields + assert "size_categories" in fields + assert "tags" in fields + assert "configs" in fields + + +def test_validate_card_requires_exactly_one_default(tmp_path: Path) -> None: + """G12.2 — exactly one config carries ``default: true``.""" + + base = _minimal_card() + + # Zero defaults. + zero = packager.HuggingFaceCard( + **{ + **base.__dict__, + "configs": ( + packager.ConfigEntry( + config_name="intro", + data_files=(packager.DataFileEntry(split="train", path="intro/x.parquet"),), + default=False, + ), + ), + } + ) + errors = packager.validate_card(zero) + assert any("default: true" in e.message for e in errors) + + # Two defaults. + two = packager.HuggingFaceCard( + **{ + **base.__dict__, + "configs": ( + packager.ConfigEntry( + config_name="intro", + data_files=(packager.DataFileEntry(split="train", path="intro/x.parquet"),), + default=True, + ), + packager.ConfigEntry( + config_name="intermediate", + data_files=( + packager.DataFileEntry(split="train", path="intermediate/x.parquet"), + ), + default=True, + ), + ), + } + ) + errors = packager.validate_card(two) + assert any("default: true" in e.message for e in errors) + + +def test_validate_card_flags_data_files_without_split_or_path() -> None: + """Each ``data_files[]`` entry must declare both split and path.""" + + base = _minimal_card() + broken = packager.ConfigEntry( + config_name="intermediate", + data_files=(packager.DataFileEntry(split="", path="intermediate/x.parquet"),), + default=True, + ) + bad = packager.HuggingFaceCard(**{**base.__dict__, "configs": (broken,)}) + errors = packager.validate_card(bad) + assert any("split and path" in e.message for e in errors) + + +# --------------------------------------------------------------------------- +# YAML rendering — round-trip parse + content invariants +# --------------------------------------------------------------------------- + + +def test_render_yaml_frontmatter_round_trips_through_pyyaml() -> None: + """The rendered YAML must parse cleanly via ``yaml.safe_load``. + + HF parses dataset-card frontmatter with PyYAML; if our hand-rolled + renderer drifts from valid YAML, the dataset card silently drops + its metadata on the HF page. + """ + + card = _minimal_card() + yaml_text = packager.render_yaml_frontmatter(card) + # ``render_yaml_frontmatter`` includes the leading and trailing + # ``---`` markers. Strip them before feeding to safe_load. + inner = yaml_text.strip().strip("-").strip() + parsed = yaml.safe_load(inner) + assert parsed["pretty_name"] == card.pretty_name + assert parsed["license"] == "mit" + assert parsed["language"] == ["en"] + assert parsed["task_categories"] == [packager.HF_TASK_CATEGORY] + assert parsed["size_categories"] == [packager.HF_SIZE_BUCKET_5K] + assert parsed["tags"] == sorted(card.tags) + assert len(parsed["configs"]) == 1 + assert parsed["configs"][0]["default"] is True + + +def test_render_yaml_tags_sorted_at_render_time() -> None: + """Tags are sorted in the rendered YAML regardless of dataclass order.""" + + base = _minimal_card() + shuffled = packager.HuggingFaceCard( + **{**base.__dict__, "tags": ("zebra", "alpha", "mango")}, + ) + rendered = packager.render_yaml_frontmatter(shuffled) + # Tag block lines, in the rendered order. + tag_lines = [ + line.strip().lstrip("- ") for line in rendered.splitlines() if line.startswith(" - ") + ] + # The first three list-item lines are language, task_categories, + # size_categories — find tag block by position after ``tags:``. + lines = rendered.splitlines() + tags_idx = lines.index("tags:") + block: list[str] = [] + for line in lines[tags_idx + 1 :]: + if line.startswith(" - "): + block.append(line[4:].strip()) + else: + break + assert block == ["alpha", "mango", "zebra"] + _ = tag_lines # silence linter + + +# --------------------------------------------------------------------------- +# README rewriting + content +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None: + readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8") + rewritten = packager._hf_readme_text(readme) + + # Source-repo tree → upload tree. + assert "intermediate_instructor/" not in rewritten + assert "notebooks/01_baseline_lead_scoring.ipynb" not in rewritten + assert "README.md # this file (HF dataset card)" in rewritten + + # Relative ../ links rewritten to GitHub blob URLs. + assert "](../" not in rewritten + assert packager.GITHUB_BLOB_BASE in rewritten + + # The validation-report link (which lives under release/, not under + # the upload dir) must point at GitHub. + assert "](validation/validation_report.md)" not in rewritten + assert f"]({packager.GITHUB_BLOB_BASE}/release/validation/validation_report.md)" in rewritten + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_hf_tree_block_source_present_in_release_readme() -> None: + """Silent-failure guard. + + ``_hf_readme_text`` substitutes ``HF_TREE_BLOCK_SOURCE`` → + ``HF_UPLOAD_TREE_BLOCK`` via plain string replace. If anyone + tweaks the README's tree diagram by even one whitespace + character, the substitution silently no-ops and the published HF + dataset card carries the source-repo tree. This guard fires + loudly the moment the constants drift apart. + """ + + readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8") + assert packager.HF_TREE_BLOCK_SOURCE in readme, ( + "scripts/package_hf_release.py HF_TREE_BLOCK_SOURCE no longer matches " + "the tree diagram in release/README.md — reconcile the two before " + "the next HF README regeneration." + ) + + +def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None: + """``_validate_readme_substitution`` is wired into the run-time + validator, not just the static guard above.""" + + fake_release = tmp_path / "release" + fake_release.mkdir() + (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8") + errors = packager._validate_readme_substitution(fake_release) + assert errors + assert errors[0].field == "release/README.md" + assert "HF_TREE_BLOCK_SOURCE" in errors[0].message + + if _RELEASE_BUNDLES_PRESENT: + # Sanity: the real release README does NOT trigger the validator. + assert packager._validate_readme_substitution(_RELEASE_DIR) == [] + + +# --------------------------------------------------------------------------- +# Upload-dir assembly safety + idempotence +# --------------------------------------------------------------------------- + + +def test_assemble_upload_dir_rejects_unsafe_huggingface_dir(tmp_path: Path) -> None: + """Refuse to assemble into the release dir or its parent.""" + + fake_release = tmp_path / "release" + fake_release.mkdir() + with pytest.raises(ValueError, match="unsafe"): + packager.assemble_upload_dir(fake_release, fake_release) + with pytest.raises(ValueError, match="unsafe"): + packager.assemble_upload_dir(fake_release, fake_release.parent) + + +def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Refuse to assemble into the current working directory. + + A user passing ``--huggingface-dir .`` would otherwise rmtree-then- + recopy arbitrary cwd contents. Same safety case as the Kaggle + packager (PR 5.1 fix-up). + """ + + fake_release = tmp_path / "release" + fake_release.mkdir() + cwd = tmp_path / "workdir" + cwd.mkdir() + monkeypatch.chdir(cwd) + with pytest.raises(ValueError, match="unsafe"): + packager.assemble_upload_dir(fake_release, cwd) + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_assembled_upload_dir_resolves_every_declared_data_file(tmp_path: Path) -> None: + """Every ``configs[*].data_files[*].path`` declared in the YAML + must resolve to a real file (not a symlink, not a missing path) + under the assembled upload directory. HF's ``datasets`` library + walks the directory at upload time; a declared path that doesn't + materialise is a silent load-time failure. + """ + + upload_dir = tmp_path / "huggingface" + outcome = packager.run_packager( + _RELEASE_DIR, + huggingface_dir=upload_dir, + cover_image=_COMMITTED_COVER, + ) + + for config in outcome.card.configs: + for df in config.data_files: + target = upload_dir / df.path + assert target.is_file(), f"declared data_file missing from upload tree: {df.path}" + assert not target.is_symlink(), ( + f"declared data_file is a symlink, not a real file: {df.path} — " + f"datasets library may skip symlinked entries on upload" + ) + + # Top-level required artefacts. + assert (upload_dir / "README.md").is_file() + assert (upload_dir / packager.COVER_IMAGE_FILENAME).is_file() + assert (upload_dir / "LICENSE").is_file() + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_idempotent_against_existing_tree(tmp_path: Path) -> None: + """Re-running the assembly over an already-populated upload tree + succeeds — both passes call the same copy helpers.""" + + upload_dir = tmp_path / "huggingface" + packager.run_packager(_RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER) + outcome = packager.run_packager( + _RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER + ) + assert outcome.errors == () + for config in outcome.card.configs: + for df in config.data_files: + assert (upload_dir / df.path).is_file() + + +# --------------------------------------------------------------------------- +# CLI driver — error paths +# --------------------------------------------------------------------------- + + +def test_main_reports_missing_release_dir( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + rc = packager.main( + [ + "--release-dir", + str(tmp_path / "missing"), + "--huggingface-dir", + str(tmp_path / "hf"), + "--cover-image", + str(tmp_path / "cover.png"), + "--dry-run", + ] + ) + captured = capsys.readouterr() + assert rc == 2 + assert "release directory not found" in captured.err + + +# --------------------------------------------------------------------------- +# Determinism + sync with committed artefact +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_readme_is_byte_deterministic(tmp_path: Path) -> None: + """Two back-to-back runs against the committed bundles must produce + byte-identical README files.""" + + out_a = tmp_path / "a" + out_b = tmp_path / "b" + packager.run_packager( + _RELEASE_DIR, huggingface_dir=out_a, cover_image=_COMMITTED_COVER, dry_run=True + ) + packager.run_packager( + _RELEASE_DIR, huggingface_dir=out_b, cover_image=_COMMITTED_COVER, dry_run=True + ) + assert (out_a / "README.md").read_bytes() == (out_b / "README.md").read_bytes() + + +@pytest.mark.skipif( + not (_RELEASE_BUNDLES_PRESENT and _COMMITTED_README.exists()), + reason="release bundles or committed README missing", +) +def test_committed_hf_readme_matches_fresh_regeneration(tmp_path: Path) -> None: + """A fresh HF README regeneration must match the committed + ``release/huggingface/README.md`` byte-for-byte AND have a + non-degenerate body. + + If this fails, ``release/`` drifted without re-running + ``scripts/package_hf_release.py``. Regenerate via that script + from the repo root and commit the new README alongside the bundle + change. + """ + + fresh_dir = tmp_path / "huggingface" + packager.run_packager( + _RELEASE_DIR, + huggingface_dir=fresh_dir, + cover_image=_COMMITTED_COVER, + dry_run=True, + ) + fresh_bytes = (fresh_dir / "README.md").read_bytes() + committed_bytes = _COMMITTED_README.read_bytes() + assert fresh_bytes == committed_bytes + + # Positive content assertions — guard against the failure mode + # where a code change accidentally produces empty / minimal + # content that we then re-commit. + text = fresh_bytes.decode("utf-8") + # YAML frontmatter parses. + _, fm_text, body = text.split("---", 2) + fm = yaml.safe_load(fm_text) + assert fm["license"] == "mit" + assert fm["language"] == ["en"] + assert fm["task_categories"] == ["tabular-classification"] + assert sorted(fm["tags"]) == fm["tags"] # rendered sorted + assert len(fm["configs"]) == 3 + defaults = [c for c in fm["configs"] if c.get("default")] + assert len(defaults) == 1 + assert defaults[0]["config_name"] == "intermediate" + # Body inherited the rewritten release card content. + assert "What's inside" in body + assert "Why lead scoring matters" in body + assert "Known limitations" in body + assert "intermediate_instructor/" not in body # tree-diagram rewrite fired + assert "](../" not in body # parent-link rewrite fired + assert packager.GITHUB_BLOB_BASE in body + + +# --------------------------------------------------------------------------- +# Instructor companion (G12.4) +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _INSTRUCTOR_BUNDLE_PRESENT, reason="instructor bundle not present") +def test_run_packager_instructor_variant_packages_independently(tmp_path: Path) -> None: + """G12.4 — the instructor variant builds a separate upload tree + pointing at ``release/intermediate_instructor/``, with one config + (``intermediate``) and ``default: true`` set.""" + + upload_dir = tmp_path / "huggingface-instructor" + outcome = packager.run_packager( + _RELEASE_DIR, + huggingface_dir=upload_dir, + variant="instructor", + cover_image=_COMMITTED_COVER, + ) + assert outcome.errors == () + assert len(outcome.card.configs) == 1 + only = outcome.card.configs[0] + assert only.config_name == "intermediate" + assert only.default is True + # The intermediate_instructor source dir is flattened to + # ``intermediate/`` in the upload tree. + for df in only.data_files: + assert (upload_dir / df.path).is_file() + # Instructor body uses the instructor tree block. + body = outcome.card.body + assert "metadata/" in body # instructor metadata sibling visible + assert "9 full-horizon tables" in body + + +# --------------------------------------------------------------------------- +# load_dataset() round-trip (G12.3 + G12.4) +# +# Gated on the optional ``datasets`` library — the leadforge dev +# install does not pull it in (the transitive dependency tail is +# heavy). When present, this test exercises the actual HF loader +# against the assembled upload tree per config and asserts the row +# counts match the manifest. +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +@pytest.mark.parametrize("config_name", ["intro", "intermediate", "advanced"]) +def test_load_dataset_round_trip_per_config(tmp_path: Path, config_name: str) -> None: + """G12.3 — ``load_dataset(local_path, name=)`` succeeds for + every config and returns the canonical row counts.""" + + datasets = pytest.importorskip("datasets", reason="datasets SDK not installed") + + upload_dir = tmp_path / "huggingface" + packager.run_packager(_RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER) + + ds = datasets.load_dataset(str(upload_dir), name=config_name) + for hf_split, expected_rows in _EXPECTED_ROWS.items(): + assert hf_split in ds, f"split {hf_split!r} missing from {config_name!r}" + assert len(ds[hf_split]) == expected_rows, ( + f"{config_name}/{hf_split}: expected {expected_rows} rows, got {len(ds[hf_split])}" + ) + + +@pytest.mark.skipif(not _INSTRUCTOR_BUNDLE_PRESENT, reason="instructor bundle not present") +def test_load_dataset_round_trip_instructor(tmp_path: Path) -> None: + """G12.4 — instructor companion loads via ``load_dataset()`` for + its single config (``intermediate``).""" + + datasets = pytest.importorskip("datasets", reason="datasets SDK not installed") + + upload_dir = tmp_path / "huggingface-instructor" + packager.run_packager( + _RELEASE_DIR, + huggingface_dir=upload_dir, + variant="instructor", + cover_image=_COMMITTED_COVER, + ) + + ds = datasets.load_dataset(str(upload_dir), name="intermediate") + for hf_split, expected_rows in _EXPECTED_ROWS.items(): + assert hf_split in ds + assert len(ds[hf_split]) == expected_rows diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py index 276d01e..351950b 100644 --- a/tests/scripts/test_package_kaggle_release.py +++ b/tests/scripts/test_package_kaggle_release.py @@ -175,7 +175,7 @@ def test_validate_cover_image_rejects_too_small_image(tmp_path: Path) -> None: errors = packager.validate_cover_image(tiny) assert errors assert errors[0].field == "cover_image" - assert "below Kaggle minimum" in errors[0].message + assert "below minimum" in errors[0].message def test_validate_cover_image_reports_missing_file(tmp_path: Path) -> None: From f2fc4a2861e9db799decdb58ad0b8a37b6fb9131 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Wed, 6 May 2026 23:37:18 +0300 Subject: [PATCH 2/3] PR 5.2 self-review fixes (Kaggle + HF packagers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fold the brutal self-review's findings back into the PR before review. Bugs: - (#1) run_packager validate→write order — both packagers wrote README/metadata on validation failure, leaving corrupt artifacts on disk that would silently get committed. Gated on `errors == ()`; added no-write tests for both packagers. - (#2) Instructor README inlined the public 3-tier README into a 1-tier dataset card. Replaced with a dedicated `INSTRUCTOR_BODY` constant that links to the public dataset and describes only the instructor-specific additions (full-horizon tables, hidden DAG, latent registry, mechanism summary). - (#3) validate_upload_dir_safe also blocks strict descendants of release_dir; `--huggingface-dir release/intro` would otherwise rmtree the intro bundle. Architecture: - (#5) Finished shared-primitives extraction: SOURCE_TREE_BLOCK, validate_readme_substitution, replace_file, replace_dir, load_manifest now live in scripts/_release_common.py. Both packagers reduced to imports. - (#6) Replaced 60-line hand-rolled YAML renderer with yaml.safe_dump + a 4-line _IndentedDumper subclass. - (#7) Removed dead --owner / --dataset-slug CLI flags. - (#8) assemble_upload_dir now takes rendered_readme and writes it. - (#9) build_config_for_tier made pure (no I/O); cheap manifest-stat preflight via _assert_tier_dir_exists. - (#10) --default-config with --variant=instructor errors loudly. CI: - (#4) Added [publish] extra (datasets>=2.14, kaggle>=1.6) so the gated G12.3 / G12.4 / G11.3 tests install in one line. Cleanups: visual cruft (#13–#16), test cruft (#17 — unused tmp_path, dead tag_lines), em-dash YAML round-trip parametrised for the instructor pretty_name. Verification: 1223 tests pass + 5 gated skips; ruff + mypy clean; hash determinism PASS 67/67; leakage probes 0/3 reconstruct on every tier; validate_release_candidate --no-rebuild exits 0. release/{kaggle,huggingface,huggingface-instructor}/dataset-metadata .json|README.md regenerated; audit-artifact-sync tests guard them. Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 2 +- pyproject.toml | 9 + release/huggingface-instructor/README.md | 283 +++------ release/huggingface/README.md | 2 +- scripts/_release_common.py | 175 +++++- scripts/package_hf_release.py | 603 ++++++++++--------- scripts/package_kaggle_release.py | 180 ++---- tests/scripts/test_package_hf_release.py | 211 +++++-- tests/scripts/test_package_kaggle_release.py | 43 +- 9 files changed, 847 insertions(+), 661 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 874e7be..6e8c359 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -54,7 +54,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family - [x] PR 5.2: `scripts/_release_common.py` (new) — extracted shared release-packaging primitives so the Kaggle + HF packagers share one source of truth: `GITHUB_BLOB_BASE`, the `](../foo)` regex, the validation-report blob-URL substitution, `validate_cover_image` (dimension floor; HF reuses Kaggle's 560×280), and `validate_upload_dir_safe` (refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor). `FieldDescriptor` / `ResourceSchema` / dtype maps are intentionally NOT extracted — HF infers schema from parquet via `load_dataset` and does not need a Frictionless declaration. `scripts/package_kaggle_release.py` refactored to import these primitives; PR 5.1's external behaviour and committed `release/kaggle/dataset-metadata.json` are byte-stable across the refactor. - [x] PR 5.2: Upload-dir assembly under `release/huggingface/` and `release/huggingface-instructor/` uses real-file copies (not symlinks; mirrors PR 5.1 — the `datasets` library walks the upload tree and silently skips broken symlinks in some versions). Cover image at `release/dataset-cover-image.png` is reused for the HF thumbnail. `release/huggingface/*` and `release/huggingface-instructor/*` are gitignored except for `README.md` — only the dataset card is committed; the upload tree is reassembled on demand. - [x] PR 5.2: `release/HF_DATASET_CARD.md` (legacy single-file stub) deleted — superseded by the generated `release/huggingface/README.md`. -- [x] PR 5.2: 21 new tests (`tests/scripts/test_package_hf_release.py`): every G12.1 YAML field, G12.2 exactly-one-default invariant, YAML round-trip via `yaml.safe_load`, tags-sorted-at-render-time, README rewriting (tree + `../` + validation report links), `HF_TREE_BLOCK_SOURCE` silent-failure guard, unsafe-dir rejection (release dir / parent / `cwd`), assembled-tree resource resolution, idempotent re-assembly, byte-determinism (audit-artifact-sync), committed-README-matches-fresh-regeneration sync check, instructor-variant packaging (G12.4: single config, flattened source dir), and a parametrised `load_dataset()` round-trip per config gated on the optional `datasets` SDK with row-count assertions (3500 / 750 / 750 train / valid / test) covering G12.3 + G12.4. 1216/1216 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape). +- [x] PR 5.2 (self-review pass): brutal review of the first revision caught real bugs the reviewer would otherwise have to call out. Fixes folded into the PR before review: (#1) `run_packager` validate→write order — both packagers were writing the README/metadata even when validation failed; the validation gate now early-returns with `errors` populated and zero artifacts on disk; new tests exercise the no-write path on both sides. (#2) Instructor README was inlining the public 3-tier README for a 1-tier dataset; replaced with a dedicated `INSTRUCTOR_BODY` constant that opens by linking to the public dataset, describes only the instructor-specific additions (full-horizon tables, hidden DAG, latent registry, mechanism summary), and uses the single-tier tree block. (#3) `validate_upload_dir_safe` now also rejects strict descendants of `release_dir` (e.g. `--huggingface-dir release/intro` would otherwise rmtree the intro bundle); allow-list keeps the canonical `release/{kaggle,huggingface,huggingface-instructor}` direct-children. (#4) `[publish]` extra in `pyproject.toml` (`datasets>=2.14`, `kaggle>=1.6`) makes the gated `load_dataset()` / Kaggle-CLI tests installable in a single command — closes the "G12.3/G12.4 untested in CI" gap to a one-line install. (#5) Shared-primitives extraction finished: `SOURCE_TREE_BLOCK`, `validate_readme_substitution`, `replace_file`, `replace_dir`, `load_manifest` all moved to `scripts/_release_common.py`; both packagers reduced to imports. (#6) Hand-rolled YAML renderer (60 lines + brittle quoting heuristic) replaced with `yaml.safe_dump` + a 4-line `_IndentedDumper` subclass that forces indent-2 on top-level sequences. (#7) Dead `--owner` / `--dataset-slug` CLI flags removed (PR 7.2 will add them when actually needed). (#8) `assemble_upload_dir` now takes `rendered_readme` as a parameter and writes it itself; the public name no longer lies about producing a complete tree. (#9) `build_config_for_tier` made pure (no I/O); `_assert_tier_dir_exists` does the cheap manifest-stat preflight. (#10) `--default-config` with `--variant=instructor` now errors instead of silently ignoring. (#11) Instructor tree-diagram drops the hardcoded "9 tables" claim. (#13–#16) Visual cleanups (duplicate divider, ruff-split imports, `COVER_IMAGE_FILENAME`-vs-`Path.name` redundancy, speculative comment about HF split rename). (#17) Test cruft removed (unused `tmp_path`, dead `tag_lines`); em-dash YAML round-trip parametrised for the instructor `pretty_name`. Net: 1223/1223 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape). ### Phase 6 — Notebook sequence + adversarial framing - [ ] `release/notebooks/{02_relational_feature_engineering,03_leakage_and_time_windows,04_lift_calibration_value_ranking}.ipynb` diff --git a/pyproject.toml b/pyproject.toml index 29bacba..b7f1de8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,15 @@ scripts = [ "scikit-learn>=1.3", "matplotlib>=3.7", ] +# Optional dependencies for the platform release packagers. Installing +# this extra (``pip install -e ".[publish]"``) enables the gated +# ``load_dataset()`` / Kaggle-CLI smoke tests that verify G11.3 (Kaggle +# package) and G12.3 / G12.4 (HF load_dataset round-trip) without +# pulling the heavy SDKs into the default dev install. +publish = [ + "datasets>=2.14", + "kaggle>=1.6", +] [project.scripts] leadforge = "leadforge.cli.main:app" diff --git a/release/huggingface-instructor/README.md b/release/huggingface-instructor/README.md index 3fadd54..6725379 100644 --- a/release/huggingface-instructor/README.md +++ b/release/huggingface-instructor/README.md @@ -1,5 +1,5 @@ --- -pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion" +pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion' license: mit language: - en @@ -27,32 +27,24 @@ configs: path: intermediate/tasks/converted_within_90_days/test.parquet --- -# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`) +# LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion -A relational, reproducible, three-tier synthetic CRM dataset family for -teaching lead scoring at scale. Generated by -[leadforge](https://github.com/leadforge-dev/leadforge), an -open-source Python framework for synthetic CRM/funnel data. The -framework version is decoupled from the dataset version: the package -stays at `1.x`; the dataset is published under the explicit `…-v1` -tag. +This is the **research / instructor companion** to the public +[`leadforge/leadforge-lead-scoring-v1`](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) +dataset. It exposes the **full-horizon** view of a single difficulty +tier (`intermediate`) plus the **hidden causal structure** that the +public dataset deliberately redacts: the world graph (DAG), latent +trait registry, mechanism summary, and full-horizon relational tables +including `customers` and `subscriptions`. -## Why lead scoring matters in 2024–2026 +It exists for instructors who want to walk students through how the +public dataset was generated, and for researchers who want to verify +that the public redactions actually remove the leakage paths the +dataset advertises. **It is not a replacement for the public dataset +in any teaching or modelling context** — students should still train +on the public bundle. -Mid-market SaaS vendors entered 2024–2026 with growth slowing and -customer-acquisition costs rising[^macro], so predicting *which* leads -convert within a fixed window has moved from a marketing nicety to a -survival skill. This dataset teaches that skill on a relational -substrate, with the realistic confusions (snapshot-window discipline, -leakage traps, channel signal weaker than vendor blogs imply) that -students will hit when they finally get hands on real CRM data. - -[^macro]: Macroeconomic framing summarised in -[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md) -(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio -rose materially in 2024). - -## What's inside +## What this companion contains ``` . @@ -60,7 +52,7 @@ rose materially in 2024). │ ├── manifest.json # provenance + file hashes │ ├── dataset_card.md # auto-rendered per-bundle card │ ├── feature_dictionary.csv # authoritative column spec -│ ├── tables/*.parquet # 9 full-horizon tables (incl. customers, subscriptions) +│ ├── tables/*.parquet # full-horizon tables (incl. customers, subscriptions) │ ├── tasks/converted_within_90_days/{train,valid,test}.parquet │ └── metadata/ # world_spec, graph.{graphml,json}, latent_registry, etc. ├── README.md # this file (HF dataset card) @@ -68,183 +60,95 @@ rose materially in 2024). └── LICENSE ``` -`student_public` bundles ship the snapshot-safe relational view; -`research_instructor` companions ship the full-horizon view plus the -hidden causal structure (DAG, latent registry, mechanism summary) -under `metadata/`. The full layout is documented in each bundle's -`manifest.json`. +The single ``intermediate`` config exposes the same train/valid/test +parquet splits as the public dataset's ``intermediate`` config — same +seeds, same row counts (3,500 / 750 / 750), same target. The +difference lives in the relational tables and metadata: + +| File | Public `intermediate` | Instructor companion | +|---|---|---| +| `tables/leads.parquet` | redacted (label dropped) | full (label retained) | +| `tables/opportunities.parquet` | snapshot-filtered + redacted | full-horizon, full columns | +| `tables/customers.parquet` | omitted (would leak label) | included | +| `tables/subscriptions.parquet` | omitted (would leak label) | included | +| `tables/touches.parquet` etc. | filtered to ≤ snapshot day | full 90-day horizon | +| `metadata/world_spec.json` | absent | included (DGP + recipe) | +| `metadata/graph.{graphml,json}` | absent | included (hidden DAG) | +| `metadata/latent_registry.json` | absent | included (latent traits) | +| `metadata/mechanism_summary.json` | absent | included (per-edge mechanisms) | + +The redaction contract is single-sourced in +[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py) +and re-applied by +[`leadforge/render/relational_snapshot_safe.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/render/relational_snapshot_safe.py) +when the public bundle is built; this companion is the unfiltered +source view, so the two are always consistent by construction. ## Quick start ```python -# Flat CSV -df = pd.read_csv("intermediate/lead_scoring.csv") - -# Parquet task splits (recommended) -train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet") -test = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet") +from datasets import load_dataset -# Relational tables (feature engineering — example) -leads = pd.read_parquet("intermediate/tables/leads.parquet") -touches = pd.read_parquet("intermediate/tables/touches.parquet") -my_touch_count = ( - touches.groupby("lead_id").size().rename("my_touch_count").reset_index() +# Loads the same train/valid/test splits as the public 'intermediate' +# config; differs only in what `tables/` and `metadata/` provide. +ds = load_dataset( + "leadforge/leadforge-lead-scoring-v1-instructor", + name="intermediate", +) +train = ds["train"].to_pandas() + +# Full-horizon relational tables — includes customers and subscriptions +# (omitted from the public dataset because their existence reconstructs +# the conversion label). +import pandas as pd +customers = pd.read_parquet( + "hf://datasets/leadforge/leadforge-lead-scoring-v1-instructor/intermediate/tables/customers.parquet" ) -features = leads.merge(my_touch_count, on="lead_id", how="left") - -# Reproduce from source -# pip install leadforge -# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \ -# --mode student_public --difficulty intermediate --out my_bundle ``` -The label `converted_within_90_days` resolves over a 90-day window; -engagement features (`touch_count`, `session_count`, etc.) are -computed strictly over events on days `[0, 30]`. The deliberate -exception is `total_touches_all`, the leakage trap — flagged -`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your -feature set unless you're demonstrating leakage detection. - -## Dataset summary - -| | Intro | Intermediate | Advanced | -|---|---|---|---| -| Leads | 5,000 | 5,000 | 5,000 | -| Accounts | 1,500 | 1,500 | 1,500 | -| Contacts | 4,200 | 4,200 | 4,200 | -| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* | -| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` | -| Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% | -| Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% | -| Signal strength | 0.90 | 0.70 | 0.50 | -| Noise scale | 0.10 | 0.30 | 0.55 | -| Missing rate | 2% | 8% | 18% | - -\* `student_public` / `research_instructor`. Difficulty is modulated -by the simulation engine — signal strength on latent-trait weights, -Gaussian noise on float features, MCAR missingness, outlier rate — -not post-hoc label flipping. - -## The scenario - -**Veridian Technologies** is a fictional Series B startup (Austin, US) -selling **Veridian Procure**, a procurement / AP automation SaaS, to -mid-market firms (200–2,000 employees) in the US and UK. The funnel -runs through inbound marketing (45%), SDR outbound (35%), and -partner referrals (20%); four personas drive deals (VP Finance, AP -Manager, IT Director, Procurement Manager). **Task:** predict whether -a lead converts (`closed_won`) within 90 days. ACV bands are -$18k–$120k. See -[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md) -for the full DGP, and the deeper "what's modelled / approximate / not -modelled" breakdown that this README only summarises. - -## Public vs instructor: what's redacted - -Filtering happens **during rendering**, not during simulation. The -redaction contract is single-sourced in -[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py); -the snapshot-safe writer and the validator import the same constants, -so they cannot drift apart. - -| Source-of-truth constant | Public bundle treatment | -|---|---| -| `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` | -| `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` | -| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles | -| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` | -| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` | -| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` | - -Each bundle's `manifest.json` records `relational_snapshot_safe`, -`redacted_columns`, and `snapshot_day`, so the bundle is -self-describing. - -## Calibration - -Every realism / calibration / difficulty claim in this README is -backed by -[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md), -regenerated by -[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py) -with bands declared in -[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml). -Headline cross-seed medians (seeds 42–46): - -| Tier | LR AUC | AP | P@100 | Brier | -|---|---|---|---|---| -| intro | 0.879 | 0.761 | 0.80 | 0.130 | -| intermediate | 0.886 | 0.575 | 0.59 | 0.110 | -| advanced | 0.886 | 0.351 | 0.34 | 0.061 | - -AP, P@100, conversion-rate, and lift orderings hold across the -intended difficulty axis (intro > intermediate > advanced). - ## Intended uses -- Teaching baseline lead-scoring on a flat snapshot. -- Teaching relational feature engineering against snapshot-safe tables. -- Teaching leakage detection (the `total_touches_all` trap is - designed to be discoverable). -- Teaching calibration, lift, P@K, value-aware ranking - (`expected_acv × P(convert)`), and cohort-shift evaluation. -- Comparing model families under a controlled DGP. +- Teaching the **public-vs-instructor split** itself: load both + datasets side-by-side, show students which columns and tables were + redacted, and walk through why each was a leakage path. +- **Verifying the redaction contract:** train a model on the + full-horizon tables, train another on the snapshot-safe public + tables, compare AUC. The gap is the redaction's effect. +- Teaching **causal structure and DGP transparency** using + `metadata/world_spec.json` + `metadata/graph.json`. +- Reproducing the public dataset from the instructor view via + [`leadforge`](https://github.com/leadforge-dev/leadforge/blob/main) source code. ## Out-of-scope uses -- **Production lead scoring.** The company, product, and customers are - fictional. -- **Vendor benchmarking / paper baselines.** Difficulty tiers are - calibrated for pedagogy, not cross-paper comparability. -- **Causal-inference research that requires recovery of the true DGP.** - The instructor companion exposes the hidden graph for teaching, not - designed counterfactuals. -- **Demographic / fairness research.** v1 does not model protected +- **Production lead scoring.** Same as the public dataset; the + company, product, and customers are fictional. +- **Modelling with the unredacted view as a baseline.** Models + trained against the full-horizon tables look strong because they're + directly seeing post-conversion events. That number is not a + baseline; it's the ceiling. +- **Demographic / fairness research.** v1 does not model protected attributes. -## Known limitations - -- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across - every tier. Difficulty is visible in AP, P@K, Brier, and value - capture. Treat AUC as a sanity check, not a difficulty signal. -- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta - is slightly negative in every tier (intro −0.0045, intermediate - −0.0072, advanced −0.0133); v1's snapshot is dominated by linear - features. v2 will inject non-linear interactions in the simulator. -- **Channel signal is weak.** Per - [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md), - out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across - all tiers and the per-channel rate spread is ≤0.05. The simulator - does not encode channel-conditional probabilities; channel-conditional - encoding is post-v1 work. -- **Cohort-shift degradation is small.** v1 has no time-of-year drift - baked in; the cohort-shift gate (G6.4) is informational and will - bite in v2. - ## Composition -- **Entities.** Accounts, contacts, leads, touches, sessions, - sales_activities, opportunities (public); plus customers and - subscriptions (instructor only). Per-row counts per bundle live in - `manifest.json`. -- **Features.** 32 public columns grouped by analytical role in - [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md); - the per-bundle `feature_dictionary.csv` is the authoritative - machine-readable spec. -- **Label.** `converted_within_90_days` (boolean), event-derived from - the simulator. Never sampled directly. -- **Splits.** 70/15/15 train/valid/test, deterministic given seed; - recorded in `tasks/converted_within_90_days/task_manifest.json`. -- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package - version stamped in `manifest.json`. - -## Maintenance, adversarial framing, license - -We *want* the dataset to be broken. Issue templates ship under -`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as -`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships, -`docs/release/v2_decision_log.md` will track every accepted finding -and the design call that came from it. File issues at +- **Entities.** 9 relational tables (accounts, contacts, leads, + touches, sessions, sales_activities, opportunities, customers, + subscriptions); per-row counts in `manifest.json`. +- **Splits.** Identical to the public `intermediate` config: 70/15/15 + train/valid/test, deterministic given seed 42, recorded in + `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package + version stamped in `manifest.json` along with SHA-256 hashes for + every parquet file. +- **Bundle schema version.** 5 (matches the public dataset). + +## Maintenance, license + +We *want* the dataset to be broken. See the +[public dataset card](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) +for the adversarial-framing pointers, the issue templates, and the +break-me guide. File issues at [leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge); PRs welcome. @@ -252,10 +156,11 @@ PRs welcome. |---|---| | Generator | leadforge `1.0.0+` | | Recipe | `b2b_saas_procurement_v1` | -| Canonical seed | 42 (cross-seed sweep: 42–46) | +| Canonical seed | 42 | | Bundle schema version | 5 | -| Format | Parquet (canonical) + CSV (convenience) | +| Format | Parquet (canonical) | | License | MIT — see [LICENSE](LICENSE) | +| Public dataset | [link](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) | -Verify integrity with `leadforge validate `; every file -is hashed in `manifest.json`. +Verify integrity with `leadforge validate `; every file is +hashed in `manifest.json`. diff --git a/release/huggingface/README.md b/release/huggingface/README.md index cdbcf05..885e834 100644 --- a/release/huggingface/README.md +++ b/release/huggingface/README.md @@ -1,5 +1,5 @@ --- -pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1)" +pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1)' license: mit language: - en diff --git a/scripts/_release_common.py b/scripts/_release_common.py index b1b61f5..c57e71e 100644 --- a/scripts/_release_common.py +++ b/scripts/_release_common.py @@ -10,13 +10,17 @@ * rewrite the ``](validation/validation_report.md)`` link to a GitHub blob URL — the validation report does not ship with either upload tree; -* refuse to assemble into ``cwd`` / the release dir / its parent / the - filesystem anchor (the assembler rmtrees children of the upload dir); +* refuse to assemble into ``cwd`` / the release dir / its parent / a + descendant of the release dir / the filesystem anchor (the assembler + rmtrees children of the upload dir); * validate the dataset cover image's dimensions against Kaggle's floor - (HF reuses the same PNG for its thumbnail). + (HF reuses the same PNG for its thumbnail); +* perform idempotent file/dir replacement during upload-tree assembly; +* read tier ``manifest.json`` payloads; +* guard the source-side tree diagram in ``release/README.md`` against + drifting away from the platform packagers' ``replace()`` substring + (silent-failure trap). -Lifting the four to one module keeps the rules in one place — if Kaggle -ever extends the dimension floor, both packagers pick it up. ``FieldDescriptor`` / ``ResourceSchema`` / dtype maps are deliberately NOT extracted: HF infers schema from parquet via ``load_dataset`` and does not need a Frictionless-shaped declaration. @@ -24,10 +28,12 @@ from __future__ import annotations +import json import re +import shutil from dataclasses import dataclass from pathlib import Path -from typing import Final +from typing import Any, Final from PIL import Image @@ -62,15 +68,9 @@ def rewrite_release_links(text: str) -> str: # --------------------------------------------------------------------------- -# Cover-image validation (G11.2; HF reuses the same asset) +# Validation error type — declared before any validator that returns it # --------------------------------------------------------------------------- -#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with -#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable -#: dimension for thumbnails so the Kaggle floor is the binding one. -COVER_IMAGE_MIN_WIDTH: Final[int] = 560 -COVER_IMAGE_MIN_HEIGHT: Final[int] = 280 - @dataclass(frozen=True) class ValidationError: @@ -80,6 +80,72 @@ class ValidationError: message: str +# --------------------------------------------------------------------------- +# Source-side tree-diagram guard +# +# ``release/README.md`` (PR 4.1) contains a "What's inside" tree +# diagram describing the SOURCE-REPO layout. Each packager substitutes +# this block for its own upload-tree diagram via ``str.replace``. +# Keeping the source-side block as a single shared constant means the +# README's tree diagram drifting away from the constant is caught +# loudly by the same guard regardless of which packager runs first. +# --------------------------------------------------------------------------- + +SOURCE_TREE_BLOCK: Final[str] = """``` +release/ +├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier +│ ├── manifest.json # provenance + file hashes +│ ├── dataset_card.md # auto-rendered per-bundle card +│ ├── feature_dictionary.csv # authoritative column spec +│ ├── lead_scoring.csv # flat convenience CSV (all splits) +│ ├── tables/*.parquet # 7 snapshot-safe relational tables +│ └── tasks/converted_within_90_days/{train,valid,test}.parquet +├── intermediate_instructor/ # research companion: full-horizon tables + metadata/ +├── notebooks/01_baseline_lead_scoring.ipynb +└── validation/ # validation_report.{json,md} + figures +```""" + + +def validate_readme_substitution(release_dir: Path, *, packager_name: str) -> list[ValidationError]: + """Guard against silent drift between the README's tree diagram + and ``SOURCE_TREE_BLOCK``. + + Plain string ``replace()`` silently no-ops when the substring is + not found, which would publish the source-repo tree diagram on the + target platform. ``packager_name`` is folded into the error message + so the trace says which packager noticed the drift first. + """ + + readme = release_dir / "README.md" + if not readme.exists(): + return [] # No README is itself a release-day issue, but not this validator's concern. + if SOURCE_TREE_BLOCK not in readme.read_text(encoding="utf-8"): + return [ + ValidationError( + field="release/README.md", + message=( + f"SOURCE_TREE_BLOCK not found verbatim in release/README.md; " + f"the source-repo tree diagram in the README has drifted from " + f"the constant in scripts/_release_common.py — the " + f"{packager_name} README rewrite will silently no-op until " + f"the README and SOURCE_TREE_BLOCK are reconciled." + ), + ) + ] + return [] + + +# --------------------------------------------------------------------------- +# Cover-image validation (G11.2; HF reuses the same asset) +# --------------------------------------------------------------------------- + +#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with +#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable +#: dimension for thumbnails so the Kaggle floor is the binding one. +COVER_IMAGE_MIN_WIDTH: Final[int] = 560 +COVER_IMAGE_MIN_HEIGHT: Final[int] = 280 + + def validate_cover_image(path: Path) -> list[ValidationError]: """Validate that ``path`` exists and meets the dimension floor.""" @@ -116,18 +182,89 @@ def validate_upload_dir_safe(upload_dir: Path, release_dir: Path, *, kind: str) """Refuse to assemble into a path that aliases something dangerous. The packagers replace children of ``upload_dir`` (rmtree + recopy) - so pointing it at ``cwd`` / ``release_dir`` / their parents / the - filesystem anchor would clobber unrelated content. ``kind`` is - folded into the error message so the trace says which packager - refused (``kaggle`` / ``huggingface`` / ``huggingface-instructor``). + so pointing it at any of the following would clobber unrelated + content: + + * ``cwd`` or its anchor (``/`` on POSIX); + * ``release_dir`` itself; + * a parent of ``release_dir``; + * a descendant of ``release_dir`` other than the canonical platform + output dirs (e.g. ``release/intro`` would rmtree the intro + bundle). + + The safe location for an upload tree is a sibling of ``release/`` + or a fresh subdirectory of ``release/`` that the packager owns + (e.g. ``release/kaggle/``, ``release/huggingface/``). We allow + those by checking the resolved path against the explicit blocklist + and against descendant containment in ``release_dir``. + + ``kind`` is folded into the error message so the trace says which + packager refused (``kaggle`` / ``huggingface`` / + ``huggingface-instructor``). """ resolved = upload_dir.resolve() + release_resolved = release_dir.resolve() blocked = { Path(resolved.anchor), Path.cwd().resolve(), - release_dir.resolve(), - release_dir.resolve().parent, + release_resolved, + release_resolved.parent, } if resolved in blocked: raise ValueError(f"refusing to assemble into unsafe --{kind}-dir: {upload_dir}") + # Disallow descendants of release_dir other than direct children + # owned by a packager. ``resolved.parent == release_resolved`` is + # the canonical ok-case (release/kaggle, release/huggingface, + # release/huggingface-instructor); deeper nesting would alias a + # tier bundle dir. + if resolved.is_relative_to(release_resolved) and resolved.parent != release_resolved: + raise ValueError( + f"refusing to assemble into unsafe --{kind}-dir: {upload_dir} " + f"(would alias contents inside {release_dir})" + ) + + +# --------------------------------------------------------------------------- +# Filesystem helpers — idempotent file / dir replacement +# --------------------------------------------------------------------------- + + +def replace_file(src: Path, dst: Path) -> None: + """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``.""" + + if dst.is_symlink() or dst.is_file(): + dst.unlink() + elif dst.exists() and dst.is_dir(): + shutil.rmtree(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + + +def replace_dir(src: Path, dst: Path) -> None: + """Copy directory ``src`` → ``dst``, replacing any existing entry.""" + + if dst.is_symlink() or dst.is_file(): + dst.unlink() + elif dst.exists() and dst.is_dir(): + shutil.rmtree(dst) + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copytree(src, dst) + + +# --------------------------------------------------------------------------- +# Manifest reading +# --------------------------------------------------------------------------- + + +def load_manifest(path: Path) -> dict[str, Any]: + """Read a tier's ``manifest.json`` and return the parsed dict. + + Raises ``ValueError`` when the JSON parses to anything other than a + top-level object (the packagers index into it expecting a dict). + """ + + payload = json.loads(path.read_text(encoding="utf-8")) + if not isinstance(payload, dict): + raise ValueError(f"manifest.json at {path} is not a JSON object") + return payload diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py index 17a72db..382bed5 100644 --- a/scripts/package_hf_release.py +++ b/scripts/package_hf_release.py @@ -4,8 +4,8 @@ PR 5.2 — second of two PRs in Phase 5 (Platform packaging) of the v1 release roadmap. This script: -1. Reads each public tier's ``manifest.json`` + flat CSV header under - ``release/`` and assembles a Hugging Face dataset card +1. Reads each public tier's ``manifest.json`` under ``release/`` and + assembles a Hugging Face dataset card (``release/huggingface/README.md``) whose YAML frontmatter satisfies G12.1 of ``docs/release/v1_acceptance_gates.md``: ``pretty_name``, ``license: mit``, ``language: en``, ``task_categories: @@ -26,38 +26,46 @@ ``leadforge-lead-scoring-v1-instructor`` companion repo (G12.4) from ``release/intermediate_instructor/`` into a separate ``release/huggingface-instructor/`` tree with one config - (``intermediate``). Same code path; different defaults. + (``intermediate``). The instructor variant ships its OWN dataset + card body (``INSTRUCTOR_BODY``) — inlining the public README would + leak three-tier prose into a single-tier dataset. The actual ``huggingface_hub.HfApi().upload_folder()`` call lives in PR 7.2; this script is intentionally publish-free. ``--dry-run`` writes the README only and skips upload-tree assembly, useful for README iteration. -Failed validation exits with rc=1; pre-flight errors (missing release -dir, missing tier, unsafe ``--huggingface-dir``) exit with rc=2. +Validation discipline: ``run_packager`` runs every check first and +returns ``errors`` WITHOUT writing any artifact when validation fails. +The CLI converts ``errors`` into rc=1. Pre-flight problems (missing +release dir, unsafe ``--huggingface-dir``) raise and exit with rc=2 +without touching disk. """ from __future__ import annotations import argparse -import json -import shutil import sys from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path from typing import Any, Final +import yaml + # Make ``scripts/`` importable regardless of how this file is loaded. sys.path.insert(0, str(Path(__file__).resolve().parent)) -# ``GITHUB_BLOB_BASE`` is re-exported for tests and downstream callers -# (mirrors the symbol surface of ``scripts/package_kaggle_release.py``). from _release_common import ( # noqa: E402,F401 — must follow sys.path insert GITHUB_BLOB_BASE, + SOURCE_TREE_BLOCK, ValidationError, + load_manifest, + replace_dir, + replace_file, rewrite_release_links, validate_cover_image, + validate_readme_substitution, validate_upload_dir_safe, ) @@ -66,17 +74,18 @@ # https://huggingface.co/docs/hub/datasets-cards) # --------------------------------------------------------------------------- -#: Allowed ``size_categories`` tokens (HF taxonomy). Each public tier -#: has 5,000 leads (3500 train / 750 valid / 750 test) so ``1K str: - """Apply the HF-specific rewrites to a copy of the release README. - - Rewrites: +def _hf_public_readme_text(readme: str) -> str: + """Rewrite the public release README into the HF public dataset card. - 1. Source-repo tree diagram → upload-tree diagram (the published - README should describe what the *user* sees on HF, not the - source repo layout). - 2. ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` and - ``](validation/...)`` → blob URL — both via - ``rewrite_release_links`` from ``_release_common``. - - The instructor variant calls this with - ``tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK``; otherwise the - public upload tree is used. + 1. Source-repo tree diagram → public upload-tree diagram. + 2. ``](../foo)`` and ``](validation/...)`` → GitHub blob URLs + (delegated to ``rewrite_release_links``). """ - text = readme.replace(HF_TREE_BLOCK_SOURCE, tree_block) - text = rewrite_release_links(text) - return text - + text = readme.replace(SOURCE_TREE_BLOCK, HF_UPLOAD_TREE_BLOCK) + return rewrite_release_links(text) -def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]: - """Guard against silent drift between the README's tree diagram and - ``HF_TREE_BLOCK_SOURCE`` (mirrors PR 5.1's Kaggle-side guard). - Plain string ``replace()`` silently no-ops when the substring is - not found, which would publish the source-repo tree diagram on HF. - """ +# --------------------------------------------------------------------------- +# Instructor-companion body +# +# Inlining the public README into the instructor card published 3-tier +# prose for a 1-tier dataset (PR 5.2 self-review #2). The instructor +# package serves a different audience — researchers who already read +# the public dataset card — so it ships a focused, self-contained body +# that links to the public dataset for shared framing. +# +# The constant is the WHOLE markdown body (no YAML; that comes from the +# generated frontmatter). It is byte-stable; the audit-artifact-sync +# test catches any drift against the committed instructor README. +# --------------------------------------------------------------------------- - readme = release_dir / "README.md" - if not readme.exists(): - return [] - if HF_TREE_BLOCK_SOURCE not in readme.read_text(encoding="utf-8"): - return [ - ValidationError( - field="release/README.md", - message=( - "HF_TREE_BLOCK_SOURCE not found verbatim in release/README.md; " - "the source-repo tree diagram in the README has drifted from " - "the constant in scripts/package_hf_release.py — the " - "HF README rewrite will silently no-op until the README and " - "HF_TREE_BLOCK_SOURCE are reconciled." - ), - ) - ] - return [] +INSTRUCTOR_BODY: Final[str] = f"""\ +# LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion + +This is the **research / instructor companion** to the public +[`leadforge/leadforge-lead-scoring-v1`](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) +dataset. It exposes the **full-horizon** view of a single difficulty +tier (`intermediate`) plus the **hidden causal structure** that the +public dataset deliberately redacts: the world graph (DAG), latent +trait registry, mechanism summary, and full-horizon relational tables +including `customers` and `subscriptions`. + +It exists for instructors who want to walk students through how the +public dataset was generated, and for researchers who want to verify +that the public redactions actually remove the leakage paths the +dataset advertises. **It is not a replacement for the public dataset +in any teaching or modelling context** — students should still train +on the public bundle. + +## What this companion contains + +{HF_INSTRUCTOR_UPLOAD_TREE_BLOCK} + +The single ``intermediate`` config exposes the same train/valid/test +parquet splits as the public dataset's ``intermediate`` config — same +seeds, same row counts (3,500 / 750 / 750), same target. The +difference lives in the relational tables and metadata: + +| File | Public `intermediate` | Instructor companion | +|---|---|---| +| `tables/leads.parquet` | redacted (label dropped) | full (label retained) | +| `tables/opportunities.parquet` | snapshot-filtered + redacted | full-horizon, full columns | +| `tables/customers.parquet` | omitted (would leak label) | included | +| `tables/subscriptions.parquet` | omitted (would leak label) | included | +| `tables/touches.parquet` etc. | filtered to ≤ snapshot day | full 90-day horizon | +| `metadata/world_spec.json` | absent | included (DGP + recipe) | +| `metadata/graph.{{graphml,json}}` | absent | included (hidden DAG) | +| `metadata/latent_registry.json` | absent | included (latent traits) | +| `metadata/mechanism_summary.json` | absent | included (per-edge mechanisms) | + +The redaction contract is single-sourced in +[`leadforge/validation/leakage_probes.py`]({GITHUB_BLOB_BASE}/leadforge/validation/leakage_probes.py) +and re-applied by +[`leadforge/render/relational_snapshot_safe.py`]({GITHUB_BLOB_BASE}/leadforge/render/relational_snapshot_safe.py) +when the public bundle is built; this companion is the unfiltered +source view, so the two are always consistent by construction. + +## Quick start + +```python +from datasets import load_dataset + +# Loads the same train/valid/test splits as the public 'intermediate' +# config; differs only in what `tables/` and `metadata/` provide. +ds = load_dataset( + "leadforge/leadforge-lead-scoring-v1-instructor", + name="intermediate", +) +train = ds["train"].to_pandas() + +# Full-horizon relational tables — includes customers and subscriptions +# (omitted from the public dataset because their existence reconstructs +# the conversion label). +import pandas as pd +customers = pd.read_parquet( + "hf://datasets/leadforge/leadforge-lead-scoring-v1-instructor/intermediate/tables/customers.parquet" +) +``` + +## Intended uses + +- Teaching the **public-vs-instructor split** itself: load both + datasets side-by-side, show students which columns and tables were + redacted, and walk through why each was a leakage path. +- **Verifying the redaction contract:** train a model on the + full-horizon tables, train another on the snapshot-safe public + tables, compare AUC. The gap is the redaction's effect. +- Teaching **causal structure and DGP transparency** using + `metadata/world_spec.json` + `metadata/graph.json`. +- Reproducing the public dataset from the instructor view via + [`leadforge`]({GITHUB_BLOB_BASE}) source code. + +## Out-of-scope uses + +- **Production lead scoring.** Same as the public dataset; the + company, product, and customers are fictional. +- **Modelling with the unredacted view as a baseline.** Models + trained against the full-horizon tables look strong because they're + directly seeing post-conversion events. That number is not a + baseline; it's the ceiling. +- **Demographic / fairness research.** v1 does not model protected + attributes. + +## Composition + +- **Entities.** 9 relational tables (accounts, contacts, leads, + touches, sessions, sales_activities, opportunities, customers, + subscriptions); per-row counts in `manifest.json`. +- **Splits.** Identical to the public `intermediate` config: 70/15/15 + train/valid/test, deterministic given seed 42, recorded in + `tasks/converted_within_90_days/task_manifest.json`. +- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package + version stamped in `manifest.json` along with SHA-256 hashes for + every parquet file. +- **Bundle schema version.** 5 (matches the public dataset). + +## Maintenance, license + +We *want* the dataset to be broken. See the +[public dataset card](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) +for the adversarial-framing pointers, the issue templates, and the +break-me guide. File issues at +[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge); +PRs welcome. + +| Field | Value | +|---|---| +| Generator | leadforge `1.0.0+` | +| Recipe | `b2b_saas_procurement_v1` | +| Canonical seed | 42 | +| Bundle schema version | 5 | +| Format | Parquet (canonical) | +| License | MIT — see [LICENSE](LICENSE) | +| Public dataset | [link](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) | + +Verify integrity with `leadforge validate `; every file is +hashed in `manifest.json`. +""" # --------------------------------------------------------------------------- @@ -264,9 +351,9 @@ class ConfigEntry: class HuggingFaceCard: """Top-level HF dataset-card payload. - ``body`` is the markdown that follows the YAML frontmatter — by - default the rewritten ``release/README.md`` (G12.1 expects the - README to BE the dataset card; the YAML lives at the top). + ``body`` is the markdown that follows the YAML frontmatter — the + rewritten public README for the public variant, the + ``INSTRUCTOR_BODY`` constant for the instructor variant. """ pretty_name: str @@ -369,51 +456,27 @@ def validate_card(card: HuggingFaceCard) -> list[ValidationError]: # --------------------------------------------------------------------------- -# Bundle reading + config building +# Card construction # --------------------------------------------------------------------------- -def _load_manifest(path: Path) -> dict[str, Any]: - payload = json.loads(path.read_text(encoding="utf-8")) - if not isinstance(payload, dict): - raise ValueError(f"manifest.json at {path} is not a JSON object") - return payload - - -def build_config_for_tier( - release_dir: Path, +def _config_for_tier( *, - tier_dir: str, config_name: str, task: str = DEFAULT_TASK, default: bool = False, ) -> ConfigEntry: - """Build a single ``configs[]`` entry for one tier. - - ``tier_dir`` is the directory name under the upload tree - (``intro`` / ``intermediate`` / ``advanced`` for the public - variant; ``intermediate`` for the instructor companion which - flattens ``intermediate_instructor/`` → ``intermediate/`` in the - upload tree). ``config_name`` is what users pass to - ``load_dataset(..., name=...)``. - - The function reads ``manifest.json`` to confirm the tier exists - and the task splits are declared; it does NOT fail when files are - missing on disk because the upload tree is reassembled by - :func:`assemble_upload_dir` and the manifest is the single source - of truth for what should be there. + """Pure constructor — no I/O. + + Builds a ``ConfigEntry`` from the canonical task-split layout. + The earlier draft also opened the manifest to check the task was + declared, but that's I/O for a structural check that + :func:`assemble_upload_dir` already enforces (the parquet copies + fail loud if the source paths are missing). Keeping it pure makes + card construction work without a fully-populated release dir, + which is what tests want. """ - manifest_path = release_dir / tier_dir / "manifest.json" - if not manifest_path.exists(): - raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest_path}") - manifest = _load_manifest(manifest_path) - if task not in manifest.get("tasks", {}): - raise ValueError( - f"task {task!r} not declared in manifest at {manifest_path}; " - f"got {sorted(manifest.get('tasks', {}))!r}" - ) - data_files = tuple( DataFileEntry( split=hf_split, @@ -424,6 +487,20 @@ def build_config_for_tier( return ConfigEntry(config_name=config_name, data_files=data_files, default=default) +def _assert_tier_dir_exists(release_dir: Path, tier_dir: str) -> None: + """Cheap preflight: tier dir must contain a ``manifest.json``. + + Catches user typos at ``--dry-run`` time rather than waiting for + ``shutil.copytree`` to fail during assembly. No JSON parse — we + only care that the tier exists; the schema is the assembler's + concern. + """ + + manifest = release_dir / tier_dir / "manifest.json" + if not manifest.exists(): + raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest}") + + def build_card( release_dir: Path, *, @@ -441,62 +518,40 @@ def build_card( """Assemble a ``HuggingFaceCard`` from the release tree. ``variant="public"`` builds the three-tier card pointing at - ``release/{intro,intermediate,advanced}/``; + ``release/{intro,intermediate,advanced}/`` and uses the rewritten + public README as the body. + ``variant="instructor"`` builds a single-config card pointing at - ``release/intermediate_instructor/`` (flattened to - ``intermediate/`` in the upload tree). - - When ``body`` is ``None`` (the default) we lift the contents of - ``release/README.md`` and apply the HF-specific rewrites — HF - renders the body as the dataset card so a full README there is - more useful than a curated blurb (mirrors the Kaggle packager's - description handling). + ``release/intermediate_instructor/`` (flattened to ``intermediate/`` + in the upload tree) and uses :data:`INSTRUCTOR_BODY` as the body. + The instructor variant gets a focused body rather than the public + README to avoid publishing 3-tier prose for a 1-tier dataset. """ if variant == "public": - public_tiers = ( - ("intro", "intro"), - ("intermediate", "intermediate"), - ("advanced", "advanced"), - ) + for tier in DEFAULT_PUBLIC_TIERS: + _assert_tier_dir_exists(release_dir, tier) configs = tuple( - build_config_for_tier( - release_dir, - tier_dir=tier_dir, - config_name=config_name, + _config_for_tier( + config_name=tier, task=task, - default=(config_name == default_config), + default=(tier == default_config), ) - for tier_dir, config_name in public_tiers + for tier in DEFAULT_PUBLIC_TIERS ) if pretty_name is None: pretty_name = DEFAULT_PRETTY_NAME if body is None: - body = _hf_readme_text( + body = _hf_public_readme_text( (release_dir / "README.md").read_text(encoding="utf-8"), - tree_block=HF_UPLOAD_TREE_BLOCK, ) elif variant == "instructor": - # Companion repo flattens ``intermediate_instructor`` → - # ``intermediate`` in the upload tree so the HF dataset slug - # ``leadforge-lead-scoring-v1-instructor`` exposes the - # familiar config name. - configs = ( - build_config_for_tier( - release_dir, - tier_dir="intermediate_instructor", - config_name="intermediate", - task=task, - default=True, - ), - ) + _assert_tier_dir_exists(release_dir, "intermediate_instructor") + configs = (_config_for_tier(config_name="intermediate", task=task, default=True),) if pretty_name is None: pretty_name = DEFAULT_INSTRUCTOR_PRETTY_NAME if body is None: - body = _hf_readme_text( - (release_dir / "README.md").read_text(encoding="utf-8"), - tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK, - ) + body = INSTRUCTOR_BODY else: raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')") @@ -515,74 +570,71 @@ def build_card( # --------------------------------------------------------------------------- # Rendering # -# YAML is hand-rolled rather than dumped by PyYAML because the HF -# dataset-card YAML has a precise key order and indentation style that -# the viewer is fussy about (and PyYAML's default flow style collapses -# the configs list into a single line). Hand-rolling also makes the -# determinism contract obvious in the source. -# --------------------------------------------------------------------------- +# YAML serialization uses ``yaml.safe_dump`` with a custom dumper that +# forces indent-2 on top-level sequences (HF dataset-card examples in +# the wild use this style; PyYAML's default puts top-level list items +# at indent 0). The earlier draft hand-rolled the renderer with a +# brittle quoting heuristic; PR 5.2's self-review caught the hack and +# replaced it with PyYAML. # --------------------------------------------------------------------------- -def _yaml_string(value: str) -> str: - """Render a YAML string scalar. +class _IndentedDumper(yaml.SafeDumper): + """Force indent-2 on top-level sequences. - HF parses the frontmatter with PyYAML. We use double-quoted - strings only when the value contains characters that would change - YAML semantics (``:``, leading ``- ``, leading whitespace). Plain - scalars are used otherwise so the YAML stays readable. + PyYAML's default ``increase_indent`` returns ``indentless=True`` + when emitting a top-level block sequence, putting ``- item`` at + column 0. HF dataset cards conventionally indent these by 2 (per + the examples in the HF docs), so we override the flag to ``False``. """ - if value == "": - return '""' - needs_quoting = any(c in value for c in ":#&*!|>'\"%@`") - needs_quoting = needs_quoting or value.startswith(("- ", " ", "?", "[", "{", "*", "&")) - needs_quoting = needs_quoting or value.endswith(" ") - if not needs_quoting: - return value - escaped = value.replace("\\", "\\\\").replace('"', '\\"') - return f'"{escaped}"' + def increase_indent(self, flow: bool = False, indentless: bool = False) -> Any: # noqa: ARG002 + return super().increase_indent(flow, False) + +def _frontmatter_dict(card: HuggingFaceCard) -> dict[str, Any]: + """Build the dict serialised into YAML. -def _yaml_list(values: Sequence[str], *, indent: int) -> list[str]: - """Render a YAML block-style list.""" + Tags are sorted at render time (mirrors the keyword sort in the + Kaggle packager) so dataclass order doesn't leak into the rendered + file. The dict is built in field order so PyYAML preserves it + (with ``sort_keys=False``). + """ - pad = " " * indent - return [f"{pad}- {_yaml_string(v)}" for v in values] + return { + "pretty_name": card.pretty_name, + "license": card.license, + "language": list(card.language), + "task_categories": list(card.task_categories), + "size_categories": list(card.size_categories), + "tags": sorted(card.tags), + "configs": [ + { + "config_name": config.config_name, + **({"default": True} if config.default else {}), + "data_files": [{"split": df.split, "path": df.path} for df in config.data_files], + } + for config in card.configs + ], + } def render_yaml_frontmatter(card: HuggingFaceCard) -> str: """Render the YAML frontmatter as a deterministic string. - Order: ``pretty_name``, ``license``, ``language``, - ``task_categories``, ``size_categories``, ``tags``, ``configs``. - Tags are sorted at render time (mirrors the keyword sort in the - Kaggle packager) so order on the dataclass doesn't leak into the - rendered file. + The leading and trailing ``---`` markers are included so callers + can concatenate this directly to the body. """ - lines: list[str] = ["---"] - lines.append(f"pretty_name: {_yaml_string(card.pretty_name)}") - lines.append(f"license: {_yaml_string(card.license)}") - lines.append("language:") - lines.extend(_yaml_list(card.language, indent=2)) - lines.append("task_categories:") - lines.extend(_yaml_list(card.task_categories, indent=2)) - lines.append("size_categories:") - lines.extend(_yaml_list(card.size_categories, indent=2)) - lines.append("tags:") - lines.extend(_yaml_list(sorted(card.tags), indent=2)) - lines.append("configs:") - for config in card.configs: - lines.append(f" - config_name: {_yaml_string(config.config_name)}") - if config.default: - lines.append(" default: true") - lines.append(" data_files:") - for df in config.data_files: - lines.append(f" - split: {_yaml_string(df.split)}") - lines.append(f" path: {_yaml_string(df.path)}") - lines.append("---") - return "\n".join(lines) + "\n" + payload = yaml.dump( + _frontmatter_dict(card), + Dumper=_IndentedDumper, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + width=200, + ) + return f"---\n{payload}---\n" def render_card(card: HuggingFaceCard) -> str: @@ -599,41 +651,23 @@ def render_card(card: HuggingFaceCard) -> str: # --------------------------------------------------------------------------- -def _replace_file(src: Path, dst: Path) -> None: - """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``.""" - - if dst.is_symlink() or dst.is_file(): - dst.unlink() - elif dst.exists() and dst.is_dir(): - shutil.rmtree(dst) - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) - - -def _replace_dir(src: Path, dst: Path) -> None: - """Copy directory ``src`` → ``dst``, replacing any existing entry.""" - - if dst.is_symlink() or dst.is_file(): - dst.unlink() - elif dst.exists() and dst.is_dir(): - shutil.rmtree(dst) - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copytree(src, dst) - - def assemble_upload_dir( release_dir: Path, upload_dir: Path, *, variant: str = "public", + rendered_readme: str, cover_image: Path = DEFAULT_COVER_IMAGE, ) -> None: """Assemble ``upload_dir`` for ``huggingface_hub.upload_folder()``. - Mirrors PR 5.1's Kaggle assembler: real-file copies of the per- - tier bundles + cover image + LICENSE + the rewritten README. No - symlinks (the ``datasets`` library walks the upload tree and silent - skips broken symlinks in some versions). + Writes the README, copies the cover image and LICENSE, and copies + each tier bundle as a real-file copy (no symlinks; ``datasets`` + walks the tree and silently skips broken symlinks in some + versions). ``rendered_readme`` is the already-validated and + rendered card text — passing it in (rather than reading it back + from ``run_packager``) means this function produces a complete + upload tree on its own. For ``variant="instructor"``, the source directory ``intermediate_instructor/`` is flattened to ``intermediate/`` in @@ -645,25 +679,31 @@ def assemble_upload_dir( validate_upload_dir_safe(upload_dir, release_dir, kind=kind) upload_dir.mkdir(parents=True, exist_ok=True) + # README — written from the validated card text. + readme_path = upload_dir / "README.md" + if readme_path.is_symlink() or readme_path.is_file(): + readme_path.unlink() + readme_path.write_text(rendered_readme, encoding="utf-8") + # Cover image — reuse the one ``release/`` already ships. cover_src = release_dir / cover_image.name if not cover_src.exists(): cover_src = cover_image if cover_src.exists(): - _replace_file(cover_src, upload_dir / COVER_IMAGE_FILENAME) + replace_file(cover_src, upload_dir / cover_image.name) # LICENSE — straight copy, no rewriting. license_src = release_dir / "LICENSE" if license_src.exists(): - _replace_file(license_src, upload_dir / "LICENSE") + replace_file(license_src, upload_dir / "LICENSE") # Per-tier bundles — full directory copies. The instructor variant # flattens its source dir name. if variant == "public": for tier in DEFAULT_PUBLIC_TIERS: - _replace_dir(release_dir / tier, upload_dir / tier) + replace_dir(release_dir / tier, upload_dir / tier) elif variant == "instructor": - _replace_dir( + replace_dir( release_dir / "intermediate_instructor", upload_dir / "intermediate", ) @@ -699,6 +739,11 @@ def run_packager( ) -> PackagerOutcome: """Build, validate, and write the HF dataset card. + Validation discipline: build → validate → ONLY THEN write. If + validation fails the function returns an outcome with populated + ``errors`` and **no artifact on disk**. The CLI converts errors + to rc=1. + With ``dry_run=False`` (the default) the packager additionally assembles the HF-loadable upload directory under ``huggingface_dir`` (real-file copies of the per-tier bundles + @@ -720,24 +765,43 @@ def run_packager( errors: list[ValidationError] = [] errors.extend(validate_card(card)) errors.extend(validate_cover_image(cover_image)) - errors.extend(_validate_readme_substitution(release_dir)) + if variant == "public": + # The instructor body doesn't substitute SOURCE_TREE_BLOCK — + # it's a self-contained markdown — so the substitution guard + # only applies to the public variant. + errors.extend(validate_readme_substitution(release_dir, packager_name="HF")) readme_path = huggingface_dir / "README.md" - readme_path.parent.mkdir(parents=True, exist_ok=True) - readme_path.write_text(render_card(card), encoding="utf-8") - if not dry_run: + # Validation gate: don't leave broken artifacts on disk. + if errors: + return PackagerOutcome( + card=card, + readme_path=readme_path, + errors=tuple(errors), + assembled=False, + ) + + rendered = render_card(card) + huggingface_dir.mkdir(parents=True, exist_ok=True) + + if dry_run: + # README-only — fast iteration path. Write directly, don't + # invoke the assembler. + readme_path.write_text(rendered, encoding="utf-8") + else: assemble_upload_dir( release_dir, huggingface_dir, variant=variant, + rendered_readme=rendered, cover_image=cover_image, ) return PackagerOutcome( card=card, readme_path=readme_path, - errors=tuple(errors), + errors=(), assembled=not dry_run, ) @@ -778,25 +842,8 @@ def _parse_args(argv: Sequence[str] | None) -> argparse.Namespace: default=DEFAULT_DEFAULT_CONFIG, help=( "config that carries default: true (G12.2; " - "default: %(default)s for public; ignored for instructor)" - ), - ) - parser.add_argument( - "--owner", - default=DEFAULT_OWNER, - help=( - "HF dataset owner — currently informational; PR 7.2 will " - "consume it via ``huggingface_hub.HfApi().upload_folder``. " - "default: %(default)s" - ), - ) - parser.add_argument( - "--dataset-slug", - default=None, - help=( - "HF dataset slug — currently informational; PR 7.2 will " - "consume it. defaults: leadforge-lead-scoring-v1 (public) / " - "leadforge-lead-scoring-v1-instructor (instructor)" + "default: %(default)s; rejected for --variant=instructor " + "which has only one config)" ), ) parser.add_argument( @@ -820,6 +867,16 @@ def main(argv: Sequence[str] | None = None) -> int: DEFAULT_HUGGINGFACE_DIR if variant == "public" else DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR ) + # Reject silent misconfiguration — instructor variant has one + # config and ``--default-config`` is meaningless for it. + if variant == "instructor" and args.default_config != DEFAULT_DEFAULT_CONFIG: + print( + f"error: --default-config={args.default_config!r} is not valid with " + f"--variant=instructor (instructor has a single 'intermediate' config)", + file=sys.stderr, + ) + return 2 + try: outcome = run_packager( args.release_dir, diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py index b8221c0..c83d5c9 100644 --- a/scripts/package_kaggle_release.py +++ b/scripts/package_kaggle_release.py @@ -43,7 +43,6 @@ import argparse import json import re -import shutil import sys from collections.abc import Sequence from dataclasses import dataclass, field @@ -59,24 +58,25 @@ # ``importlib.util.spec_from_file_location`` from the test suite). sys.path.insert(0, str(Path(__file__).resolve().parent)) -# Shared release-packaging primitives extracted in PR 5.2 so the HF -# packager can reuse the link-rewriter, the dir-safety guard, and the -# cover-image validator without duplicating logic. -from _release_common import ( # noqa: E402 — must follow sys.path insert +# Shared release-packaging primitives. PR 5.2 extracted these so the +# Kaggle and HF packagers share one source of truth for link rewriting, +# upload-dir safety, cover-image validation, source-tree-block drift +# detection, idempotent file/dir replacement, and manifest reading. +# ``GITHUB_BLOB_BASE`` is re-exported for tests asserting blob URLs in +# rewritten README content; its presence here is a public-symbol +# contract, not a local consumer. +from _release_common import ( # noqa: E402,F401 — must follow sys.path insert GITHUB_BLOB_BASE, + SOURCE_TREE_BLOCK, ValidationError, + load_manifest, + replace_dir, + replace_file, + rewrite_release_links, validate_cover_image, + validate_readme_substitution, validate_upload_dir_safe, ) -from _release_common import ( - PARENT_RELATIVE_LINK_RE as _PARENT_RELATIVE_LINK, -) -from _release_common import ( - VALIDATION_REPORT_LINK as _VALIDATION_REPORT_LINK, -) -from _release_common import ( - VALIDATION_REPORT_URL as _VALIDATION_REPORT_URL, -) # --------------------------------------------------------------------------- # Kaggle field constraints (chatgpt v2 §19, verified from official docs) @@ -210,30 +210,14 @@ class UserSource: # --------------------------------------------------------------------------- # # ``GITHUB_BLOB_BASE``, the ``](../foo)`` regex, and the validation- -# report link rewrite live in ``_release_common`` (PR 5.2). Tree-block -# substitution stays local because the upload tree differs per -# platform (Kaggle ships ``dataset-metadata.json`` + cover image at -# the top; HF does not). +# report link rewrite live in ``_release_common`` (PR 5.2), as does the +# source-side tree-diagram constant (``SOURCE_TREE_BLOCK``). Only the +# Kaggle-specific upload-tree diagram lives here. -#: The "What's inside" tree diagram in ``release/README.md``. The -#: published README on Kaggle should describe the *upload* layout +#: The published README on Kaggle should describe the *upload* layout #: (which has dataset-metadata.json + cover image at the top, no #: instructor companion, no notebooks/validation siblings), not the #: source-repo layout — we substitute the block on the way out. -KAGGLE_TREE_BLOCK: Final[str] = """``` -release/ -├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier -│ ├── manifest.json # provenance + file hashes -│ ├── dataset_card.md # auto-rendered per-bundle card -│ ├── feature_dictionary.csv # authoritative column spec -│ ├── lead_scoring.csv # flat convenience CSV (all splits) -│ ├── tables/*.parquet # 7 snapshot-safe relational tables -│ └── tasks/converted_within_90_days/{train,valid,test}.parquet -├── intermediate_instructor/ # research companion: full-horizon tables + metadata/ -├── notebooks/01_baseline_lead_scoring.ipynb -└── validation/ # validation_report.{json,md} + figures -```""" - KAGGLE_UPLOAD_TREE_BLOCK: Final[str] = """``` . ├── intro/ intermediate/ advanced/ # student_public bundles, one per difficulty tier @@ -255,20 +239,15 @@ def _kaggle_readme_text(readme: str) -> str: Rewrites: - 1. Source-repo tree diagram → upload-tree diagram (the published - README should describe what the *user* sees on Kaggle, not the - source repo layout). - 2. ``](../foo)`` → ``]({GITHUB_BLOB_BASE}/foo)`` (markdown links - that escape the bundle root resolve to the source repo on - GitHub). - 3. ``](validation/validation_report.md)`` → blob URL (the - validation report does not ship to Kaggle; readers click - through to GitHub). + 1. ``SOURCE_TREE_BLOCK`` → ``KAGGLE_UPLOAD_TREE_BLOCK`` (the + published README should describe what the *user* sees on + Kaggle, not the source repo layout). + 2. ``](../foo)`` and ``](validation/validation_report.md)`` → + GitHub blob URLs (delegated to ``rewrite_release_links``). """ - text = readme.replace(KAGGLE_TREE_BLOCK, KAGGLE_UPLOAD_TREE_BLOCK) - text = _PARENT_RELATIVE_LINK.sub(rf"]({GITHUB_BLOB_BASE}/\1)", text) - text = text.replace(_VALIDATION_REPORT_LINK, _VALIDATION_REPORT_URL) + text = readme.replace(SOURCE_TREE_BLOCK, KAGGLE_UPLOAD_TREE_BLOCK) + text = rewrite_release_links(text) return text @@ -553,11 +532,7 @@ def fields_from_parquet(path: Path) -> tuple[FieldDescriptor, ...]: return tuple(FieldDescriptor(name=f.name, type=_kaggle_type_from_arrow(f.type)) for f in schema) -def _load_manifest(path: Path) -> dict[str, Any]: - payload = json.loads(path.read_text(encoding="utf-8")) - if not isinstance(payload, dict): - raise ValueError(f"manifest.json at {path} is not a JSON object") - return payload +# ``_load_manifest`` is now ``load_manifest`` in ``_release_common``. def build_tier_resources( @@ -583,7 +558,7 @@ def build_tier_resources( flat_csv_path = tier_dir / "lead_scoring.csv" fields = _flat_csv_fields(flat_csv_path, feature_dict) - manifest = _load_manifest(tier_dir / "manifest.json") + manifest = load_manifest(tier_dir / "manifest.json") table_inventory = manifest.get("tables", {}) snapshot_day = manifest.get("snapshot_day") @@ -785,38 +760,8 @@ def render_metadata_json(metadata: DatasetMetadata) -> str: # --------------------------------------------------------------------------- -def _validate_kaggle_dir_safe(kaggle_dir: Path, release_dir: Path) -> None: - """Refuse to assemble into a path that aliases something dangerous. - - Thin wrapper that forwards to ``_release_common.validate_upload_dir_safe`` - with ``kind="kaggle"`` so the error message identifies which packager - refused. The standalone function is kept for the test in - ``tests/scripts/test_package_kaggle_release.py`` that imports it directly. - """ - - validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle") - - -def _replace_file(src: Path, dst: Path) -> None: - """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``.""" - - if dst.is_symlink() or dst.is_file(): - dst.unlink() - elif dst.exists() and dst.is_dir(): - shutil.rmtree(dst) - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) - - -def _replace_dir(src: Path, dst: Path) -> None: - """Copy directory ``src`` → ``dst``, replacing any existing entry.""" - - if dst.is_symlink() or dst.is_file(): - dst.unlink() - elif dst.exists() and dst.is_dir(): - shutil.rmtree(dst) - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copytree(src, dst) +# ``_validate_kaggle_dir_safe``, ``_replace_file``, ``_replace_dir`` +# now live in ``_release_common`` (PR 5.2) under their public names. def assemble_upload_dir( @@ -836,25 +781,26 @@ def assemble_upload_dir( children — switching to copies removes that fragility at the cost of ~15 MB of disk per assembly run, which is gitignored anyway. - Re-running the assembly is idempotent: ``_replace_file`` and - ``_replace_dir`` rmtree-then-copy any existing entry. The README - is the one file rewritten on the way in (tree diagram + ``../`` - links). ``--dry-run`` skips this whole function. + Re-running the assembly is idempotent: ``replace_file`` and + ``replace_dir`` (from ``_release_common``) rmtree-then-copy any + existing entry. The README is the one file rewritten on the way + in (tree diagram + ``../`` links). ``--dry-run`` skips this + whole function. """ - _validate_kaggle_dir_safe(kaggle_dir, release_dir) + validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle") kaggle_dir.mkdir(parents=True, exist_ok=True) # Cover image. cover_src = release_dir / cover_image.name if not cover_src.exists(): cover_src = cover_image - _replace_file(cover_src, kaggle_dir / cover_image.name) + replace_file(cover_src, kaggle_dir / cover_image.name) # LICENSE — straight copy, no rewriting. license_src = release_dir / "LICENSE" if license_src.exists(): - _replace_file(license_src, kaggle_dir / "LICENSE") + replace_file(license_src, kaggle_dir / "LICENSE") # README.md — real copy with link rewriting so ``../`` links and # the directory diagram resolve correctly on the Kaggle dataset @@ -873,7 +819,7 @@ def assemble_upload_dir( # Per-tier bundles — full directory copies. for tier in tiers: tier_src = release_dir / tier - _replace_dir(tier_src, kaggle_dir / tier) + replace_dir(tier_src, kaggle_dir / tier) # --------------------------------------------------------------------------- @@ -941,9 +887,24 @@ def run_packager( errors: list[ValidationError] = [] errors.extend(validate_metadata(metadata)) errors.extend(validate_cover_image(cover_image)) - errors.extend(_validate_readme_substitution(release_dir)) + errors.extend(validate_readme_substitution(release_dir, packager_name="Kaggle")) metadata_path = kaggle_dir / "dataset-metadata.json" + + # Validation gate: if anything failed, return errors WITHOUT writing + # any artifact. Writing on validation failure leaves a corrupt + # metadata file on disk that can be accidentally committed; the + # earlier draft did this and PR 5.2's self-review caught it. The + # CLI converts ``errors`` into rc=1; callers who want a record of + # the corrupt artifact can dry-run after fixing inputs. + if errors: + return PackagerOutcome( + metadata=metadata, + metadata_path=metadata_path, + errors=tuple(errors), + assembled=False, + ) + metadata_path.parent.mkdir(parents=True, exist_ok=True) metadata_path.write_text(render_metadata_json(metadata), encoding="utf-8") @@ -953,42 +914,11 @@ def run_packager( return PackagerOutcome( metadata=metadata, metadata_path=metadata_path, - errors=tuple(errors), + errors=(), assembled=not dry_run, ) -def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]: - """Guard against silent drift between the README's tree diagram - and ``KAGGLE_TREE_BLOCK``. - - ``_kaggle_readme_text`` substitutes the source-repo tree diagram - for the upload-tree diagram via plain string replace. If the - README's tree changes by even one whitespace character, the - substitution silently no-ops and the published Kaggle dataset - card shows the source-repo tree (with ``intermediate_instructor/``, - ``notebooks/``, ``validation/``). We catch that case here. - """ - - readme = release_dir / "README.md" - if not readme.exists(): - return [] # No README is itself a release-day issue, but not this validator's concern. - if KAGGLE_TREE_BLOCK not in readme.read_text(encoding="utf-8"): - return [ - ValidationError( - field="release/README.md", - message=( - "KAGGLE_TREE_BLOCK not found verbatim in release/README.md; " - "the source-repo tree diagram in the README has drifted from " - "the constant in scripts/package_kaggle_release.py — the " - "Kaggle description rewrite will silently no-op until the " - "README and KAGGLE_TREE_BLOCK are reconciled." - ), - ) - ] - return [] - - # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py index 1328c8d..0d0a9ea 100644 --- a/tests/scripts/test_package_hf_release.py +++ b/tests/scripts/test_package_hf_release.py @@ -14,7 +14,7 @@ * the README link-rewriting that lets the published dataset card on HF keep working ``../`` links (rewritten to GitHub blob URLs) and a directory diagram that reflects the upload layout, plus a guard - that the source ``HF_TREE_BLOCK_SOURCE`` is still present verbatim + that the source ``SOURCE_TREE_BLOCK`` (in ``_release_common``) is still present verbatim in the README (silent-failure trap; mirrors PR 5.1's KAGGLE block) * the assembled upload tree resolves every declared resource path * the safety net that refuses to assemble into ``cwd`` / @@ -47,6 +47,7 @@ _RELEASE_BUNDLES_PRESENT = (_RELEASE_DIR / "intro" / "manifest.json").exists() _INSTRUCTOR_BUNDLE_PRESENT = (_RELEASE_DIR / "intermediate_instructor" / "manifest.json").exists() _COMMITTED_README = _REPO_ROOT / "release" / "huggingface" / "README.md" +_COMMITTED_INSTRUCTOR_README = _REPO_ROOT / "release" / "huggingface-instructor" / "README.md" _COMMITTED_COVER = _REPO_ROOT / "release" / "dataset-cover-image.png" @@ -120,7 +121,7 @@ def test_validate_card_reports_every_field_violation() -> None: assert "configs" in fields -def test_validate_card_requires_exactly_one_default(tmp_path: Path) -> None: +def test_validate_card_requires_exactly_one_default() -> None: """G12.2 — exactly one config carries ``default: true``.""" base = _minimal_card() @@ -184,21 +185,33 @@ def test_validate_card_flags_data_files_without_split_or_path() -> None: # --------------------------------------------------------------------------- -def test_render_yaml_frontmatter_round_trips_through_pyyaml() -> None: +@pytest.mark.parametrize( + "pretty_name", + [ + packager.DEFAULT_PRETTY_NAME, + # Em-dash flavor — matches the instructor variant; locks down + # PyYAML round-tripping for non-ASCII content (regression + # guard for the PR 5.2 self-review note about the hand-rolled + # renderer's incomplete coverage). + packager.DEFAULT_INSTRUCTOR_PRETTY_NAME, + ], +) +def test_render_yaml_frontmatter_round_trips_through_pyyaml(pretty_name: str) -> None: """The rendered YAML must parse cleanly via ``yaml.safe_load``. - HF parses dataset-card frontmatter with PyYAML; if our hand-rolled - renderer drifts from valid YAML, the dataset card silently drops - its metadata on the HF page. + HF parses dataset-card frontmatter with PyYAML; if our renderer + drifts from valid YAML, the dataset card silently drops its + metadata on the HF page. """ - card = _minimal_card() + base = _minimal_card() + card = packager.HuggingFaceCard(**{**base.__dict__, "pretty_name": pretty_name}) yaml_text = packager.render_yaml_frontmatter(card) # ``render_yaml_frontmatter`` includes the leading and trailing # ``---`` markers. Strip them before feeding to safe_load. inner = yaml_text.strip().strip("-").strip() parsed = yaml.safe_load(inner) - assert parsed["pretty_name"] == card.pretty_name + assert parsed["pretty_name"] == pretty_name assert parsed["license"] == "mit" assert parsed["language"] == ["en"] assert parsed["task_categories"] == [packager.HF_TASK_CATEGORY] @@ -216,22 +229,8 @@ def test_render_yaml_tags_sorted_at_render_time() -> None: **{**base.__dict__, "tags": ("zebra", "alpha", "mango")}, ) rendered = packager.render_yaml_frontmatter(shuffled) - # Tag block lines, in the rendered order. - tag_lines = [ - line.strip().lstrip("- ") for line in rendered.splitlines() if line.startswith(" - ") - ] - # The first three list-item lines are language, task_categories, - # size_categories — find tag block by position after ``tags:``. - lines = rendered.splitlines() - tags_idx = lines.index("tags:") - block: list[str] = [] - for line in lines[tags_idx + 1 :]: - if line.startswith(" - "): - block.append(line[4:].strip()) - else: - break - assert block == ["alpha", "mango", "zebra"] - _ = tag_lines # silence linter + parsed = yaml.safe_load(rendered.strip().strip("-").strip()) + assert parsed["tags"] == ["alpha", "mango", "zebra"] # --------------------------------------------------------------------------- @@ -240,9 +239,9 @@ def test_render_yaml_tags_sorted_at_render_time() -> None: @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") -def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None: +def test_hf_public_readme_text_rewrites_links_and_tree_diagram() -> None: readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8") - rewritten = packager._hf_readme_text(readme) + rewritten = packager._hf_public_readme_text(readme) # Source-repo tree → upload tree. assert "intermediate_instructor/" not in rewritten @@ -260,10 +259,10 @@ def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None: @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") -def test_hf_tree_block_source_present_in_release_readme() -> None: +def test_source_tree_block_is_present_in_release_readme() -> None: """Silent-failure guard. - ``_hf_readme_text`` substitutes ``HF_TREE_BLOCK_SOURCE`` → + ``_hf_public_readme_text`` substitutes ``SOURCE_TREE_BLOCK`` → ``HF_UPLOAD_TREE_BLOCK`` via plain string replace. If anyone tweaks the README's tree diagram by even one whitespace character, the substitution silently no-ops and the published HF @@ -272,28 +271,28 @@ def test_hf_tree_block_source_present_in_release_readme() -> None: """ readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8") - assert packager.HF_TREE_BLOCK_SOURCE in readme, ( - "scripts/package_hf_release.py HF_TREE_BLOCK_SOURCE no longer matches " + assert packager.SOURCE_TREE_BLOCK in readme, ( + "scripts/_release_common.py SOURCE_TREE_BLOCK no longer matches " "the tree diagram in release/README.md — reconcile the two before " "the next HF README regeneration." ) def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None: - """``_validate_readme_substitution`` is wired into the run-time - validator, not just the static guard above.""" + """``validate_readme_substitution`` (now in ``_release_common``) is + wired into the run-time validator, not just the static guard above.""" fake_release = tmp_path / "release" fake_release.mkdir() (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8") - errors = packager._validate_readme_substitution(fake_release) + errors = packager.validate_readme_substitution(fake_release, packager_name="HF") assert errors assert errors[0].field == "release/README.md" - assert "HF_TREE_BLOCK_SOURCE" in errors[0].message + assert "SOURCE_TREE_BLOCK" in errors[0].message if _RELEASE_BUNDLES_PRESENT: # Sanity: the real release README does NOT trigger the validator. - assert packager._validate_readme_substitution(_RELEASE_DIR) == [] + assert packager.validate_readme_substitution(_RELEASE_DIR, packager_name="HF") == [] # --------------------------------------------------------------------------- @@ -307,9 +306,9 @@ def test_assemble_upload_dir_rejects_unsafe_huggingface_dir(tmp_path: Path) -> N fake_release = tmp_path / "release" fake_release.mkdir() with pytest.raises(ValueError, match="unsafe"): - packager.assemble_upload_dir(fake_release, fake_release) + packager.assemble_upload_dir(fake_release, fake_release, rendered_readme="") with pytest.raises(ValueError, match="unsafe"): - packager.assemble_upload_dir(fake_release, fake_release.parent) + packager.assemble_upload_dir(fake_release, fake_release.parent, rendered_readme="") def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd( @@ -328,7 +327,48 @@ def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd( cwd.mkdir() monkeypatch.chdir(cwd) with pytest.raises(ValueError, match="unsafe"): - packager.assemble_upload_dir(fake_release, cwd) + packager.assemble_upload_dir(fake_release, cwd, rendered_readme="") + + +def test_assemble_upload_dir_rejects_descendant_of_release_dir(tmp_path: Path) -> None: + """Refuse to assemble into a strict descendant of ``release_dir``. + + Per PR 5.2 self-review #3: ``--huggingface-dir release/intro`` + would otherwise rmtree the intro tier bundle. Only direct + children of ``release_dir`` are allowed (the canonical + ``release/huggingface``, ``release/huggingface-instructor``, + ``release/kaggle`` shapes). + """ + + fake_release = tmp_path / "release" + (fake_release / "intro" / "tables").mkdir(parents=True) + deep = fake_release / "intro" / "tables" # 2 levels under release_dir + with pytest.raises(ValueError, match="unsafe"): + packager.assemble_upload_dir(fake_release, deep, rendered_readme="") + + +def test_assemble_upload_dir_allows_canonical_child(tmp_path: Path) -> None: + """Direct-child of release_dir IS the canonical safe location. + + ``release/huggingface`` is allowed; only deeper nesting trips the + descendant guard. + """ + + fake_release = tmp_path / "release" + fake_release.mkdir() + safe_child = fake_release / "huggingface" + # Should not raise. The function may still fail later because the + # source bundles aren't present, but the safety guard must let it + # through. + try: + packager.assemble_upload_dir(fake_release, safe_child, rendered_readme="") + except ValueError as exc: + if "unsafe" in str(exc): + raise AssertionError(f"safety guard rejected canonical child: {exc}") from exc + except FileNotFoundError: + # Expected — fake_release has no tier bundles, so the copytree + # call fails after the safety check. That's the right shape. + pass @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") @@ -358,7 +398,7 @@ def test_assembled_upload_dir_resolves_every_declared_data_file(tmp_path: Path) # Top-level required artefacts. assert (upload_dir / "README.md").is_file() - assert (upload_dir / packager.COVER_IMAGE_FILENAME).is_file() + assert (upload_dir / _COMMITTED_COVER.name).is_file() assert (upload_dir / "LICENSE").is_file() @@ -402,6 +442,56 @@ def test_main_reports_missing_release_dir( assert "release directory not found" in captured.err +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None: + """Validation failure must NOT leave a corrupt README on disk + (PR 5.2 self-review #1). + + Forces a validation failure by passing a cover image that is too + small. Asserts the README path doesn't materialise and + ``outcome.errors`` is populated. + """ + + tiny_cover = tmp_path / "tiny.png" + from PIL import Image + + Image.new("RGB", (10, 10), (0, 0, 0)).save(tiny_cover) + out_dir = tmp_path / "huggingface" + outcome = packager.run_packager(_RELEASE_DIR, huggingface_dir=out_dir, cover_image=tiny_cover) + assert outcome.errors, "expected at least one validation error" + assert not (out_dir / "README.md").exists(), "README must not be written when validation fails" + assert outcome.assembled is False + + +def test_main_rejects_default_config_with_instructor_variant( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + """``--default-config`` is meaningless for ``--variant=instructor`` + (only one config); silently accepting it is a misconfiguration + (PR 5.2 self-review #10). + """ + + rc = packager.main( + [ + "--release-dir", + str(_RELEASE_DIR), + "--huggingface-dir", + str(tmp_path / "hf"), + "--variant", + "instructor", + "--default-config", + "advanced", + "--cover-image", + str(_COMMITTED_COVER), + "--dry-run", + ] + ) + captured = capsys.readouterr() + assert rc == 2 + assert "--default-config" in captured.err + assert "instructor" in captured.err + + # --------------------------------------------------------------------------- # Determinism + sync with committed artefact # --------------------------------------------------------------------------- @@ -473,6 +563,32 @@ def test_committed_hf_readme_matches_fresh_regeneration(tmp_path: Path) -> None: assert packager.GITHUB_BLOB_BASE in body +@pytest.mark.skipif( + not (_INSTRUCTOR_BUNDLE_PRESENT and _COMMITTED_INSTRUCTOR_README.exists()), + reason="instructor bundle or committed instructor README missing", +) +def test_committed_instructor_readme_matches_fresh_regeneration(tmp_path: Path) -> None: + """Audit-artifact-sync for the instructor companion. + + A fresh instructor regeneration must match + ``release/huggingface-instructor/README.md`` byte-for-byte. Locks + down the dedicated ``INSTRUCTOR_BODY`` constant introduced in PR + 5.2 self-review #2. + """ + + fresh_dir = tmp_path / "huggingface-instructor" + packager.run_packager( + _RELEASE_DIR, + huggingface_dir=fresh_dir, + variant="instructor", + cover_image=_COMMITTED_COVER, + dry_run=True, + ) + fresh_bytes = (fresh_dir / "README.md").read_bytes() + committed_bytes = _COMMITTED_INSTRUCTOR_README.read_bytes() + assert fresh_bytes == committed_bytes + + # --------------------------------------------------------------------------- # Instructor companion (G12.4) # --------------------------------------------------------------------------- @@ -500,10 +616,21 @@ def test_run_packager_instructor_variant_packages_independently(tmp_path: Path) # ``intermediate/`` in the upload tree. for df in only.data_files: assert (upload_dir / df.path).is_file() - # Instructor body uses the instructor tree block. + # Instructor body is the dedicated INSTRUCTOR_BODY constant, not + # the public README inlined verbatim — locks down PR 5.2 self- + # review fix #2 (3-tier prose was leaking into the 1-tier card). body = outcome.card.body - assert "metadata/" in body # instructor metadata sibling visible - assert "9 full-horizon tables" in body + assert body is packager.INSTRUCTOR_BODY + # Public-tier names must NOT appear in the instructor body. + assert "intro" not in body.lower().split() # word boundary, not substring + # The instructor body talks about the instructor companion role. + assert "Instructor companion" in body + assert "redaction" in body.lower() + assert "metadata/world_spec" in body + # Tree block reflects the instructor (single-tier) layout. + assert "intermediate/" in body + # No leakage of public tree elements. + assert "intro/ intermediate/ advanced/" not in body # --------------------------------------------------------------------------- diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py index 351950b..3790b5f 100644 --- a/tests/scripts/test_package_kaggle_release.py +++ b/tests/scripts/test_package_kaggle_release.py @@ -7,8 +7,8 @@ * the README link-rewriting that lets the published dataset card on Kaggle keep working ``../`` links (rewritten to GitHub blob URLs) and a directory diagram that reflects the upload layout, plus a - guard that the source ``KAGGLE_TREE_BLOCK`` is still present - verbatim in the README (silent-failure trap) + guard that the source ``SOURCE_TREE_BLOCK`` (in ``_release_common``) + is still present verbatim in the README (silent-failure trap) * the assembled upload tree resolves every declared resource path (so ``kaggle datasets create`` can find each file) * the safety net that refuses to assemble into ``cwd`` / @@ -224,10 +224,10 @@ def test_kaggle_readme_text_rewrites_links_and_tree_diagram() -> None: @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") -def test_kaggle_tree_block_is_present_in_release_readme() -> None: +def test_source_tree_block_is_present_in_release_readme() -> None: """Silent-failure guard. - ``_kaggle_readme_text`` substitutes ``KAGGLE_TREE_BLOCK`` → + ``_kaggle_readme_text`` substitutes ``SOURCE_TREE_BLOCK`` → ``KAGGLE_UPLOAD_TREE_BLOCK`` via plain string replace. If anyone tweaks the README's tree diagram by even one whitespace character, the substitution silently no-ops and the published @@ -236,8 +236,8 @@ def test_kaggle_tree_block_is_present_in_release_readme() -> None: """ readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8") - assert packager.KAGGLE_TREE_BLOCK in readme, ( - "scripts/package_kaggle_release.py KAGGLE_TREE_BLOCK no longer matches " + assert packager.SOURCE_TREE_BLOCK in readme, ( + "scripts/_release_common.py SOURCE_TREE_BLOCK no longer matches " "the tree diagram in release/README.md — reconcile the two before " "the next release-metadata regeneration." ) @@ -245,19 +245,19 @@ def test_kaggle_tree_block_is_present_in_release_readme() -> None: @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None: - """``_validate_readme_substitution`` is wired into the run-time - validator, not just the static guard above.""" + """``validate_readme_substitution`` (now in ``_release_common``) is + wired into the run-time validator, not just the static guard above.""" fake_release = tmp_path / "release" fake_release.mkdir() (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8") - errors = packager._validate_readme_substitution(fake_release) + errors = packager.validate_readme_substitution(fake_release, packager_name="Kaggle") assert errors assert errors[0].field == "release/README.md" - assert "KAGGLE_TREE_BLOCK" in errors[0].message + assert "SOURCE_TREE_BLOCK" in errors[0].message # Sanity: the real release README does NOT trigger the validator. - assert packager._validate_readme_substitution(_RELEASE_DIR) == [] + assert packager.validate_readme_substitution(_RELEASE_DIR, packager_name="Kaggle") == [] @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") @@ -368,6 +368,27 @@ def test_assemble_upload_dir_idempotent_against_existing_tree(tmp_path: Path) -> # --------------------------------------------------------------------------- +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None: + """A failed validation must NOT leave a corrupt metadata file on + disk (PR 5.2 self-review fix). + + Forces a validation failure by passing a cover image that is too + small for Kaggle's 560×280 floor. Asserts the metadata path + doesn't materialise and ``outcome.errors`` is populated. + """ + + tiny_cover = tmp_path / "tiny.png" + Image.new("RGB", (10, 10), (0, 0, 0)).save(tiny_cover) + out_dir = tmp_path / "kaggle" + outcome = packager.run_packager(_RELEASE_DIR, kaggle_dir=out_dir, cover_image=tiny_cover) + assert outcome.errors, "expected at least one validation error" + assert not (out_dir / "dataset-metadata.json").exists(), ( + "metadata file must not be written when validation fails" + ) + assert outcome.assembled is False + + def test_main_reports_missing_release_dir( tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: From 42260a80e9dbea4d12ae725e7912741a89355c6a Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 7 May 2026 00:03:32 +0300 Subject: [PATCH 3/3] PR 5.2 Copilot-review fixes (Kaggle + HF packagers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fold Copilot's two real findings on the self-review revision back in. COPILOT-1 — validate_upload_dir_safe was only invoked inside assemble_upload_dir, which --dry-run skips. A dry-run with --huggingface-dir release (or .) would write the README into the unsafe path BEFORE the safety net fired. Hoist the check into run_packager (both packagers) so it runs before any mkdir or write; the inner assemble_upload_dir call stays as defence-in-depth for direct callers. New tests: dry-run with unsafe upload-dir raises without writing; the same path through main() returns rc=2. COPILOT-2 — Cover-image path resolution was inconsistent: validate_cover_image used cover_image as passed, while assemble_upload_dir did a separate ``release_dir / cover_image.name`` fallback. Diverged for bare-basename inputs (false validation failures) and two-paths-sharing-a-basename (assembler shadowing the explicit path). Added resolve_cover_image_path() to _release_common.py (explicit-wins, release-dir fallback); run_packager calls it once and threads the resolved path through validation, the metadata's image field, and assembly. New tests/scripts/test_release_common.py covers the four resolution branches; new packager-side tests confirm bare-basename success + metadata field plumbing. COPILOT-3 — outdated; already addressed by self-review fix #8 in commit f2fc4a2. Resolved as already treated; no code change. Verification: 1232/1232 tests pass + 5 gated skips; ruff + mypy clean; hash determinism PASS 67/67; leakage probes rc=0 on every tier; validate_release_candidate --no-rebuild exits 0; BUNDLE_SCHEMA_VERSION unchanged at 5. release/{kaggle,huggingface,huggingface-instructor}/* artifacts regenerated byte-identically. Co-Authored-By: Claude Opus 4.7 --- .agent-plan.md | 1 + scripts/_release_common.py | 31 ++++++ scripts/package_hf_release.py | 31 ++++-- scripts/package_kaggle_release.py | 31 ++++-- tests/scripts/test_package_hf_release.py | 89 +++++++++++++++++ tests/scripts/test_package_kaggle_release.py | 54 ++++++++++ tests/scripts/test_release_common.py | 100 +++++++++++++++++++ 7 files changed, 321 insertions(+), 16 deletions(-) create mode 100644 tests/scripts/test_release_common.py diff --git a/.agent-plan.md b/.agent-plan.md index 6e8c359..6ee4d2d 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -54,6 +54,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family - [x] PR 5.2: `scripts/_release_common.py` (new) — extracted shared release-packaging primitives so the Kaggle + HF packagers share one source of truth: `GITHUB_BLOB_BASE`, the `](../foo)` regex, the validation-report blob-URL substitution, `validate_cover_image` (dimension floor; HF reuses Kaggle's 560×280), and `validate_upload_dir_safe` (refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor). `FieldDescriptor` / `ResourceSchema` / dtype maps are intentionally NOT extracted — HF infers schema from parquet via `load_dataset` and does not need a Frictionless declaration. `scripts/package_kaggle_release.py` refactored to import these primitives; PR 5.1's external behaviour and committed `release/kaggle/dataset-metadata.json` are byte-stable across the refactor. - [x] PR 5.2: Upload-dir assembly under `release/huggingface/` and `release/huggingface-instructor/` uses real-file copies (not symlinks; mirrors PR 5.1 — the `datasets` library walks the upload tree and silently skips broken symlinks in some versions). Cover image at `release/dataset-cover-image.png` is reused for the HF thumbnail. `release/huggingface/*` and `release/huggingface-instructor/*` are gitignored except for `README.md` — only the dataset card is committed; the upload tree is reassembled on demand. - [x] PR 5.2: `release/HF_DATASET_CARD.md` (legacy single-file stub) deleted — superseded by the generated `release/huggingface/README.md`. +- [x] PR 5.2 (Copilot review pass): folded Copilot's two real findings on the self-review revision back in before requesting human review. (COPILOT-1) `validate_upload_dir_safe` was only called inside `assemble_upload_dir`, which `--dry-run` skips — a user passing `--huggingface-dir release` (or `.`, etc.) in dry-run mode would write a README into the unsafe path before the safety net fired. Hoisted the check into `run_packager` (both packagers) so it runs BEFORE any mkdir or write; the inner `assemble_upload_dir` call stays as defence-in-depth for callers that bypass `run_packager`. (COPILOT-2) Cover-image path resolution was inconsistent: `validate_cover_image` used `cover_image` as passed, while `assemble_upload_dir` did its own ``release_dir / cover_image.name`` fallback — diverged for bare-basename inputs (false validation failures) and for two-paths-sharing-a-basename inputs (assembler could shadow the explicit path). Added `resolve_cover_image_path` to `scripts/_release_common.py` (explicit-wins, with release-dir fallback for bare basenames); `run_packager` calls it once and threads the resolved path through validation, metadata's `image` field, and assembly so every consumer agrees. (COPILOT-3) outdated docstring: `assemble_upload_dir` no longer claims to write the README it doesn't write — already addressed by self-review fix #8 in commit f2fc4a2; resolved as already treated. Net: 1232/1232 tests pass + 5 gated skips (4 + new resolver coverage in `tests/scripts/test_release_common.py`); ruff + mypy clean; hash determinism PASS 67/67; leakage probes 0/3 reconstruct on every tier; `validate_release_candidate --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5. - [x] PR 5.2 (self-review pass): brutal review of the first revision caught real bugs the reviewer would otherwise have to call out. Fixes folded into the PR before review: (#1) `run_packager` validate→write order — both packagers were writing the README/metadata even when validation failed; the validation gate now early-returns with `errors` populated and zero artifacts on disk; new tests exercise the no-write path on both sides. (#2) Instructor README was inlining the public 3-tier README for a 1-tier dataset; replaced with a dedicated `INSTRUCTOR_BODY` constant that opens by linking to the public dataset, describes only the instructor-specific additions (full-horizon tables, hidden DAG, latent registry, mechanism summary), and uses the single-tier tree block. (#3) `validate_upload_dir_safe` now also rejects strict descendants of `release_dir` (e.g. `--huggingface-dir release/intro` would otherwise rmtree the intro bundle); allow-list keeps the canonical `release/{kaggle,huggingface,huggingface-instructor}` direct-children. (#4) `[publish]` extra in `pyproject.toml` (`datasets>=2.14`, `kaggle>=1.6`) makes the gated `load_dataset()` / Kaggle-CLI tests installable in a single command — closes the "G12.3/G12.4 untested in CI" gap to a one-line install. (#5) Shared-primitives extraction finished: `SOURCE_TREE_BLOCK`, `validate_readme_substitution`, `replace_file`, `replace_dir`, `load_manifest` all moved to `scripts/_release_common.py`; both packagers reduced to imports. (#6) Hand-rolled YAML renderer (60 lines + brittle quoting heuristic) replaced with `yaml.safe_dump` + a 4-line `_IndentedDumper` subclass that forces indent-2 on top-level sequences. (#7) Dead `--owner` / `--dataset-slug` CLI flags removed (PR 7.2 will add them when actually needed). (#8) `assemble_upload_dir` now takes `rendered_readme` as a parameter and writes it itself; the public name no longer lies about producing a complete tree. (#9) `build_config_for_tier` made pure (no I/O); `_assert_tier_dir_exists` does the cheap manifest-stat preflight. (#10) `--default-config` with `--variant=instructor` now errors instead of silently ignoring. (#11) Instructor tree-diagram drops the hardcoded "9 tables" claim. (#13–#16) Visual cleanups (duplicate divider, ruff-split imports, `COVER_IMAGE_FILENAME`-vs-`Path.name` redundancy, speculative comment about HF split rename). (#17) Test cruft removed (unused `tmp_path`, dead `tag_lines`); em-dash YAML round-trip parametrised for the instructor `pretty_name`. Net: 1223/1223 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape). ### Phase 6 — Notebook sequence + adversarial framing diff --git a/scripts/_release_common.py b/scripts/_release_common.py index c57e71e..3a59858 100644 --- a/scripts/_release_common.py +++ b/scripts/_release_common.py @@ -146,6 +146,37 @@ def validate_readme_substitution(release_dir: Path, *, packager_name: str) -> li COVER_IMAGE_MIN_HEIGHT: Final[int] = 280 +def resolve_cover_image_path(cover_image: Path, release_dir: Path) -> Path: + """Resolve the effective cover-image path used by the packagers. + + The packagers run a validator against the cover image and a copy + step against the same image; the earlier draft did the path + resolution in the assembler only, which meant validate + assemble + could disagree about WHICH file to use (Copilot review on PR #72, + addressed in self-review pass #2). + + Resolution rule — explicit-wins, with a release-dir fallback: + + 1. If ``cover_image`` exists at the path given, return it as-is. + 2. Otherwise, if ``release_dir / cover_image.name`` exists, return + that — lets users pass a bare basename when the file lives + under ``release/``. + 3. Otherwise return ``cover_image`` unchanged so the cover-image + validator surfaces a clean ``not found at `` error. + + Two paths sharing a basename DO NOT cause the release-dir copy to + silently shadow the explicit path — the explicit path's existence + is the first check and wins. + """ + + if cover_image.exists(): + return cover_image + fallback = release_dir / cover_image.name + if fallback.exists(): + return fallback + return cover_image + + def validate_cover_image(path: Path) -> list[ValidationError]: """Validate that ``path`` exists and meets the dimension floor.""" diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py index 382bed5..31ca3fd 100644 --- a/scripts/package_hf_release.py +++ b/scripts/package_hf_release.py @@ -63,6 +63,7 @@ load_manifest, replace_dir, replace_file, + resolve_cover_image_path, rewrite_release_links, validate_cover_image, validate_readme_substitution, @@ -685,12 +686,11 @@ def assemble_upload_dir( readme_path.unlink() readme_path.write_text(rendered_readme, encoding="utf-8") - # Cover image — reuse the one ``release/`` already ships. - cover_src = release_dir / cover_image.name - if not cover_src.exists(): - cover_src = cover_image - if cover_src.exists(): - replace_file(cover_src, upload_dir / cover_image.name) + # Cover image — copy the resolved path as-is. Path resolution + # happens once in ``run_packager`` via ``resolve_cover_image_path`` + # so the validator and the assembler agree on which file to use. + if cover_image.exists(): + replace_file(cover_image, upload_dir / cover_image.name) # LICENSE — straight copy, no rewriting. license_src = release_dir / "LICENSE" @@ -754,6 +754,21 @@ def run_packager( if not release_dir.exists(): raise FileNotFoundError(f"release directory not found: {release_dir}") + # Hoist the upload-dir safety check BEFORE any mkdir or write — + # including the dry-run path (Copilot review on PR #72). The + # earlier draft only checked inside ``assemble_upload_dir``, so a + # dry run with ``--huggingface-dir .`` would happily write a + # README into ``cwd`` before the safety guard ever ran. + # ``assemble_upload_dir`` retains its own call as defence-in-depth + # for callers that bypass ``run_packager``. + kind = "huggingface" if variant == "public" else "huggingface-instructor" + validate_upload_dir_safe(huggingface_dir, release_dir, kind=kind) + + # Resolve cover image once — the same path is used for validation + # and assembly so the two cannot disagree (Copilot review on + # PR #72). + resolved_cover = resolve_cover_image_path(cover_image, release_dir) + card = build_card( release_dir, variant=variant, @@ -764,7 +779,7 @@ def run_packager( errors: list[ValidationError] = [] errors.extend(validate_card(card)) - errors.extend(validate_cover_image(cover_image)) + errors.extend(validate_cover_image(resolved_cover)) if variant == "public": # The instructor body doesn't substitute SOURCE_TREE_BLOCK — # it's a self-contained markdown — so the substitution guard @@ -795,7 +810,7 @@ def run_packager( huggingface_dir, variant=variant, rendered_readme=rendered, - cover_image=cover_image, + cover_image=resolved_cover, ) return PackagerOutcome( diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py index c83d5c9..2de9401 100644 --- a/scripts/package_kaggle_release.py +++ b/scripts/package_kaggle_release.py @@ -72,6 +72,7 @@ load_manifest, replace_dir, replace_file, + resolve_cover_image_path, rewrite_release_links, validate_cover_image, validate_readme_substitution, @@ -791,11 +792,11 @@ def assemble_upload_dir( validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle") kaggle_dir.mkdir(parents=True, exist_ok=True) - # Cover image. - cover_src = release_dir / cover_image.name - if not cover_src.exists(): - cover_src = cover_image - replace_file(cover_src, kaggle_dir / cover_image.name) + # Cover image — copy the resolved path as-is. Path resolution + # happens once in ``run_packager`` via ``resolve_cover_image_path`` + # so the validator and the assembler agree on which file to use. + if cover_image.exists(): + replace_file(cover_image, kaggle_dir / cover_image.name) # LICENSE — straight copy, no rewriting. license_src = release_dir / "LICENSE" @@ -868,6 +869,20 @@ def run_packager( if not release_dir.exists(): raise FileNotFoundError(f"release directory not found: {release_dir}") + # Hoist the upload-dir safety check BEFORE any mkdir or write — + # including the dry-run path (Copilot review on PR #72). The + # earlier draft only checked inside ``assemble_upload_dir``, so a + # dry run with ``--kaggle-dir .`` would happily write + # ``dataset-metadata.json`` into ``cwd`` before the safety guard + # ever ran. ``assemble_upload_dir`` retains its own call as + # defence-in-depth for callers that bypass ``run_packager``. + validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle") + + # Resolve cover image once — the same path is used for validation, + # the metadata's ``image`` field, and the upload-tree copy so they + # cannot disagree (Copilot review on PR #72). + resolved_cover = resolve_cover_image_path(cover_image, release_dir) + metadata = build_metadata( release_dir, tiers=tiers, @@ -881,12 +896,12 @@ def run_packager( license_name=license_name, update_frequency=update_frequency, user_sources=user_sources, - cover_image=cover_image, + cover_image=resolved_cover, ) errors: list[ValidationError] = [] errors.extend(validate_metadata(metadata)) - errors.extend(validate_cover_image(cover_image)) + errors.extend(validate_cover_image(resolved_cover)) errors.extend(validate_readme_substitution(release_dir, packager_name="Kaggle")) metadata_path = kaggle_dir / "dataset-metadata.json" @@ -909,7 +924,7 @@ def run_packager( metadata_path.write_text(render_metadata_json(metadata), encoding="utf-8") if not dry_run: - assemble_upload_dir(release_dir, kaggle_dir, tiers=tiers, cover_image=cover_image) + assemble_upload_dir(release_dir, kaggle_dir, tiers=tiers, cover_image=resolved_cover) return PackagerOutcome( metadata=metadata, diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py index 0d0a9ea..f5bfc2a 100644 --- a/tests/scripts/test_package_hf_release.py +++ b/tests/scripts/test_package_hf_release.py @@ -442,6 +442,95 @@ def test_main_reports_missing_release_dir( assert "release directory not found" in captured.err +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_rejects_unsafe_huggingface_dir_in_dry_run(tmp_path: Path) -> None: + """Copilot review on PR #72 item #1. + + The earlier draft only checked ``--huggingface-dir`` safety + inside ``assemble_upload_dir``, which the dry-run path skips. + A user passing ``--huggingface-dir release`` (i.e. ``release_dir`` + itself) in dry-run mode would write a README into ``release/`` + before the safety net fired. + + With the hoisted check, dry-run mode also raises ``ValueError`` + BEFORE any mkdir or write. + """ + + sentinel = _RELEASE_DIR / "README.md" + pre = sentinel.read_bytes() if sentinel.exists() else None + + with pytest.raises(ValueError, match="unsafe"): + packager.run_packager( + _RELEASE_DIR, + huggingface_dir=_RELEASE_DIR, # release_dir itself — unsafe + cover_image=_COMMITTED_COVER, + dry_run=True, + ) + + # The release README must not have been clobbered by a write that + # raced past the (previously absent) safety check. + if pre is not None: + assert sentinel.read_bytes() == pre + + +def test_main_rejects_unsafe_huggingface_dir_via_cli( + tmp_path: Path, capsys: pytest.CaptureFixture[str] +) -> None: + """Same as the test above but via the ``main()`` CLI entry point. + + Confirms the CLI returns rc=2 for an unsafe ``--huggingface-dir`` + in dry-run mode (the FileNotFoundError / ValueError handler in + ``main`` catches the hoisted ValueError). + """ + + fake_release = tmp_path / "release" + fake_release.mkdir() + rc = packager.main( + [ + "--release-dir", + str(fake_release), + "--huggingface-dir", + str(fake_release), # release_dir itself + "--cover-image", + str(_COMMITTED_COVER) if _COMMITTED_COVER.exists() else str(tmp_path / "cover.png"), + "--dry-run", + ] + ) + captured = capsys.readouterr() + assert rc == 2 + assert "unsafe" in captured.err + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_resolves_cover_image_via_release_fallback(tmp_path: Path) -> None: + """Copilot review on PR #72 item #2. + + A user passing a bare basename that exists under ``release/`` but + not at cwd should succeed: ``resolve_cover_image_path`` falls back + to the release-dir sibling, and the validator + assembler both + use that resolved path. + """ + + upload_dir = tmp_path / "huggingface" + bare = Path(_COMMITTED_COVER.name) # bare basename + + # Sanity: the bare path doesn't exist at cwd (or wherever pytest + # runs from); it MUST resolve via the release-dir fallback. + assert not bare.is_absolute() + + outcome = packager.run_packager( + _RELEASE_DIR, + huggingface_dir=upload_dir, + cover_image=bare, + dry_run=True, + ) + assert outcome.errors == () + # The README mentions the cover image filename — a smoke check + # that the resolved path went through the rest of the pipeline. + written = (upload_dir / "README.md").read_text(encoding="utf-8") + assert _COMMITTED_COVER.name in written + + @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None: """Validation failure must NOT leave a corrupt README on disk diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py index 3790b5f..8140c23 100644 --- a/tests/scripts/test_package_kaggle_release.py +++ b/tests/scripts/test_package_kaggle_release.py @@ -368,6 +368,60 @@ def test_assemble_upload_dir_idempotent_against_existing_tree(tmp_path: Path) -> # --------------------------------------------------------------------------- +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_rejects_unsafe_kaggle_dir_in_dry_run(tmp_path: Path) -> None: + """Copilot review on PR #72 item #1. + + The earlier draft only checked ``--kaggle-dir`` safety inside + ``assemble_upload_dir``, which dry-run skips. A user passing + ``--kaggle-dir release`` (i.e. ``release_dir`` itself) in dry-run + would write ``dataset-metadata.json`` into ``release/`` before + the safety net fired. With the hoisted check, dry-run also + raises ``ValueError`` BEFORE any mkdir or write. + """ + + cover = tmp_path / "cover.png" + Image.new("RGB", (1280, 640), (0, 0, 0)).save(cover) + + with pytest.raises(ValueError, match="unsafe"): + packager.run_packager( + _RELEASE_DIR, + kaggle_dir=_RELEASE_DIR, # release_dir itself + cover_image=cover, + dry_run=True, + ) + + # ``release/dataset-metadata.json`` must not exist at the top + # level (it's gitignored anyway, but a stray write would still + # show up in ``git status``). + assert not (_RELEASE_DIR / "dataset-metadata.json").exists() + + +@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") +def test_run_packager_resolves_cover_image_via_release_fallback(tmp_path: Path) -> None: + """Copilot review on PR #72 item #2. + + A bare-basename ``--cover-image`` that exists under ``release/`` + must validate (via ``resolve_cover_image_path`` fallback) and + materialise into the assembled metadata's ``image`` field. + """ + + out_dir = tmp_path / "kaggle" + bare = Path(_COMMITTED_COVER.name) + assert not bare.is_absolute() + + outcome = packager.run_packager( + _RELEASE_DIR, + kaggle_dir=out_dir, + cover_image=bare, + dry_run=True, + ) + assert outcome.errors == () + # Metadata's ``image`` field carries the resolved filename. + parsed = json.loads((out_dir / "dataset-metadata.json").read_text(encoding="utf-8")) + assert parsed["image"] == _COMMITTED_COVER.name + + @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present") def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None: """A failed validation must NOT leave a corrupt metadata file on diff --git a/tests/scripts/test_release_common.py b/tests/scripts/test_release_common.py new file mode 100644 index 0000000..23bbbbf --- /dev/null +++ b/tests/scripts/test_release_common.py @@ -0,0 +1,100 @@ +"""Tests for ``scripts/_release_common.py`` shared release primitives. + +The link-rewriter, source-tree-block guard, cover-image validator, +and upload-dir safety check have packager-side coverage in +``test_package_kaggle_release.py`` and ``test_package_hf_release.py``. +This file covers shared helpers that don't have a natural packager- +side home — currently the cover-image path resolver introduced in +response to the Copilot review on PR #72. +""" + +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + +from PIL import Image + +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "_release_common.py" +_spec = importlib.util.spec_from_file_location("_release_common", _SCRIPT_PATH) +assert _spec is not None +assert _spec.loader is not None +common = importlib.util.module_from_spec(_spec) +sys.modules["_release_common"] = common +_spec.loader.exec_module(common) + + +def _make_cover(path: Path) -> None: + """Write a minimum-Kaggle-acceptable cover image at ``path``.""" + + Image.new("RGB", (1280, 640), (0, 0, 0)).save(path) + + +# --------------------------------------------------------------------------- +# resolve_cover_image_path — Copilot review on PR #72, item #2 +# --------------------------------------------------------------------------- + + +def test_resolve_cover_image_path_returns_explicit_when_it_exists(tmp_path: Path) -> None: + """An explicit path that exists is returned unchanged.""" + + release = tmp_path / "release" + release.mkdir() + explicit = tmp_path / "explicit.png" + _make_cover(explicit) + + resolved = common.resolve_cover_image_path(explicit, release) + assert resolved == explicit + + +def test_resolve_cover_image_path_falls_back_to_release_dir(tmp_path: Path) -> None: + """A bare basename that doesn't exist resolves to ``release_dir/``.""" + + release = tmp_path / "release" + release.mkdir() + cover_in_release = release / "dataset-cover-image.png" + _make_cover(cover_in_release) + + resolved = common.resolve_cover_image_path(Path("dataset-cover-image.png"), release) + assert resolved == cover_in_release + + +def test_resolve_cover_image_path_prefers_explicit_over_release_sibling( + tmp_path: Path, +) -> None: + """When BOTH the explicit path and a release-dir sibling exist, the + explicit path wins. Locks down the contract that two paths sharing + a basename do not silently shadow each other (Copilot review #2). + """ + + release = tmp_path / "release" + release.mkdir() + explicit = tmp_path / "explicit.png" + _make_cover(explicit) + decoy = release / "explicit.png" + _make_cover(decoy) + + resolved = common.resolve_cover_image_path(explicit, release) + assert resolved == explicit + assert resolved != decoy + + +def test_resolve_cover_image_path_returns_input_when_neither_exists( + tmp_path: Path, +) -> None: + """When neither location resolves, return the input unchanged so + the cover-image validator can surface a clean ``not found at `` + error (rather than this helper raising).""" + + release = tmp_path / "release" + release.mkdir() + missing = tmp_path / "nope.png" + + resolved = common.resolve_cover_image_path(missing, release) + assert resolved == missing + # Sanity: the validator's not-found path actually fires for this + # input. + errors = common.validate_cover_image(resolved) + assert errors + assert "not found" in errors[0].message