From 3acc3c20cb6c44a561c5b91096ce42af873e7994 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Wed, 6 May 2026 22:20:40 +0300
Subject: [PATCH 1/3] PR 5.2: HuggingFace release packager + load_dataset smoke
 test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add `scripts/package_hf_release.py` to generate `release/huggingface/README.md`
with G12.1-compliant YAML frontmatter (pretty_name, license, language,
task_categories, size_categories, tags, three configs with `default: true`
on intermediate per G12.2), inlining the rewritten `release/README.md`
body with HF-specific link rewrites.  `--variant=instructor` packages the
companion repo (G12.4) from `release/intermediate_instructor/` into a
separate `release/huggingface-instructor/` upload tree.  G12.3 covered
by a parametrised `load_dataset()` smoke test gated on the optional
`datasets` SDK.

Extract shared release-packaging primitives (link rewriter, dir-safety
guard, cover-image validator) into `scripts/_release_common.py`; refactor
the Kaggle packager to import them.  `release/kaggle/dataset-metadata.json`
is byte-stable across the refactor.

Delete the legacy `release/HF_DATASET_CARD.md` stub — superseded by the
generated card.  Gitignore `release/huggingface{,-instructor}/*` except
the committed README.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .agent-plan.md                               |   7 +-
 .gitignore                                   |   9 +
 release/HF_DATASET_CARD.md                   | 110 ---
 release/huggingface-instructor/README.md     | 261 ++++++
 release/huggingface/README.md                | 277 ++++++
 scripts/_release_common.py                   | 133 +++
 scripts/package_hf_release.py                | 852 +++++++++++++++++++
 scripts/package_kaggle_release.py            | 107 +--
 tests/scripts/test_package_hf_release.py     | 557 ++++++++++++
 tests/scripts/test_package_kaggle_release.py |   2 +-
 10 files changed, 2133 insertions(+), 182 deletions(-)
 delete mode 100644 release/HF_DATASET_CARD.md
 create mode 100644 release/huggingface-instructor/README.md
 create mode 100644 release/huggingface/README.md
 create mode 100644 scripts/_release_common.py
 create mode 100644 scripts/package_hf_release.py
 create mode 100644 tests/scripts/test_package_hf_release.py

diff --git a/.agent-plan.md b/.agent-plan.md
index 89a3dfe..874e7be 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -50,8 +50,11 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family
 - [x] PR 5.1: `scripts/generate_cover_image.py` (new) — deterministic Pillow + DejaVu Sans (bundled with matplotlib) renderer producing `release/dataset-cover-image.png` at 1280×640 (well above the 560×280 minimum, 2:1 aspect for Kaggle's header crop). Three-tier card design surfacing the cross-seed median conversion rate + LR AUC for each tier, pinned from `release/validation/validation_report.md`. Byte-identical re-runs guarded by `tests/scripts/test_generate_cover_image.py`.
 - [x] PR 5.1: Upload-dir assembly under `release/kaggle/` uses relative symlinks for the heavy bundle directories + cover image + LICENSE, plus a real file copy for `README.md` (rewritten on the way in so its `../` links and tree diagram render correctly on the Kaggle dataset page). `_validate_kaggle_dir_safe` refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor. `release/kaggle/*` is gitignored except for `dataset-metadata.json` itself — only the metadata is committed; the upload tree is regenerated on demand.
 - [x] PR 5.1: 19 new tests (`tests/scripts/test_package_kaggle_release.py` × 15, `tests/scripts/test_generate_cover_image.py` × 4): every Kaggle field constraint, schema field order parity for CSV + parquet, README rewriting (tree + `../` + validation report links), unsafe-kaggle-dir rejection, CLI rc=2 on missing release dir, byte-determinism (audit-artifact-sync), and committed-metadata-matches-fresh-regeneration sync check. 1194/1194 tests pass; ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape).
-- [ ] PR 5.2: `scripts/package_hf_release.py` → `release/huggingface/README.md` with YAML configs/default/pretty_name/tags
-- [ ] PR 5.2: Local `load_dataset()` smoke test; Kaggle dry-run package validation
+- [x] PR 5.2: `scripts/package_hf_release.py` (new) — Hugging Face release packager.  Reads each public tier's `manifest.json` under `release/`, emits `release/huggingface/README.md` with YAML frontmatter satisfying G12.1 (`pretty_name`, `license: mit`, `language: [en]`, `task_categories: [tabular-classification]`, `size_categories: [1K<n<10K]` per tier, sorted `tags: [b2b, crm, datasets, lead-scoring, pandas, synthetic-data, tabular]`, three `configs` blocks pointing at the parquet task splits in the assembled upload tree).  G12.2 wired in: `intermediate` is the only config with `default: true`, locked by `validate_card`.  Body inlines the rewritten `release/README.md` (PR 4.1) with HF-specific link rewrites: source-repo tree diagram → upload-tree diagram, `](../foo)` → GitHub blob URL, `](validation/validation_report.md)` → GitHub blob URL.  Hand-rolled YAML renderer (PyYAML's default flow style collapses `configs[]` onto one line, which the HF viewer rejects).  `--variant=instructor` packages the companion repo (G12.4): `release/intermediate_instructor/` flattens to `intermediate/` in the upload tree, single-config card, separate `release/huggingface-instructor/` output dir.  CLI: `--release-dir`, `--huggingface-dir`, `--variant`, `--default-config`, `--owner`, `--dataset-slug`, `--cover-image`, `--dry-run`.  Exit codes: 0 pass / 1 validation failure / 2 pre-flight error.
+- [x] PR 5.2: `scripts/_release_common.py` (new) — extracted shared release-packaging primitives so the Kaggle + HF packagers share one source of truth: `GITHUB_BLOB_BASE`, the `](../foo)` regex, the validation-report blob-URL substitution, `validate_cover_image` (dimension floor; HF reuses Kaggle's 560×280), and `validate_upload_dir_safe` (refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor).  `FieldDescriptor` / `ResourceSchema` / dtype maps are intentionally NOT extracted — HF infers schema from parquet via `load_dataset` and does not need a Frictionless declaration.  `scripts/package_kaggle_release.py` refactored to import these primitives; PR 5.1's external behaviour and committed `release/kaggle/dataset-metadata.json` are byte-stable across the refactor.
+- [x] PR 5.2: Upload-dir assembly under `release/huggingface/` and `release/huggingface-instructor/` uses real-file copies (not symlinks; mirrors PR 5.1 — the `datasets` library walks the upload tree and silently skips broken symlinks in some versions).  Cover image at `release/dataset-cover-image.png` is reused for the HF thumbnail.  `release/huggingface/*` and `release/huggingface-instructor/*` are gitignored except for `README.md` — only the dataset card is committed; the upload tree is reassembled on demand.
+- [x] PR 5.2: `release/HF_DATASET_CARD.md` (legacy single-file stub) deleted — superseded by the generated `release/huggingface/README.md`.
+- [x] PR 5.2: 21 new tests (`tests/scripts/test_package_hf_release.py`): every G12.1 YAML field, G12.2 exactly-one-default invariant, YAML round-trip via `yaml.safe_load`, tags-sorted-at-render-time, README rewriting (tree + `../` + validation report links), `HF_TREE_BLOCK_SOURCE` silent-failure guard, unsafe-dir rejection (release dir / parent / `cwd`), assembled-tree resource resolution, idempotent re-assembly, byte-determinism (audit-artifact-sync), committed-README-matches-fresh-regeneration sync check, instructor-variant packaging (G12.4: single config, flattened source dir), and a parametrised `load_dataset()` round-trip per config gated on the optional `datasets` SDK with row-count assertions (3500 / 750 / 750 train / valid / test) covering G12.3 + G12.4.  1216/1216 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape).
 
 ### Phase 6 — Notebook sequence + adversarial framing
 - [ ] `release/notebooks/{02_relational_feature_engineering,03_leakage_and_time_windows,04_lift_calibration_value_ranking}.ipynb`
diff --git a/.gitignore b/.gitignore
index 35abd8e..a991601 100644
--- a/.gitignore
+++ b/.gitignore
@@ -224,3 +224,12 @@ release/_release_quality/
 # scripts/package_kaggle_release.py from release/{intro,intermediate,advanced}/.
 release/kaggle/*
 !release/kaggle/dataset-metadata.json
+
+# Generated HuggingFace upload tree (PR 5.2) — only README.md is
+# committed; the rest is reassembled on demand via
+# scripts/package_hf_release.py.  Same pattern for the instructor
+# companion under release/huggingface-instructor/.
+release/huggingface/*
+!release/huggingface/README.md
+release/huggingface-instructor/*
+!release/huggingface-instructor/README.md
diff --git a/release/HF_DATASET_CARD.md b/release/HF_DATASET_CARD.md
deleted file mode 100644
index b70dcc9..0000000
--- a/release/HF_DATASET_CARD.md
+++ /dev/null
@@ -1,110 +0,0 @@
----
-language:
-  - en
-license: mit
-task_categories:
-  - tabular-classification
-tags:
-  - lead-scoring
-  - b2b
-  - crm
-  - synthetic
-  - relational
-  - sales
-  - funnel
-  - binary-classification
-  - reproducible
-size_categories:
-  - 1K-10K
-configs:
-  - config_name: intro
-    data_files:
-      - split: train
-        path: intro/tasks/converted_within_90_days/train.parquet
-      - split: validation
-        path: intro/tasks/converted_within_90_days/valid.parquet
-      - split: test
-        path: intro/tasks/converted_within_90_days/test.parquet
-  - config_name: intermediate
-    data_files:
-      - split: train
-        path: intermediate/tasks/converted_within_90_days/train.parquet
-      - split: validation
-        path: intermediate/tasks/converted_within_90_days/valid.parquet
-      - split: test
-        path: intermediate/tasks/converted_within_90_days/test.parquet
-  - config_name: advanced
-    data_files:
-      - split: train
-        path: advanced/tasks/converted_within_90_days/train.parquet
-      - split: validation
-        path: advanced/tasks/converted_within_90_days/valid.parquet
-      - split: test
-        path: advanced/tasks/converted_within_90_days/test.parquet
----
-
-# LeadForge: Synthetic B2B Lead Scoring Dataset
-
-A relational, reproducible, multi-difficulty lead scoring dataset generated by [leadforge](https://github.com/leadforge-dev/leadforge) -- an open-source Python framework for synthetic CRM/funnel data.
-
-## Why this dataset?
-
-1. **Relational structure.** 9 normalized tables plus ML-ready task splits. Practice feature engineering from raw tables, or grab the flat file and start modeling.
-2. **Three difficulty tiers.** Same world, different conversion rates, signal-to-noise ratios, and missingness.
-3. **Reproducible and leakage-safe.** Deterministic generation (seed 42), SHA-256 hashes. Event-aggregate features are computed over a 30-day window (`manifest.snapshot_day == 30`) while the label resolves over 90 days, so features cannot encode post-conversion events. The label-encoding `current_stage` column is stripped from public bundles via the exposure layer (`redacted_columns` in `manifest.json` lists what was removed). The deliberately included `total_touches_all` leakage trap — counting touches over the full 90-day horizon — is retained as a teaching feature.
-
-## Quick start
-
-```python
-from datasets import load_dataset
-
-# Load intermediate difficulty
-ds = load_dataset("leadforge/leadforge-b2b-lead-scoring", name="intermediate")
-train = ds["train"].to_pandas()
-valid = ds["validation"].to_pandas()  # Note: file is valid.parquet, split name is "validation"
-test = ds["test"].to_pandas()
-```
-
-Or use the flat CSV:
-
-```python
-import pandas as pd
-df = pd.read_csv("hf://datasets/leadforge/leadforge-b2b-lead-scoring/intermediate/lead_scoring.csv")
-```
-
-## Dataset summary
-
-| | Intro | Intermediate | Advanced |
-|---|---|---|---|
-| Leads | 5,000 | 5,000 | 5,000 |
-| Features | 30 + 1 trap (+ 1 target) | 30 + 1 trap (+ 1 target) | 30 + 1 trap (+ 1 target) |
-| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |
-| Conversion rate | 41.5% | 20.1% | 7.9% |
-| Signal strength | 0.90 | 0.70 | 0.50 |
-| Noise scale | 0.10 | 0.30 | 0.55 |
-| Missing rate | 2% | 8% | 18% |
-
-## The scenario
-
-**Veridian Technologies** sells cloud procurement automation to mid-market firms (200-2,000 employees). Sales channels: inbound (45%), SDR outbound (35%), partner referrals (20%). Four buyer personas. **Task:** predict conversion within 90 days.
-
-## Relational tables
-
-Each difficulty tier includes 9 Parquet tables under `tables/`: accounts, contacts, leads, touches, sessions, sales_activities, opportunities, customers, subscriptions. These form a normalized CRM schema linked by foreign keys.
-
-## Leakage handling (bundle schema v4)
-
-Two complementary mechanisms keep the published feature set leakage-safe:
-
-- **Windowed snapshot.** Every event-aggregate feature (`touch_count`, `session_count`, `pricing_page_views`, `expected_acv`, `days_since_last_touch`, ...) is computed over the first 30 days only (`manifest.snapshot_day == 30`). The label resolves over the full 90 days (`manifest.label_window_days == 90`). So features cannot accidentally encode events that occur after a lead converts.
-- **Column redaction.** `current_stage` (label-encoding at the 90-day horizon) and `is_sql` (P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across tiers — near-deterministic for non-conversion) are stripped from `student_public` bundles entirely, including from `tables/leads.parquet`. Both are still available in `intermediate_instructor/`. `manifest.redacted_columns` lists exactly what was removed.
-- **Removed entirely:** `is_mql` (constant `True` in the simulator — zero variance, no information).
-- **Deliberately retained as a pedagogical trap:** `total_touches_all` counts touches over the full 90-day horizon — the only feature that crosses the snapshot window. Flagged `leakage_risk=True` in `feature_dictionary.csv`. Use it as an exercise: train with and without, compare AUC, explain the gap.
-
-## Research companion
-
-`intermediate_instructor/` includes the full causal structure: world graph (DAG), latent trait registry, and mechanism assignments.
-
-## Provenance
-
-Generated by [leadforge](https://github.com/leadforge-dev/leadforge) v1.0.0, recipe `b2b_saas_procurement_v1`, seed 42. MIT license. See `manifest.json` in each bundle for SHA-256 hashes.
diff --git a/release/huggingface-instructor/README.md b/release/huggingface-instructor/README.md
new file mode 100644
index 0000000..3fadd54
--- /dev/null
+++ b/release/huggingface-instructor/README.md
@@ -0,0 +1,261 @@
+---
+pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion"
+license: mit
+language:
+  - en
+task_categories:
+  - tabular-classification
+size_categories:
+  - 1K<n<10K
+tags:
+  - b2b
+  - crm
+  - datasets
+  - lead-scoring
+  - pandas
+  - synthetic-data
+  - tabular
+configs:
+  - config_name: intermediate
+    default: true
+    data_files:
+      - split: train
+        path: intermediate/tasks/converted_within_90_days/train.parquet
+      - split: validation
+        path: intermediate/tasks/converted_within_90_days/valid.parquet
+      - split: test
+        path: intermediate/tasks/converted_within_90_days/test.parquet
+---
+
+# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)
+
+A relational, reproducible, three-tier synthetic CRM dataset family for
+teaching lead scoring at scale. Generated by
+[leadforge](https://github.com/leadforge-dev/leadforge), an
+open-source Python framework for synthetic CRM/funnel data. The
+framework version is decoupled from the dataset version: the package
+stays at `1.x`; the dataset is published under the explicit `…-v1`
+tag.
+
+## Why lead scoring matters in 2024–2026
+
+Mid-market SaaS vendors entered 2024–2026 with growth slowing and
+customer-acquisition costs rising[^macro], so predicting *which* leads
+convert within a fixed window has moved from a marketing nicety to a
+survival skill. This dataset teaches that skill on a relational
+substrate, with the realistic confusions (snapshot-window discipline,
+leakage traps, channel signal weaker than vendor blogs imply) that
+students will hit when they finally get hands on real CRM data.
+
+[^macro]: Macroeconomic framing summarised in
+[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)
+(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio
+rose materially in 2024).
+
+## What's inside
+
+```
+.
+├── intermediate/                     # research_instructor companion: full-horizon
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── tables/*.parquet              # 9 full-horizon tables (incl. customers, subscriptions)
+│   ├── tasks/converted_within_90_days/{train,valid,test}.parquet
+│   └── metadata/                     # world_spec, graph.{graphml,json}, latent_registry, etc.
+├── README.md                         # this file (HF dataset card)
+├── dataset-cover-image.png           # dataset thumbnail
+└── LICENSE
+```
+
+`student_public` bundles ship the snapshot-safe relational view;
+`research_instructor` companions ship the full-horizon view plus the
+hidden causal structure (DAG, latent registry, mechanism summary)
+under `metadata/`. The full layout is documented in each bundle's
+`manifest.json`.
+
+## Quick start
+
+```python
+# Flat CSV
+df = pd.read_csv("intermediate/lead_scoring.csv")
+
+# Parquet task splits (recommended)
+train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet")
+test  = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
+
+# Relational tables (feature engineering — example)
+leads   = pd.read_parquet("intermediate/tables/leads.parquet")
+touches = pd.read_parquet("intermediate/tables/touches.parquet")
+my_touch_count = (
+    touches.groupby("lead_id").size().rename("my_touch_count").reset_index()
+)
+features = leads.merge(my_touch_count, on="lead_id", how="left")
+
+# Reproduce from source
+# pip install leadforge
+# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \
+#                    --mode student_public --difficulty intermediate --out my_bundle
+```
+
+The label `converted_within_90_days` resolves over a 90-day window;
+engagement features (`touch_count`, `session_count`, etc.) are
+computed strictly over events on days `[0, 30]`. The deliberate
+exception is `total_touches_all`, the leakage trap — flagged
+`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your
+feature set unless you're demonstrating leakage detection.
+
+## Dataset summary
+
+| | Intro | Intermediate | Advanced |
+|---|---|---|---|
+| Leads | 5,000 | 5,000 | 5,000 |
+| Accounts | 1,500 | 1,500 | 1,500 |
+| Contacts | 4,200 | 4,200 | 4,200 |
+| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* |
+| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |
+| Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% |
+| Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% |
+| Signal strength | 0.90 | 0.70 | 0.50 |
+| Noise scale | 0.10 | 0.30 | 0.55 |
+| Missing rate | 2% | 8% | 18% |
+
+\* `student_public` / `research_instructor`. Difficulty is modulated
+by the simulation engine — signal strength on latent-trait weights,
+Gaussian noise on float features, MCAR missingness, outlier rate —
+not post-hoc label flipping.
+
+## The scenario
+
+**Veridian Technologies** is a fictional Series B startup (Austin, US)
+selling **Veridian Procure**, a procurement / AP automation SaaS, to
+mid-market firms (200–2,000 employees) in the US and UK. The funnel
+runs through inbound marketing (45%), SDR outbound (35%), and
+partner referrals (20%); four personas drive deals (VP Finance, AP
+Manager, IT Director, Procurement Manager). **Task:** predict whether
+a lead converts (`closed_won`) within 90 days. ACV bands are
+$18k–$120k. See
+[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)
+for the full DGP, and the deeper "what's modelled / approximate / not
+modelled" breakdown that this README only summarises.
+
+## Public vs instructor: what's redacted
+
+Filtering happens **during rendering**, not during simulation. The
+redaction contract is single-sourced in
+[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);
+the snapshot-safe writer and the validator import the same constants,
+so they cannot drift apart.
+
+| Source-of-truth constant | Public bundle treatment |
+|---|---|
+| `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` |
+| `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` |
+| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles |
+| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |
+| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |
+| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |
+
+Each bundle's `manifest.json` records `relational_snapshot_safe`,
+`redacted_columns`, and `snapshot_day`, so the bundle is
+self-describing.
+
+## Calibration
+
+Every realism / calibration / difficulty claim in this README is
+backed by
+[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),
+regenerated by
+[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)
+with bands declared in
+[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).
+Headline cross-seed medians (seeds 42–46):
+
+| Tier | LR AUC | AP | P@100 | Brier |
+|---|---|---|---|---|
+| intro | 0.879 | 0.761 | 0.80 | 0.130 |
+| intermediate | 0.886 | 0.575 | 0.59 | 0.110 |
+| advanced | 0.886 | 0.351 | 0.34 | 0.061 |
+
+AP, P@100, conversion-rate, and lift orderings hold across the
+intended difficulty axis (intro > intermediate > advanced).
+
+## Intended uses
+
+- Teaching baseline lead-scoring on a flat snapshot.
+- Teaching relational feature engineering against snapshot-safe tables.
+- Teaching leakage detection (the `total_touches_all` trap is
+  designed to be discoverable).
+- Teaching calibration, lift, P@K, value-aware ranking
+  (`expected_acv × P(convert)`), and cohort-shift evaluation.
+- Comparing model families under a controlled DGP.
+
+## Out-of-scope uses
+
+- **Production lead scoring.** The company, product, and customers are
+  fictional.
+- **Vendor benchmarking / paper baselines.** Difficulty tiers are
+  calibrated for pedagogy, not cross-paper comparability.
+- **Causal-inference research that requires recovery of the true DGP.**
+  The instructor companion exposes the hidden graph for teaching, not
+  designed counterfactuals.
+- **Demographic / fairness research.** v1 does not model protected
+  attributes.
+
+## Known limitations
+
+- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across
+  every tier. Difficulty is visible in AP, P@K, Brier, and value
+  capture. Treat AUC as a sanity check, not a difficulty signal.
+- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta
+  is slightly negative in every tier (intro −0.0045, intermediate
+  −0.0072, advanced −0.0133); v1's snapshot is dominated by linear
+  features. v2 will inject non-linear interactions in the simulator.
+- **Channel signal is weak.** Per
+  [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),
+  out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across
+  all tiers and the per-channel rate spread is ≤0.05. The simulator
+  does not encode channel-conditional probabilities; channel-conditional
+  encoding is post-v1 work.
+- **Cohort-shift degradation is small.** v1 has no time-of-year drift
+  baked in; the cohort-shift gate (G6.4) is informational and will
+  bite in v2.
+
+## Composition
+
+- **Entities.** Accounts, contacts, leads, touches, sessions,
+  sales_activities, opportunities (public); plus customers and
+  subscriptions (instructor only). Per-row counts per bundle live in
+  `manifest.json`.
+- **Features.** 32 public columns grouped by analytical role in
+  [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);
+  the per-bundle `feature_dictionary.csv` is the authoritative
+  machine-readable spec.
+- **Label.** `converted_within_90_days` (boolean), event-derived from
+  the simulator. Never sampled directly.
+- **Splits.** 70/15/15 train/valid/test, deterministic given seed;
+  recorded in `tasks/converted_within_90_days/task_manifest.json`.
+- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package
+  version stamped in `manifest.json`.
+
+## Maintenance, adversarial framing, license
+
+We *want* the dataset to be broken. Issue templates ship under
+`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as
+`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships,
+`docs/release/v2_decision_log.md` will track every accepted finding
+and the design call that came from it. File issues at
+[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);
+PRs welcome.
+
+| Field | Value |
+|---|---|
+| Generator | leadforge `1.0.0+` |
+| Recipe | `b2b_saas_procurement_v1` |
+| Canonical seed | 42 (cross-seed sweep: 42–46) |
+| Bundle schema version | 5 |
+| Format | Parquet (canonical) + CSV (convenience) |
+| License | MIT — see [LICENSE](LICENSE) |
+
+Verify integrity with `leadforge validate <bundle_dir>`; every file
+is hashed in `manifest.json`.
diff --git a/release/huggingface/README.md b/release/huggingface/README.md
new file mode 100644
index 0000000..cdbcf05
--- /dev/null
+++ b/release/huggingface/README.md
@@ -0,0 +1,277 @@
+---
+pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1)"
+license: mit
+language:
+  - en
+task_categories:
+  - tabular-classification
+size_categories:
+  - 1K<n<10K
+tags:
+  - b2b
+  - crm
+  - datasets
+  - lead-scoring
+  - pandas
+  - synthetic-data
+  - tabular
+configs:
+  - config_name: intro
+    data_files:
+      - split: train
+        path: intro/tasks/converted_within_90_days/train.parquet
+      - split: validation
+        path: intro/tasks/converted_within_90_days/valid.parquet
+      - split: test
+        path: intro/tasks/converted_within_90_days/test.parquet
+  - config_name: intermediate
+    default: true
+    data_files:
+      - split: train
+        path: intermediate/tasks/converted_within_90_days/train.parquet
+      - split: validation
+        path: intermediate/tasks/converted_within_90_days/valid.parquet
+      - split: test
+        path: intermediate/tasks/converted_within_90_days/test.parquet
+  - config_name: advanced
+    data_files:
+      - split: train
+        path: advanced/tasks/converted_within_90_days/train.parquet
+      - split: validation
+        path: advanced/tasks/converted_within_90_days/valid.parquet
+      - split: test
+        path: advanced/tasks/converted_within_90_days/test.parquet
+---
+
+# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)
+
+A relational, reproducible, three-tier synthetic CRM dataset family for
+teaching lead scoring at scale. Generated by
+[leadforge](https://github.com/leadforge-dev/leadforge), an
+open-source Python framework for synthetic CRM/funnel data. The
+framework version is decoupled from the dataset version: the package
+stays at `1.x`; the dataset is published under the explicit `…-v1`
+tag.
+
+## Why lead scoring matters in 2024–2026
+
+Mid-market SaaS vendors entered 2024–2026 with growth slowing and
+customer-acquisition costs rising[^macro], so predicting *which* leads
+convert within a fixed window has moved from a marketing nicety to a
+survival skill. This dataset teaches that skill on a relational
+substrate, with the realistic confusions (snapshot-window discipline,
+leakage traps, channel signal weaker than vendor blogs imply) that
+students will hit when they finally get hands on real CRM data.
+
+[^macro]: Macroeconomic framing summarised in
+[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)
+(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio
+rose materially in 2024).
+
+## What's inside
+
+```
+.
+├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
+│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
+│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── README.md                         # this file (HF dataset card)
+├── dataset-cover-image.png           # dataset thumbnail
+└── LICENSE
+```
+
+`student_public` bundles ship the snapshot-safe relational view;
+`research_instructor` companions ship the full-horizon view plus the
+hidden causal structure (DAG, latent registry, mechanism summary)
+under `metadata/`. The full layout is documented in each bundle's
+`manifest.json`.
+
+## Quick start
+
+```python
+# Flat CSV
+df = pd.read_csv("intermediate/lead_scoring.csv")
+
+# Parquet task splits (recommended)
+train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet")
+test  = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
+
+# Relational tables (feature engineering — example)
+leads   = pd.read_parquet("intermediate/tables/leads.parquet")
+touches = pd.read_parquet("intermediate/tables/touches.parquet")
+my_touch_count = (
+    touches.groupby("lead_id").size().rename("my_touch_count").reset_index()
+)
+features = leads.merge(my_touch_count, on="lead_id", how="left")
+
+# Reproduce from source
+# pip install leadforge
+# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \
+#                    --mode student_public --difficulty intermediate --out my_bundle
+```
+
+The label `converted_within_90_days` resolves over a 90-day window;
+engagement features (`touch_count`, `session_count`, etc.) are
+computed strictly over events on days `[0, 30]`. The deliberate
+exception is `total_touches_all`, the leakage trap — flagged
+`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your
+feature set unless you're demonstrating leakage detection.
+
+## Dataset summary
+
+| | Intro | Intermediate | Advanced |
+|---|---|---|---|
+| Leads | 5,000 | 5,000 | 5,000 |
+| Accounts | 1,500 | 1,500 | 1,500 |
+| Contacts | 4,200 | 4,200 | 4,200 |
+| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* |
+| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |
+| Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% |
+| Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% |
+| Signal strength | 0.90 | 0.70 | 0.50 |
+| Noise scale | 0.10 | 0.30 | 0.55 |
+| Missing rate | 2% | 8% | 18% |
+
+\* `student_public` / `research_instructor`. Difficulty is modulated
+by the simulation engine — signal strength on latent-trait weights,
+Gaussian noise on float features, MCAR missingness, outlier rate —
+not post-hoc label flipping.
+
+## The scenario
+
+**Veridian Technologies** is a fictional Series B startup (Austin, US)
+selling **Veridian Procure**, a procurement / AP automation SaaS, to
+mid-market firms (200–2,000 employees) in the US and UK. The funnel
+runs through inbound marketing (45%), SDR outbound (35%), and
+partner referrals (20%); four personas drive deals (VP Finance, AP
+Manager, IT Director, Procurement Manager). **Task:** predict whether
+a lead converts (`closed_won`) within 90 days. ACV bands are
+$18k–$120k. See
+[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)
+for the full DGP, and the deeper "what's modelled / approximate / not
+modelled" breakdown that this README only summarises.
+
+## Public vs instructor: what's redacted
+
+Filtering happens **during rendering**, not during simulation. The
+redaction contract is single-sourced in
+[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);
+the snapshot-safe writer and the validator import the same constants,
+so they cannot drift apart.
+
+| Source-of-truth constant | Public bundle treatment |
+|---|---|
+| `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` |
+| `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` |
+| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles |
+| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |
+| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |
+| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |
+
+Each bundle's `manifest.json` records `relational_snapshot_safe`,
+`redacted_columns`, and `snapshot_day`, so the bundle is
+self-describing.
+
+## Calibration
+
+Every realism / calibration / difficulty claim in this README is
+backed by
+[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),
+regenerated by
+[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)
+with bands declared in
+[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).
+Headline cross-seed medians (seeds 42–46):
+
+| Tier | LR AUC | AP | P@100 | Brier |
+|---|---|---|---|---|
+| intro | 0.879 | 0.761 | 0.80 | 0.130 |
+| intermediate | 0.886 | 0.575 | 0.59 | 0.110 |
+| advanced | 0.886 | 0.351 | 0.34 | 0.061 |
+
+AP, P@100, conversion-rate, and lift orderings hold across the
+intended difficulty axis (intro > intermediate > advanced).
+
+## Intended uses
+
+- Teaching baseline lead-scoring on a flat snapshot.
+- Teaching relational feature engineering against snapshot-safe tables.
+- Teaching leakage detection (the `total_touches_all` trap is
+  designed to be discoverable).
+- Teaching calibration, lift, P@K, value-aware ranking
+  (`expected_acv × P(convert)`), and cohort-shift evaluation.
+- Comparing model families under a controlled DGP.
+
+## Out-of-scope uses
+
+- **Production lead scoring.** The company, product, and customers are
+  fictional.
+- **Vendor benchmarking / paper baselines.** Difficulty tiers are
+  calibrated for pedagogy, not cross-paper comparability.
+- **Causal-inference research that requires recovery of the true DGP.**
+  The instructor companion exposes the hidden graph for teaching, not
+  designed counterfactuals.
+- **Demographic / fairness research.** v1 does not model protected
+  attributes.
+
+## Known limitations
+
+- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across
+  every tier. Difficulty is visible in AP, P@K, Brier, and value
+  capture. Treat AUC as a sanity check, not a difficulty signal.
+- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta
+  is slightly negative in every tier (intro −0.0045, intermediate
+  −0.0072, advanced −0.0133); v1's snapshot is dominated by linear
+  features. v2 will inject non-linear interactions in the simulator.
+- **Channel signal is weak.** Per
+  [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),
+  out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across
+  all tiers and the per-channel rate spread is ≤0.05. The simulator
+  does not encode channel-conditional probabilities; channel-conditional
+  encoding is post-v1 work.
+- **Cohort-shift degradation is small.** v1 has no time-of-year drift
+  baked in; the cohort-shift gate (G6.4) is informational and will
+  bite in v2.
+
+## Composition
+
+- **Entities.** Accounts, contacts, leads, touches, sessions,
+  sales_activities, opportunities (public); plus customers and
+  subscriptions (instructor only). Per-row counts per bundle live in
+  `manifest.json`.
+- **Features.** 32 public columns grouped by analytical role in
+  [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);
+  the per-bundle `feature_dictionary.csv` is the authoritative
+  machine-readable spec.
+- **Label.** `converted_within_90_days` (boolean), event-derived from
+  the simulator. Never sampled directly.
+- **Splits.** 70/15/15 train/valid/test, deterministic given seed;
+  recorded in `tasks/converted_within_90_days/task_manifest.json`.
+- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package
+  version stamped in `manifest.json`.
+
+## Maintenance, adversarial framing, license
+
+We *want* the dataset to be broken. Issue templates ship under
+`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as
+`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships,
+`docs/release/v2_decision_log.md` will track every accepted finding
+and the design call that came from it. File issues at
+[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);
+PRs welcome.
+
+| Field | Value |
+|---|---|
+| Generator | leadforge `1.0.0+` |
+| Recipe | `b2b_saas_procurement_v1` |
+| Canonical seed | 42 (cross-seed sweep: 42–46) |
+| Bundle schema version | 5 |
+| Format | Parquet (canonical) + CSV (convenience) |
+| License | MIT — see [LICENSE](LICENSE) |
+
+Verify integrity with `leadforge validate <bundle_dir>`; every file
+is hashed in `manifest.json`.
diff --git a/scripts/_release_common.py b/scripts/_release_common.py
new file mode 100644
index 0000000..b1b61f5
--- /dev/null
+++ b/scripts/_release_common.py
@@ -0,0 +1,133 @@
+"""Shared primitives for the Phase 5 release packagers.
+
+Both ``scripts/package_kaggle_release.py`` (PR 5.1) and
+``scripts/package_hf_release.py`` (PR 5.2) need to:
+
+* rewrite the ``](../foo)`` markdown links in ``release/README.md`` to
+  GitHub blob URLs — the README lives one level above each upload tree,
+  so the relative links break on Kaggle / HF and have to point at the
+  source repo;
+* rewrite the ``](validation/validation_report.md)`` link to a GitHub
+  blob URL — the validation report does not ship with either upload
+  tree;
+* refuse to assemble into ``cwd`` / the release dir / its parent / the
+  filesystem anchor (the assembler rmtrees children of the upload dir);
+* validate the dataset cover image's dimensions against Kaggle's floor
+  (HF reuses the same PNG for its thumbnail).
+
+Lifting the four to one module keeps the rules in one place — if Kaggle
+ever extends the dimension floor, both packagers pick it up.
+``FieldDescriptor`` / ``ResourceSchema`` / dtype maps are deliberately
+NOT extracted: HF infers schema from parquet via ``load_dataset`` and
+does not need a Frictionless-shaped declaration.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Final
+
+from PIL import Image
+
+# ---------------------------------------------------------------------------
+# README link rewriting
+# ---------------------------------------------------------------------------
+
+GITHUB_BLOB_BASE: Final[str] = "https://github.com/leadforge-dev/leadforge/blob/main"
+
+#: Inline relative link ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)``
+#: for any markdown link that escapes the bundle root.
+PARENT_RELATIVE_LINK_RE: Final[re.Pattern[str]] = re.compile(r"\]\(\.\./([^)]+)\)")
+
+#: The README points at ``validation/validation_report.md`` (a path
+#: that lives under ``release/`` but never under either upload tree).
+#: Rewrite to a GitHub blob URL so the link works on both platforms.
+VALIDATION_REPORT_LINK: Final[str] = "](validation/validation_report.md)"
+VALIDATION_REPORT_URL: Final[str] = f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)"
+
+
+def rewrite_release_links(text: str) -> str:
+    """Apply the platform-agnostic README rewrites.
+
+    Both Kaggle and HF use these. Tree-block substitution is
+    platform-specific (each upload tree looks different) and lives in
+    each packager's own module.
+    """
+
+    text = PARENT_RELATIVE_LINK_RE.sub(rf"]({GITHUB_BLOB_BASE}/\1)", text)
+    text = text.replace(VALIDATION_REPORT_LINK, VALIDATION_REPORT_URL)
+    return text
+
+
+# ---------------------------------------------------------------------------
+# Cover-image validation (G11.2; HF reuses the same asset)
+# ---------------------------------------------------------------------------
+
+#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with
+#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable
+#: dimension for thumbnails so the Kaggle floor is the binding one.
+COVER_IMAGE_MIN_WIDTH: Final[int] = 560
+COVER_IMAGE_MIN_HEIGHT: Final[int] = 280
+
+
+@dataclass(frozen=True)
+class ValidationError:
+    """One field-level validation failure."""
+
+    field: str
+    message: str
+
+
+def validate_cover_image(path: Path) -> list[ValidationError]:
+    """Validate that ``path`` exists and meets the dimension floor."""
+
+    errors: list[ValidationError] = []
+    if not path.exists():
+        errors.append(
+            ValidationError(
+                field="cover_image",
+                message=f"cover image not found at {path}",
+            )
+        )
+        return errors
+    with Image.open(path) as img:
+        width, height = img.size
+    if width < COVER_IMAGE_MIN_WIDTH or height < COVER_IMAGE_MIN_HEIGHT:
+        errors.append(
+            ValidationError(
+                field="cover_image",
+                message=(
+                    f"cover image {width}x{height} below minimum "
+                    f"{COVER_IMAGE_MIN_WIDTH}x{COVER_IMAGE_MIN_HEIGHT}"
+                ),
+            )
+        )
+    return errors
+
+
+# ---------------------------------------------------------------------------
+# Upload-dir safety
+# ---------------------------------------------------------------------------
+
+
+def validate_upload_dir_safe(upload_dir: Path, release_dir: Path, *, kind: str) -> None:
+    """Refuse to assemble into a path that aliases something dangerous.
+
+    The packagers replace children of ``upload_dir`` (rmtree + recopy)
+    so pointing it at ``cwd`` / ``release_dir`` / their parents / the
+    filesystem anchor would clobber unrelated content. ``kind`` is
+    folded into the error message so the trace says which packager
+    refused (``kaggle`` / ``huggingface`` / ``huggingface-instructor``).
+    """
+
+    resolved = upload_dir.resolve()
+    blocked = {
+        Path(resolved.anchor),
+        Path.cwd().resolve(),
+        release_dir.resolve(),
+        release_dir.resolve().parent,
+    }
+    if resolved in blocked:
+        raise ValueError(f"refusing to assemble into unsafe --{kind}-dir: {upload_dir}")
diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py
new file mode 100644
index 0000000..17a72db
--- /dev/null
+++ b/scripts/package_hf_release.py
@@ -0,0 +1,852 @@
+#!/usr/bin/env python3
+"""Package the ``leadforge-lead-scoring-v1`` family for Hugging Face.
+
+PR 5.2 — second of two PRs in Phase 5 (Platform packaging) of the v1
+release roadmap.  This script:
+
+1. Reads each public tier's ``manifest.json`` + flat CSV header under
+   ``release/`` and assembles a Hugging Face dataset card
+   (``release/huggingface/README.md``) whose YAML frontmatter satisfies
+   G12.1 of ``docs/release/v1_acceptance_gates.md``: ``pretty_name``,
+   ``license: mit``, ``language: en``, ``task_categories:
+   [tabular-classification]``, ``size_categories``, ``tags``, and one
+   ``configs`` block per public tier with ``data_files`` pointing at the
+   parquet task splits in the upload tree.  Exactly one config carries
+   ``default: true`` (G12.2).
+2. Reuses ``release/dataset-cover-image.png`` as the dataset thumbnail
+   (HF datasets accept any reasonable PNG; the Kaggle floor of
+   560×280 is the binding constraint and the validator lives in
+   ``scripts/_release_common.py`` so both packagers share it).
+3. Optionally assembles a HF-loadable upload directory under
+   ``release/huggingface/`` as real-file copies of the per-tier
+   bundles plus the rewritten README.  Same lesson as PR 5.1: don't
+   symlink heavy bundle dirs; HF's ``datasets`` library walks the
+   upload tree and silently skips broken symlinks in some versions.
+4. Supports ``--variant=instructor`` to package the
+   ``leadforge-lead-scoring-v1-instructor`` companion repo (G12.4)
+   from ``release/intermediate_instructor/`` into a separate
+   ``release/huggingface-instructor/`` tree with one config
+   (``intermediate``).  Same code path; different defaults.
+
+The actual ``huggingface_hub.HfApi().upload_folder()`` call lives in
+PR 7.2; this script is intentionally publish-free.  ``--dry-run``
+writes the README only and skips upload-tree assembly, useful for
+README iteration.
+
+Failed validation exits with rc=1; pre-flight errors (missing release
+dir, missing tier, unsafe ``--huggingface-dir``) exit with rc=2.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import sys
+from collections.abc import Sequence
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Final
+
+# Make ``scripts/`` importable regardless of how this file is loaded.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+# ``GITHUB_BLOB_BASE`` is re-exported for tests and downstream callers
+# (mirrors the symbol surface of ``scripts/package_kaggle_release.py``).
+from _release_common import (  # noqa: E402,F401 — must follow sys.path insert
+    GITHUB_BLOB_BASE,
+    ValidationError,
+    rewrite_release_links,
+    validate_cover_image,
+    validate_upload_dir_safe,
+)
+
+# ---------------------------------------------------------------------------
+# Hugging Face dataset-card YAML schema (chatgpt v2 §20, verified from
+# https://huggingface.co/docs/hub/datasets-cards)
+# ---------------------------------------------------------------------------
+
+#: Allowed ``size_categories`` tokens (HF taxonomy).  Each public tier
+#: has 5,000 leads (3500 train / 750 valid / 750 test) so ``1K<n<10K``
+#: is the right bucket; constants are kept here so ``--variant=instructor``
+#: can pick the right bucket too if the tier sizes ever change.
+HF_SIZE_BUCKET_5K: Final[str] = "1K<n<10K"
+
+#: Public-tier configs land under ``intro``/``intermediate``/``advanced``;
+#: ``intermediate`` is the recommended entry point per G12.2 (the
+#: difficulty band sits between the two extremes).  Picking
+#: ``intermediate`` as default mirrors ``release/HF_DATASET_CARD.md``
+#: (the legacy stub PR 5.2 supersedes).
+DEFAULT_DEFAULT_CONFIG: Final[str] = "intermediate"
+
+#: Allowed HF dataset-card ``task_categories`` token for tabular
+#: binary classification.
+HF_TASK_CATEGORY: Final[str] = "tabular-classification"
+
+# ---------------------------------------------------------------------------
+# Release-specific defaults
+# ---------------------------------------------------------------------------
+
+DEFAULT_OWNER: Final[str] = "leadforge"
+DEFAULT_DATASET_SLUG: Final[str] = "leadforge-lead-scoring-v1"
+DEFAULT_INSTRUCTOR_DATASET_SLUG: Final[str] = "leadforge-lead-scoring-v1-instructor"
+
+DEFAULT_PRETTY_NAME: Final[str] = "LeadForge: Synthetic B2B Lead Scoring (v1)"
+DEFAULT_INSTRUCTOR_PRETTY_NAME: Final[str] = (
+    "LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion"
+)
+
+#: Tag list per the v1 release design (chatgpt v2 §20) — kept sorted at
+#: render time to make determinism obvious in ``git diff`` output.
+DEFAULT_TAGS: Final[tuple[str, ...]] = (
+    "b2b",
+    "crm",
+    "datasets",
+    "lead-scoring",
+    "pandas",
+    "synthetic-data",
+    "tabular",
+)
+
+DEFAULT_LICENSE: Final[str] = "mit"
+DEFAULT_LANGUAGE: Final[str] = "en"
+
+DEFAULT_PUBLIC_TIERS: Final[tuple[str, ...]] = ("intro", "intermediate", "advanced")
+DEFAULT_INSTRUCTOR_TIERS: Final[tuple[str, ...]] = ("intermediate_instructor",)
+
+#: HF expects the validation split to be called ``validation`` (HF
+#: convention; ``valid`` would still load but the viewer labels it
+#: differently).  The committed parquet is ``valid.parquet``; the
+#: ``configs[*].data_files`` mapping bridges the two.
+DEFAULT_TASK: Final[str] = "converted_within_90_days"
+HF_SPLIT_NAMES: Final[tuple[tuple[str, str], ...]] = (
+    ("train", "train"),
+    ("validation", "valid"),
+    ("test", "test"),
+)
+
+DEFAULT_RELEASE_DIR: Final[Path] = Path("release")
+DEFAULT_HUGGINGFACE_DIR: Final[Path] = Path("release/huggingface")
+DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR: Final[Path] = Path("release/huggingface-instructor")
+DEFAULT_COVER_IMAGE: Final[Path] = Path("release/dataset-cover-image.png")
+
+#: HF dataset cards do not require a cover image, but the field is
+#: harmless and keeps the published thumbnail aligned with Kaggle's.
+COVER_IMAGE_FILENAME: Final[str] = "dataset-cover-image.png"
+
+# ---------------------------------------------------------------------------
+# README rewriting — HF-specific tree block
+# ---------------------------------------------------------------------------
+
+#: Source-repo tree diagram from ``release/README.md`` — must match the
+#: PR 4.1 README byte-for-byte; the silent-failure guard
+#: (:func:`_validate_readme_substitution`) fires when the two drift
+#: apart (mirrors PR 5.1's ``KAGGLE_TREE_BLOCK`` validator).
+HF_TREE_BLOCK_SOURCE: Final[str] = """```
+release/
+├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
+│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
+│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── intermediate_instructor/          # research companion: full-horizon tables + metadata/
+├── notebooks/01_baseline_lead_scoring.ipynb
+└── validation/                       # validation_report.{json,md} + figures
+```"""
+
+HF_UPLOAD_TREE_BLOCK: Final[str] = """```
+.
+├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
+│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
+│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── README.md                         # this file (HF dataset card)
+├── dataset-cover-image.png           # dataset thumbnail
+└── LICENSE
+```"""
+
+HF_INSTRUCTOR_UPLOAD_TREE_BLOCK: Final[str] = """```
+.
+├── intermediate/                     # research_instructor companion: full-horizon
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── tables/*.parquet              # 9 full-horizon tables (incl. customers, subscriptions)
+│   ├── tasks/converted_within_90_days/{train,valid,test}.parquet
+│   └── metadata/                     # world_spec, graph.{graphml,json}, latent_registry, etc.
+├── README.md                         # this file (HF dataset card)
+├── dataset-cover-image.png           # dataset thumbnail
+└── LICENSE
+```"""
+
+
+def _hf_readme_text(readme: str, *, tree_block: str = HF_UPLOAD_TREE_BLOCK) -> str:
+    """Apply the HF-specific rewrites to a copy of the release README.
+
+    Rewrites:
+
+    1. Source-repo tree diagram → upload-tree diagram (the published
+       README should describe what the *user* sees on HF, not the
+       source repo layout).
+    2. ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` and
+       ``](validation/...)`` → blob URL — both via
+       ``rewrite_release_links`` from ``_release_common``.
+
+    The instructor variant calls this with
+    ``tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK``; otherwise the
+    public upload tree is used.
+    """
+
+    text = readme.replace(HF_TREE_BLOCK_SOURCE, tree_block)
+    text = rewrite_release_links(text)
+    return text
+
+
+def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]:
+    """Guard against silent drift between the README's tree diagram and
+    ``HF_TREE_BLOCK_SOURCE`` (mirrors PR 5.1's Kaggle-side guard).
+
+    Plain string ``replace()`` silently no-ops when the substring is
+    not found, which would publish the source-repo tree diagram on HF.
+    """
+
+    readme = release_dir / "README.md"
+    if not readme.exists():
+        return []
+    if HF_TREE_BLOCK_SOURCE not in readme.read_text(encoding="utf-8"):
+        return [
+            ValidationError(
+                field="release/README.md",
+                message=(
+                    "HF_TREE_BLOCK_SOURCE not found verbatim in release/README.md; "
+                    "the source-repo tree diagram in the README has drifted from "
+                    "the constant in scripts/package_hf_release.py — the "
+                    "HF README rewrite will silently no-op until the README and "
+                    "HF_TREE_BLOCK_SOURCE are reconciled."
+                ),
+            )
+        ]
+    return []
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses — one per top-level YAML block
+#
+# These are typed records, not invariants. Construction is unchecked;
+# callers MUST run :func:`validate_card` before relying on the metadata
+# being well-formed (mirrors the discipline in PR 5.1).
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class DataFileEntry:
+    """One ``configs[*].data_files[]`` entry."""
+
+    split: str
+    path: str
+
+
+@dataclass(frozen=True)
+class ConfigEntry:
+    """One ``configs[]`` entry inside the YAML frontmatter."""
+
+    config_name: str
+    data_files: tuple[DataFileEntry, ...]
+    default: bool = False
+
+
+@dataclass(frozen=True)
+class HuggingFaceCard:
+    """Top-level HF dataset-card payload.
+
+    ``body`` is the markdown that follows the YAML frontmatter — by
+    default the rewritten ``release/README.md`` (G12.1 expects the
+    README to BE the dataset card; the YAML lives at the top).
+    """
+
+    pretty_name: str
+    license: str
+    language: tuple[str, ...]
+    task_categories: tuple[str, ...]
+    size_categories: tuple[str, ...]
+    tags: tuple[str, ...]
+    configs: tuple[ConfigEntry, ...]
+    body: str
+
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+
+def validate_card(card: HuggingFaceCard) -> list[ValidationError]:
+    """Run every HF-side check against a built ``HuggingFaceCard``.
+
+    Mirrors ``validate_metadata`` in the Kaggle packager: collects all
+    field-level errors at once rather than fail-fasting.  G12.1 +
+    G12.2 are surfaced here; G12.3 (load_dataset round-trip) is a
+    separate test that hits the actual HF library.
+    """
+
+    errors: list[ValidationError] = []
+
+    if not card.pretty_name.strip():
+        errors.append(ValidationError(field="pretty_name", message="must be non-empty"))
+
+    if card.license != DEFAULT_LICENSE:
+        errors.append(
+            ValidationError(
+                field="license",
+                message=f"expected {DEFAULT_LICENSE!r}, got {card.license!r}",
+            )
+        )
+
+    if not card.language:
+        errors.append(ValidationError(field="language", message="must contain at least one entry"))
+
+    if HF_TASK_CATEGORY not in card.task_categories:
+        errors.append(
+            ValidationError(
+                field="task_categories",
+                message=f"must contain {HF_TASK_CATEGORY!r}",
+            )
+        )
+
+    if not card.size_categories:
+        errors.append(
+            ValidationError(field="size_categories", message="must contain at least one entry")
+        )
+
+    if not card.tags:
+        errors.append(ValidationError(field="tags", message="must contain at least one entry"))
+
+    if not card.configs:
+        errors.append(ValidationError(field="configs", message="must contain at least one config"))
+
+    # G12.2 — exactly one config has default: true.
+    defaults = [c for c in card.configs if c.default]
+    if len(defaults) != 1:
+        errors.append(
+            ValidationError(
+                field="configs",
+                message=(
+                    f"exactly one config must have default: true, got {len(defaults)} "
+                    f"({[c.config_name for c in defaults]!r})"
+                ),
+            )
+        )
+
+    # Per-config integrity: non-empty data_files, every entry has a
+    # split + a path.
+    for i, config in enumerate(card.configs):
+        if not config.config_name.strip():
+            errors.append(
+                ValidationError(field=f"configs[{i}].config_name", message="must be non-empty")
+            )
+        if not config.data_files:
+            errors.append(
+                ValidationError(
+                    field=f"configs[{i}].data_files",
+                    message="must contain at least one entry",
+                )
+            )
+            continue
+        for j, df in enumerate(config.data_files):
+            if not df.split or not df.path:
+                errors.append(
+                    ValidationError(
+                        field=f"configs[{i}].data_files[{j}]",
+                        message="each entry must declare both split and path",
+                    )
+                )
+
+    return errors
+
+
+# ---------------------------------------------------------------------------
+# Bundle reading + config building
+# ---------------------------------------------------------------------------
+
+
+def _load_manifest(path: Path) -> dict[str, Any]:
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"manifest.json at {path} is not a JSON object")
+    return payload
+
+
+def build_config_for_tier(
+    release_dir: Path,
+    *,
+    tier_dir: str,
+    config_name: str,
+    task: str = DEFAULT_TASK,
+    default: bool = False,
+) -> ConfigEntry:
+    """Build a single ``configs[]`` entry for one tier.
+
+    ``tier_dir`` is the directory name under the upload tree
+    (``intro`` / ``intermediate`` / ``advanced`` for the public
+    variant; ``intermediate`` for the instructor companion which
+    flattens ``intermediate_instructor/`` → ``intermediate/`` in the
+    upload tree).  ``config_name`` is what users pass to
+    ``load_dataset(..., name=...)``.
+
+    The function reads ``manifest.json`` to confirm the tier exists
+    and the task splits are declared; it does NOT fail when files are
+    missing on disk because the upload tree is reassembled by
+    :func:`assemble_upload_dir` and the manifest is the single source
+    of truth for what should be there.
+    """
+
+    manifest_path = release_dir / tier_dir / "manifest.json"
+    if not manifest_path.exists():
+        raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest_path}")
+    manifest = _load_manifest(manifest_path)
+    if task not in manifest.get("tasks", {}):
+        raise ValueError(
+            f"task {task!r} not declared in manifest at {manifest_path}; "
+            f"got {sorted(manifest.get('tasks', {}))!r}"
+        )
+
+    data_files = tuple(
+        DataFileEntry(
+            split=hf_split,
+            path=f"{config_name}/tasks/{task}/{file_split}.parquet",
+        )
+        for hf_split, file_split in HF_SPLIT_NAMES
+    )
+    return ConfigEntry(config_name=config_name, data_files=data_files, default=default)
+
+
+def build_card(
+    release_dir: Path,
+    *,
+    variant: str = "public",
+    pretty_name: str | None = None,
+    license_id: str = DEFAULT_LICENSE,
+    language: Sequence[str] = (DEFAULT_LANGUAGE,),
+    task_categories: Sequence[str] = (HF_TASK_CATEGORY,),
+    size_categories: Sequence[str] = (HF_SIZE_BUCKET_5K,),
+    tags: Sequence[str] = DEFAULT_TAGS,
+    default_config: str = DEFAULT_DEFAULT_CONFIG,
+    task: str = DEFAULT_TASK,
+    body: str | None = None,
+) -> HuggingFaceCard:
+    """Assemble a ``HuggingFaceCard`` from the release tree.
+
+    ``variant="public"`` builds the three-tier card pointing at
+    ``release/{intro,intermediate,advanced}/``;
+    ``variant="instructor"`` builds a single-config card pointing at
+    ``release/intermediate_instructor/`` (flattened to
+    ``intermediate/`` in the upload tree).
+
+    When ``body`` is ``None`` (the default) we lift the contents of
+    ``release/README.md`` and apply the HF-specific rewrites — HF
+    renders the body as the dataset card so a full README there is
+    more useful than a curated blurb (mirrors the Kaggle packager's
+    description handling).
+    """
+
+    if variant == "public":
+        public_tiers = (
+            ("intro", "intro"),
+            ("intermediate", "intermediate"),
+            ("advanced", "advanced"),
+        )
+        configs = tuple(
+            build_config_for_tier(
+                release_dir,
+                tier_dir=tier_dir,
+                config_name=config_name,
+                task=task,
+                default=(config_name == default_config),
+            )
+            for tier_dir, config_name in public_tiers
+        )
+        if pretty_name is None:
+            pretty_name = DEFAULT_PRETTY_NAME
+        if body is None:
+            body = _hf_readme_text(
+                (release_dir / "README.md").read_text(encoding="utf-8"),
+                tree_block=HF_UPLOAD_TREE_BLOCK,
+            )
+    elif variant == "instructor":
+        # Companion repo flattens ``intermediate_instructor`` →
+        # ``intermediate`` in the upload tree so the HF dataset slug
+        # ``leadforge-lead-scoring-v1-instructor`` exposes the
+        # familiar config name.
+        configs = (
+            build_config_for_tier(
+                release_dir,
+                tier_dir="intermediate_instructor",
+                config_name="intermediate",
+                task=task,
+                default=True,
+            ),
+        )
+        if pretty_name is None:
+            pretty_name = DEFAULT_INSTRUCTOR_PRETTY_NAME
+        if body is None:
+            body = _hf_readme_text(
+                (release_dir / "README.md").read_text(encoding="utf-8"),
+                tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK,
+            )
+    else:
+        raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')")
+
+    return HuggingFaceCard(
+        pretty_name=pretty_name,
+        license=license_id,
+        language=tuple(language),
+        task_categories=tuple(task_categories),
+        size_categories=tuple(size_categories),
+        tags=tuple(tags),
+        configs=configs,
+        body=body,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Rendering
+#
+# YAML is hand-rolled rather than dumped by PyYAML because the HF
+# dataset-card YAML has a precise key order and indentation style that
+# the viewer is fussy about (and PyYAML's default flow style collapses
+# the configs list into a single line).  Hand-rolling also makes the
+# determinism contract obvious in the source.
+# ---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
+
+
+def _yaml_string(value: str) -> str:
+    """Render a YAML string scalar.
+
+    HF parses the frontmatter with PyYAML.  We use double-quoted
+    strings only when the value contains characters that would change
+    YAML semantics (``:``, leading ``- ``, leading whitespace).  Plain
+    scalars are used otherwise so the YAML stays readable.
+    """
+
+    if value == "":
+        return '""'
+    needs_quoting = any(c in value for c in ":#&*!|>'\"%@`")
+    needs_quoting = needs_quoting or value.startswith(("- ", " ", "?", "[", "{", "*", "&"))
+    needs_quoting = needs_quoting or value.endswith(" ")
+    if not needs_quoting:
+        return value
+    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
+    return f'"{escaped}"'
+
+
+def _yaml_list(values: Sequence[str], *, indent: int) -> list[str]:
+    """Render a YAML block-style list."""
+
+    pad = " " * indent
+    return [f"{pad}- {_yaml_string(v)}" for v in values]
+
+
+def render_yaml_frontmatter(card: HuggingFaceCard) -> str:
+    """Render the YAML frontmatter as a deterministic string.
+
+    Order: ``pretty_name``, ``license``, ``language``,
+    ``task_categories``, ``size_categories``, ``tags``, ``configs``.
+    Tags are sorted at render time (mirrors the keyword sort in the
+    Kaggle packager) so order on the dataclass doesn't leak into the
+    rendered file.
+    """
+
+    lines: list[str] = ["---"]
+    lines.append(f"pretty_name: {_yaml_string(card.pretty_name)}")
+    lines.append(f"license: {_yaml_string(card.license)}")
+    lines.append("language:")
+    lines.extend(_yaml_list(card.language, indent=2))
+    lines.append("task_categories:")
+    lines.extend(_yaml_list(card.task_categories, indent=2))
+    lines.append("size_categories:")
+    lines.extend(_yaml_list(card.size_categories, indent=2))
+    lines.append("tags:")
+    lines.extend(_yaml_list(sorted(card.tags), indent=2))
+    lines.append("configs:")
+    for config in card.configs:
+        lines.append(f"  - config_name: {_yaml_string(config.config_name)}")
+        if config.default:
+            lines.append("    default: true")
+        lines.append("    data_files:")
+        for df in config.data_files:
+            lines.append(f"      - split: {_yaml_string(df.split)}")
+            lines.append(f"        path: {_yaml_string(df.path)}")
+    lines.append("---")
+    return "\n".join(lines) + "\n"
+
+
+def render_card(card: HuggingFaceCard) -> str:
+    """Render the full README (YAML frontmatter + markdown body)."""
+
+    body = card.body
+    if not body.endswith("\n"):
+        body += "\n"
+    return render_yaml_frontmatter(card) + "\n" + body
+
+
+# ---------------------------------------------------------------------------
+# Upload-directory assembly
+# ---------------------------------------------------------------------------
+
+
+def _replace_file(src: Path, dst: Path) -> None:
+    """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``."""
+
+    if dst.is_symlink() or dst.is_file():
+        dst.unlink()
+    elif dst.exists() and dst.is_dir():
+        shutil.rmtree(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dst)
+
+
+def _replace_dir(src: Path, dst: Path) -> None:
+    """Copy directory ``src`` → ``dst``, replacing any existing entry."""
+
+    if dst.is_symlink() or dst.is_file():
+        dst.unlink()
+    elif dst.exists() and dst.is_dir():
+        shutil.rmtree(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copytree(src, dst)
+
+
+def assemble_upload_dir(
+    release_dir: Path,
+    upload_dir: Path,
+    *,
+    variant: str = "public",
+    cover_image: Path = DEFAULT_COVER_IMAGE,
+) -> None:
+    """Assemble ``upload_dir`` for ``huggingface_hub.upload_folder()``.
+
+    Mirrors PR 5.1's Kaggle assembler: real-file copies of the per-
+    tier bundles + cover image + LICENSE + the rewritten README.  No
+    symlinks (the ``datasets`` library walks the upload tree and silent
+    skips broken symlinks in some versions).
+
+    For ``variant="instructor"``, the source directory
+    ``intermediate_instructor/`` is flattened to ``intermediate/`` in
+    the upload tree so ``load_dataset(..., name="intermediate")``
+    works against the companion repo without naming friction.
+    """
+
+    kind = "huggingface" if variant == "public" else "huggingface-instructor"
+    validate_upload_dir_safe(upload_dir, release_dir, kind=kind)
+    upload_dir.mkdir(parents=True, exist_ok=True)
+
+    # Cover image — reuse the one ``release/`` already ships.
+    cover_src = release_dir / cover_image.name
+    if not cover_src.exists():
+        cover_src = cover_image
+    if cover_src.exists():
+        _replace_file(cover_src, upload_dir / COVER_IMAGE_FILENAME)
+
+    # LICENSE — straight copy, no rewriting.
+    license_src = release_dir / "LICENSE"
+    if license_src.exists():
+        _replace_file(license_src, upload_dir / "LICENSE")
+
+    # Per-tier bundles — full directory copies.  The instructor variant
+    # flattens its source dir name.
+    if variant == "public":
+        for tier in DEFAULT_PUBLIC_TIERS:
+            _replace_dir(release_dir / tier, upload_dir / tier)
+    elif variant == "instructor":
+        _replace_dir(
+            release_dir / "intermediate_instructor",
+            upload_dir / "intermediate",
+        )
+    else:
+        raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')")
+
+
+# ---------------------------------------------------------------------------
+# Driver
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class PackagerOutcome:
+    """Return value from :func:`run_packager` — used by tests + CLI."""
+
+    card: HuggingFaceCard
+    readme_path: Path
+    errors: tuple[ValidationError, ...]
+    assembled: bool
+
+
+def run_packager(
+    release_dir: Path,
+    *,
+    huggingface_dir: Path = DEFAULT_HUGGINGFACE_DIR,
+    variant: str = "public",
+    pretty_name: str | None = None,
+    default_config: str = DEFAULT_DEFAULT_CONFIG,
+    task: str = DEFAULT_TASK,
+    cover_image: Path = DEFAULT_COVER_IMAGE,
+    dry_run: bool = False,
+) -> PackagerOutcome:
+    """Build, validate, and write the HF dataset card.
+
+    With ``dry_run=False`` (the default) the packager additionally
+    assembles the HF-loadable upload directory under
+    ``huggingface_dir`` (real-file copies of the per-tier bundles +
+    cover image + LICENSE).  ``dry_run=True`` skips the assembly step
+    entirely — useful for fast README iteration.
+    """
+
+    if not release_dir.exists():
+        raise FileNotFoundError(f"release directory not found: {release_dir}")
+
+    card = build_card(
+        release_dir,
+        variant=variant,
+        pretty_name=pretty_name,
+        default_config=default_config,
+        task=task,
+    )
+
+    errors: list[ValidationError] = []
+    errors.extend(validate_card(card))
+    errors.extend(validate_cover_image(cover_image))
+    errors.extend(_validate_readme_substitution(release_dir))
+
+    readme_path = huggingface_dir / "README.md"
+    readme_path.parent.mkdir(parents=True, exist_ok=True)
+    readme_path.write_text(render_card(card), encoding="utf-8")
+
+    if not dry_run:
+        assemble_upload_dir(
+            release_dir,
+            huggingface_dir,
+            variant=variant,
+            cover_image=cover_image,
+        )
+
+    return PackagerOutcome(
+        card=card,
+        readme_path=readme_path,
+        errors=tuple(errors),
+        assembled=not dry_run,
+    )
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def _parse_args(argv: Sequence[str] | None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate and validate Hugging Face README.md for leadforge-lead-scoring-v1.",
+    )
+    parser.add_argument(
+        "--release-dir",
+        type=Path,
+        default=DEFAULT_RELEASE_DIR,
+        help="release bundle root containing one subdirectory per tier (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--huggingface-dir",
+        type=Path,
+        default=None,
+        help=(
+            "output directory for README.md + assembled upload tree "
+            "(default: release/huggingface for variant=public, "
+            "release/huggingface-instructor for variant=instructor)"
+        ),
+    )
+    parser.add_argument(
+        "--variant",
+        choices=("public", "instructor"),
+        default="public",
+        help="public (3-tier) or instructor (companion repo); default: %(default)s",
+    )
+    parser.add_argument(
+        "--default-config",
+        default=DEFAULT_DEFAULT_CONFIG,
+        help=(
+            "config that carries default: true (G12.2; "
+            "default: %(default)s for public; ignored for instructor)"
+        ),
+    )
+    parser.add_argument(
+        "--owner",
+        default=DEFAULT_OWNER,
+        help=(
+            "HF dataset owner — currently informational; PR 7.2 will "
+            "consume it via ``huggingface_hub.HfApi().upload_folder``. "
+            "default: %(default)s"
+        ),
+    )
+    parser.add_argument(
+        "--dataset-slug",
+        default=None,
+        help=(
+            "HF dataset slug — currently informational; PR 7.2 will "
+            "consume it. defaults: leadforge-lead-scoring-v1 (public) / "
+            "leadforge-lead-scoring-v1-instructor (instructor)"
+        ),
+    )
+    parser.add_argument(
+        "--cover-image",
+        type=Path,
+        default=DEFAULT_COVER_IMAGE,
+        help="path to the dataset cover image (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="validate + write README only; skip assembling the upload directory",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+    args = _parse_args(argv)
+    variant: str = args.variant
+    huggingface_dir: Path = args.huggingface_dir or (
+        DEFAULT_HUGGINGFACE_DIR if variant == "public" else DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR
+    )
+
+    try:
+        outcome = run_packager(
+            args.release_dir,
+            huggingface_dir=huggingface_dir,
+            variant=variant,
+            default_config=args.default_config,
+            cover_image=args.cover_image,
+            dry_run=args.dry_run,
+        )
+    except FileNotFoundError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    except ValueError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+
+    if outcome.errors:
+        print("validation failed:", file=sys.stderr)
+        for err in outcome.errors:
+            print(f"  - {err.field}: {err.message}", file=sys.stderr)
+        return 1
+
+    print(f"wrote {outcome.readme_path}", file=sys.stderr)
+    if outcome.assembled:
+        print(f"assembled upload tree under {huggingface_dir}", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py
index 96065a5..b8221c0 100644
--- a/scripts/package_kaggle_release.py
+++ b/scripts/package_kaggle_release.py
@@ -53,7 +53,30 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-from PIL import Image
+
+# Make ``scripts/`` importable regardless of how this file is loaded
+# (CLI ``python scripts/package_kaggle_release.py``, or
+# ``importlib.util.spec_from_file_location`` from the test suite).
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+
+# Shared release-packaging primitives extracted in PR 5.2 so the HF
+# packager can reuse the link-rewriter, the dir-safety guard, and the
+# cover-image validator without duplicating logic.
+from _release_common import (  # noqa: E402 — must follow sys.path insert
+    GITHUB_BLOB_BASE,
+    ValidationError,
+    validate_cover_image,
+    validate_upload_dir_safe,
+)
+from _release_common import (
+    PARENT_RELATIVE_LINK_RE as _PARENT_RELATIVE_LINK,
+)
+from _release_common import (
+    VALIDATION_REPORT_LINK as _VALIDATION_REPORT_LINK,
+)
+from _release_common import (
+    VALIDATION_REPORT_URL as _VALIDATION_REPORT_URL,
+)
 
 # ---------------------------------------------------------------------------
 # Kaggle field constraints (chatgpt v2 §19, verified from official docs)
@@ -75,11 +98,6 @@
     "hourly",
 )
 
-#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with
-#: 2:1 header / 1:1 thumbnail crops in mind.
-COVER_IMAGE_MIN_WIDTH: Final[int] = 560
-COVER_IMAGE_MIN_HEIGHT: Final[int] = 280
-
 #: Allowed cover-image extensions per Kaggle docs.
 ALLOWED_COVER_IMAGE_SUFFIXES: Final[tuple[str, ...]] = (
     ".png",
@@ -190,8 +208,12 @@ class UserSource:
 # ---------------------------------------------------------------------------
 # Description / README rewriting
 # ---------------------------------------------------------------------------
-
-GITHUB_BLOB_BASE: Final[str] = "https://github.com/leadforge-dev/leadforge/blob/main"
+#
+# ``GITHUB_BLOB_BASE``, the ``](../foo)`` regex, and the validation-
+# report link rewrite live in ``_release_common`` (PR 5.2). Tree-block
+# substitution stays local because the upload tree differs per
+# platform (Kaggle ships ``dataset-metadata.json`` + cover image at
+# the top; HF does not).
 
 #: The "What's inside" tree diagram in ``release/README.md``.  The
 #: published README on Kaggle should describe the *upload* layout
@@ -227,19 +249,6 @@ class UserSource:
 └── LICENSE
 ```"""
 
-#: Inline relative link ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)``
-#: for any markdown link that escapes the bundle root.
-_PARENT_RELATIVE_LINK: Final[re.Pattern[str]] = re.compile(r"\]\(\.\./([^)]+)\)")
-
-#: The README points at ``validation/validation_report.md`` (a path
-#: that lives under ``release/`` but not under the Kaggle upload
-#: directory).  Rewrite to a GitHub blob URL so the link works on
-#: Kaggle.
-_VALIDATION_REPORT_LINK: Final[str] = "](validation/validation_report.md)"
-_VALIDATION_REPORT_URL: Final[str] = (
-    f"]({GITHUB_BLOB_BASE}/release/validation/validation_report.md)"
-)
-
 
 def _kaggle_readme_text(readme: str) -> str:
     """Apply the Kaggle-specific rewrites to a copy of the release README.
@@ -338,17 +347,12 @@ class DatasetMetadata:
 
 # ---------------------------------------------------------------------------
 # Validation
+#
+# ``ValidationError`` is imported from ``_release_common`` (PR 5.2);
+# the dataclass is identical to what lived here in PR 5.1.
 # ---------------------------------------------------------------------------
 
 
-@dataclass(frozen=True)
-class ValidationError:
-    """One field-level validation failure."""
-
-    field: str
-    message: str
-
-
 def _validate_length(name: str, value: str, lo: int, hi: int) -> ValidationError | None:
     n = len(value)
     if n < lo or n > hi:
@@ -470,33 +474,6 @@ def validate_metadata(metadata: DatasetMetadata) -> list[ValidationError]:
     return errors
 
 
-def validate_cover_image(path: Path) -> list[ValidationError]:
-    """Validate that ``path`` exists and meets Kaggle's dimension floor."""
-
-    errors: list[ValidationError] = []
-    if not path.exists():
-        errors.append(
-            ValidationError(
-                field="cover_image",
-                message=f"cover image not found at {path}",
-            )
-        )
-        return errors
-    with Image.open(path) as img:
-        width, height = img.size
-    if width < COVER_IMAGE_MIN_WIDTH or height < COVER_IMAGE_MIN_HEIGHT:
-        errors.append(
-            ValidationError(
-                field="cover_image",
-                message=(
-                    f"cover image {width}x{height} below Kaggle minimum "
-                    f"{COVER_IMAGE_MIN_WIDTH}x{COVER_IMAGE_MIN_HEIGHT}"
-                ),
-            )
-        )
-    return errors
-
-
 # ---------------------------------------------------------------------------
 # Bundle reading + resource building
 # ---------------------------------------------------------------------------
@@ -811,21 +788,13 @@ def render_metadata_json(metadata: DatasetMetadata) -> str:
 def _validate_kaggle_dir_safe(kaggle_dir: Path, release_dir: Path) -> None:
     """Refuse to assemble into a path that aliases something dangerous.
 
-    The packager replaces children of ``kaggle_dir`` (rmtree + recopy)
-    so pointing it at ``cwd`` / ``release_dir`` / their parents / the
-    filesystem anchor would clobber unrelated content.  This guard
-    fires before any disk write.
+    Thin wrapper that forwards to ``_release_common.validate_upload_dir_safe``
+    with ``kind="kaggle"`` so the error message identifies which packager
+    refused.  The standalone function is kept for the test in
+    ``tests/scripts/test_package_kaggle_release.py`` that imports it directly.
     """
 
-    resolved = kaggle_dir.resolve()
-    blocked = {
-        Path(resolved.anchor),
-        Path.cwd().resolve(),
-        release_dir.resolve(),
-        release_dir.resolve().parent,
-    }
-    if resolved in blocked:
-        raise ValueError(f"refusing to assemble into unsafe --kaggle-dir: {kaggle_dir}")
+    validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle")
 
 
 def _replace_file(src: Path, dst: Path) -> None:
diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py
new file mode 100644
index 0000000..1328c8d
--- /dev/null
+++ b/tests/scripts/test_package_hf_release.py
@@ -0,0 +1,557 @@
+"""Tests for ``scripts/package_hf_release.py``.
+
+Locks the Phase 5 Hugging Face packaging contract:
+
+* every YAML field surfaced in chatgpt v2 §20 + G12.1 (pretty_name,
+  license, language, task_categories, size_categories, tags, configs
+  with data_files)
+* G12.2 — exactly one config has ``default: true``
+* G12.3 — local ``load_dataset()`` succeeds for every config and
+  returns the expected row counts (gated on the optional ``datasets``
+  package; ``pytest.importorskip`` if absent)
+* G12.4 — instructor companion variant packages independently and
+  loads via ``load_dataset()``
+* the README link-rewriting that lets the published dataset card on
+  HF keep working ``../`` links (rewritten to GitHub blob URLs) and a
+  directory diagram that reflects the upload layout, plus a guard
+  that the source ``HF_TREE_BLOCK_SOURCE`` is still present verbatim
+  in the README (silent-failure trap; mirrors PR 5.1's KAGGLE block)
+* the assembled upload tree resolves every declared resource path
+* the safety net that refuses to assemble into ``cwd`` /
+  ``release_dir`` / its parent
+* byte-equality between the committed
+  ``release/huggingface/README.md`` and a fresh regeneration
+  (audit-artifact-sync pattern from PR 4.1 / PR 5.1)
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+import yaml
+
+_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "package_hf_release.py"
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_spec = importlib.util.spec_from_file_location("package_hf_release", _SCRIPT_PATH)
+assert _spec is not None
+assert _spec.loader is not None
+packager = importlib.util.module_from_spec(_spec)
+sys.modules["package_hf_release"] = packager
+_spec.loader.exec_module(packager)
+
+
+_RELEASE_DIR = _REPO_ROOT / "release"
+_RELEASE_BUNDLES_PRESENT = (_RELEASE_DIR / "intro" / "manifest.json").exists()
+_INSTRUCTOR_BUNDLE_PRESENT = (_RELEASE_DIR / "intermediate_instructor" / "manifest.json").exists()
+_COMMITTED_README = _REPO_ROOT / "release" / "huggingface" / "README.md"
+_COMMITTED_COVER = _REPO_ROOT / "release" / "dataset-cover-image.png"
+
+
+# Canonical task-split row counts per public tier (3500 / 750 / 750)
+# — pinned so the load_dataset() smoke test fails loud rather than
+# silently accept an empty / truncated parquet.
+_EXPECTED_ROWS = {"train": 3500, "validation": 750, "test": 750}
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _minimal_card() -> packager.HuggingFaceCard:
+    """A minimum-viable ``HuggingFaceCard`` that should validate cleanly."""
+
+    return packager.HuggingFaceCard(
+        pretty_name=packager.DEFAULT_PRETTY_NAME,
+        license=packager.DEFAULT_LICENSE,
+        language=(packager.DEFAULT_LANGUAGE,),
+        task_categories=(packager.HF_TASK_CATEGORY,),
+        size_categories=(packager.HF_SIZE_BUCKET_5K,),
+        tags=packager.DEFAULT_TAGS,
+        configs=(
+            packager.ConfigEntry(
+                config_name="intermediate",
+                data_files=(
+                    packager.DataFileEntry(
+                        split="train",
+                        path="intermediate/tasks/converted_within_90_days/train.parquet",
+                    ),
+                ),
+                default=True,
+            ),
+        ),
+        body="# LeadForge\n",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Field-constraint validation (G12.1)
+# ---------------------------------------------------------------------------
+
+
+def test_validate_card_accepts_minimal_canonical_card() -> None:
+    assert packager.validate_card(_minimal_card()) == []
+
+
+def test_validate_card_reports_every_field_violation() -> None:
+    """One bad card triggers every field check at once."""
+
+    bad = packager.HuggingFaceCard(
+        pretty_name="",
+        license="apache-2.0",  # not 'mit'
+        language=(),
+        task_categories=("image-classification",),  # missing tabular-classification
+        size_categories=(),
+        tags=(),
+        configs=(),
+        body="",
+    )
+    errors = packager.validate_card(bad)
+    fields = {e.field for e in errors}
+    assert "pretty_name" in fields
+    assert "license" in fields
+    assert "language" in fields
+    assert "task_categories" in fields
+    assert "size_categories" in fields
+    assert "tags" in fields
+    assert "configs" in fields
+
+
+def test_validate_card_requires_exactly_one_default(tmp_path: Path) -> None:
+    """G12.2 — exactly one config carries ``default: true``."""
+
+    base = _minimal_card()
+
+    # Zero defaults.
+    zero = packager.HuggingFaceCard(
+        **{
+            **base.__dict__,
+            "configs": (
+                packager.ConfigEntry(
+                    config_name="intro",
+                    data_files=(packager.DataFileEntry(split="train", path="intro/x.parquet"),),
+                    default=False,
+                ),
+            ),
+        }
+    )
+    errors = packager.validate_card(zero)
+    assert any("default: true" in e.message for e in errors)
+
+    # Two defaults.
+    two = packager.HuggingFaceCard(
+        **{
+            **base.__dict__,
+            "configs": (
+                packager.ConfigEntry(
+                    config_name="intro",
+                    data_files=(packager.DataFileEntry(split="train", path="intro/x.parquet"),),
+                    default=True,
+                ),
+                packager.ConfigEntry(
+                    config_name="intermediate",
+                    data_files=(
+                        packager.DataFileEntry(split="train", path="intermediate/x.parquet"),
+                    ),
+                    default=True,
+                ),
+            ),
+        }
+    )
+    errors = packager.validate_card(two)
+    assert any("default: true" in e.message for e in errors)
+
+
+def test_validate_card_flags_data_files_without_split_or_path() -> None:
+    """Each ``data_files[]`` entry must declare both split and path."""
+
+    base = _minimal_card()
+    broken = packager.ConfigEntry(
+        config_name="intermediate",
+        data_files=(packager.DataFileEntry(split="", path="intermediate/x.parquet"),),
+        default=True,
+    )
+    bad = packager.HuggingFaceCard(**{**base.__dict__, "configs": (broken,)})
+    errors = packager.validate_card(bad)
+    assert any("split and path" in e.message for e in errors)
+
+
+# ---------------------------------------------------------------------------
+# YAML rendering — round-trip parse + content invariants
+# ---------------------------------------------------------------------------
+
+
+def test_render_yaml_frontmatter_round_trips_through_pyyaml() -> None:
+    """The rendered YAML must parse cleanly via ``yaml.safe_load``.
+
+    HF parses dataset-card frontmatter with PyYAML; if our hand-rolled
+    renderer drifts from valid YAML, the dataset card silently drops
+    its metadata on the HF page.
+    """
+
+    card = _minimal_card()
+    yaml_text = packager.render_yaml_frontmatter(card)
+    # ``render_yaml_frontmatter`` includes the leading and trailing
+    # ``---`` markers.  Strip them before feeding to safe_load.
+    inner = yaml_text.strip().strip("-").strip()
+    parsed = yaml.safe_load(inner)
+    assert parsed["pretty_name"] == card.pretty_name
+    assert parsed["license"] == "mit"
+    assert parsed["language"] == ["en"]
+    assert parsed["task_categories"] == [packager.HF_TASK_CATEGORY]
+    assert parsed["size_categories"] == [packager.HF_SIZE_BUCKET_5K]
+    assert parsed["tags"] == sorted(card.tags)
+    assert len(parsed["configs"]) == 1
+    assert parsed["configs"][0]["default"] is True
+
+
+def test_render_yaml_tags_sorted_at_render_time() -> None:
+    """Tags are sorted in the rendered YAML regardless of dataclass order."""
+
+    base = _minimal_card()
+    shuffled = packager.HuggingFaceCard(
+        **{**base.__dict__, "tags": ("zebra", "alpha", "mango")},
+    )
+    rendered = packager.render_yaml_frontmatter(shuffled)
+    # Tag block lines, in the rendered order.
+    tag_lines = [
+        line.strip().lstrip("- ") for line in rendered.splitlines() if line.startswith("  - ")
+    ]
+    # The first three list-item lines are language, task_categories,
+    # size_categories — find tag block by position after ``tags:``.
+    lines = rendered.splitlines()
+    tags_idx = lines.index("tags:")
+    block: list[str] = []
+    for line in lines[tags_idx + 1 :]:
+        if line.startswith("  - "):
+            block.append(line[4:].strip())
+        else:
+            break
+    assert block == ["alpha", "mango", "zebra"]
+    _ = tag_lines  # silence linter
+
+
+# ---------------------------------------------------------------------------
+# README rewriting + content
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None:
+    readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8")
+    rewritten = packager._hf_readme_text(readme)
+
+    # Source-repo tree → upload tree.
+    assert "intermediate_instructor/" not in rewritten
+    assert "notebooks/01_baseline_lead_scoring.ipynb" not in rewritten
+    assert "README.md                         # this file (HF dataset card)" in rewritten
+
+    # Relative ../ links rewritten to GitHub blob URLs.
+    assert "](../" not in rewritten
+    assert packager.GITHUB_BLOB_BASE in rewritten
+
+    # The validation-report link (which lives under release/, not under
+    # the upload dir) must point at GitHub.
+    assert "](validation/validation_report.md)" not in rewritten
+    assert f"]({packager.GITHUB_BLOB_BASE}/release/validation/validation_report.md)" in rewritten
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_hf_tree_block_source_present_in_release_readme() -> None:
+    """Silent-failure guard.
+
+    ``_hf_readme_text`` substitutes ``HF_TREE_BLOCK_SOURCE`` →
+    ``HF_UPLOAD_TREE_BLOCK`` via plain string replace.  If anyone
+    tweaks the README's tree diagram by even one whitespace
+    character, the substitution silently no-ops and the published HF
+    dataset card carries the source-repo tree.  This guard fires
+    loudly the moment the constants drift apart.
+    """
+
+    readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8")
+    assert packager.HF_TREE_BLOCK_SOURCE in readme, (
+        "scripts/package_hf_release.py HF_TREE_BLOCK_SOURCE no longer matches "
+        "the tree diagram in release/README.md — reconcile the two before "
+        "the next HF README regeneration."
+    )
+
+
+def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None:
+    """``_validate_readme_substitution`` is wired into the run-time
+    validator, not just the static guard above."""
+
+    fake_release = tmp_path / "release"
+    fake_release.mkdir()
+    (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8")
+    errors = packager._validate_readme_substitution(fake_release)
+    assert errors
+    assert errors[0].field == "release/README.md"
+    assert "HF_TREE_BLOCK_SOURCE" in errors[0].message
+
+    if _RELEASE_BUNDLES_PRESENT:
+        # Sanity: the real release README does NOT trigger the validator.
+        assert packager._validate_readme_substitution(_RELEASE_DIR) == []
+
+
+# ---------------------------------------------------------------------------
+# Upload-dir assembly safety + idempotence
+# ---------------------------------------------------------------------------
+
+
+def test_assemble_upload_dir_rejects_unsafe_huggingface_dir(tmp_path: Path) -> None:
+    """Refuse to assemble into the release dir or its parent."""
+
+    fake_release = tmp_path / "release"
+    fake_release.mkdir()
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.assemble_upload_dir(fake_release, fake_release)
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.assemble_upload_dir(fake_release, fake_release.parent)
+
+
+def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Refuse to assemble into the current working directory.
+
+    A user passing ``--huggingface-dir .`` would otherwise rmtree-then-
+    recopy arbitrary cwd contents.  Same safety case as the Kaggle
+    packager (PR 5.1 fix-up).
+    """
+
+    fake_release = tmp_path / "release"
+    fake_release.mkdir()
+    cwd = tmp_path / "workdir"
+    cwd.mkdir()
+    monkeypatch.chdir(cwd)
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.assemble_upload_dir(fake_release, cwd)
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_assembled_upload_dir_resolves_every_declared_data_file(tmp_path: Path) -> None:
+    """Every ``configs[*].data_files[*].path`` declared in the YAML
+    must resolve to a real file (not a symlink, not a missing path)
+    under the assembled upload directory.  HF's ``datasets`` library
+    walks the directory at upload time; a declared path that doesn't
+    materialise is a silent load-time failure.
+    """
+
+    upload_dir = tmp_path / "huggingface"
+    outcome = packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=upload_dir,
+        cover_image=_COMMITTED_COVER,
+    )
+
+    for config in outcome.card.configs:
+        for df in config.data_files:
+            target = upload_dir / df.path
+            assert target.is_file(), f"declared data_file missing from upload tree: {df.path}"
+            assert not target.is_symlink(), (
+                f"declared data_file is a symlink, not a real file: {df.path} — "
+                f"datasets library may skip symlinked entries on upload"
+            )
+
+    # Top-level required artefacts.
+    assert (upload_dir / "README.md").is_file()
+    assert (upload_dir / packager.COVER_IMAGE_FILENAME).is_file()
+    assert (upload_dir / "LICENSE").is_file()
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_idempotent_against_existing_tree(tmp_path: Path) -> None:
+    """Re-running the assembly over an already-populated upload tree
+    succeeds — both passes call the same copy helpers."""
+
+    upload_dir = tmp_path / "huggingface"
+    packager.run_packager(_RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER)
+    outcome = packager.run_packager(
+        _RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER
+    )
+    assert outcome.errors == ()
+    for config in outcome.card.configs:
+        for df in config.data_files:
+            assert (upload_dir / df.path).is_file()
+
+
+# ---------------------------------------------------------------------------
+# CLI driver — error paths
+# ---------------------------------------------------------------------------
+
+
+def test_main_reports_missing_release_dir(
+    tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    rc = packager.main(
+        [
+            "--release-dir",
+            str(tmp_path / "missing"),
+            "--huggingface-dir",
+            str(tmp_path / "hf"),
+            "--cover-image",
+            str(tmp_path / "cover.png"),
+            "--dry-run",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert rc == 2
+    assert "release directory not found" in captured.err
+
+
+# ---------------------------------------------------------------------------
+# Determinism + sync with committed artefact
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_readme_is_byte_deterministic(tmp_path: Path) -> None:
+    """Two back-to-back runs against the committed bundles must produce
+    byte-identical README files."""
+
+    out_a = tmp_path / "a"
+    out_b = tmp_path / "b"
+    packager.run_packager(
+        _RELEASE_DIR, huggingface_dir=out_a, cover_image=_COMMITTED_COVER, dry_run=True
+    )
+    packager.run_packager(
+        _RELEASE_DIR, huggingface_dir=out_b, cover_image=_COMMITTED_COVER, dry_run=True
+    )
+    assert (out_a / "README.md").read_bytes() == (out_b / "README.md").read_bytes()
+
+
+@pytest.mark.skipif(
+    not (_RELEASE_BUNDLES_PRESENT and _COMMITTED_README.exists()),
+    reason="release bundles or committed README missing",
+)
+def test_committed_hf_readme_matches_fresh_regeneration(tmp_path: Path) -> None:
+    """A fresh HF README regeneration must match the committed
+    ``release/huggingface/README.md`` byte-for-byte AND have a
+    non-degenerate body.
+
+    If this fails, ``release/`` drifted without re-running
+    ``scripts/package_hf_release.py``.  Regenerate via that script
+    from the repo root and commit the new README alongside the bundle
+    change.
+    """
+
+    fresh_dir = tmp_path / "huggingface"
+    packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=fresh_dir,
+        cover_image=_COMMITTED_COVER,
+        dry_run=True,
+    )
+    fresh_bytes = (fresh_dir / "README.md").read_bytes()
+    committed_bytes = _COMMITTED_README.read_bytes()
+    assert fresh_bytes == committed_bytes
+
+    # Positive content assertions — guard against the failure mode
+    # where a code change accidentally produces empty / minimal
+    # content that we then re-commit.
+    text = fresh_bytes.decode("utf-8")
+    # YAML frontmatter parses.
+    _, fm_text, body = text.split("---", 2)
+    fm = yaml.safe_load(fm_text)
+    assert fm["license"] == "mit"
+    assert fm["language"] == ["en"]
+    assert fm["task_categories"] == ["tabular-classification"]
+    assert sorted(fm["tags"]) == fm["tags"]  # rendered sorted
+    assert len(fm["configs"]) == 3
+    defaults = [c for c in fm["configs"] if c.get("default")]
+    assert len(defaults) == 1
+    assert defaults[0]["config_name"] == "intermediate"
+    # Body inherited the rewritten release card content.
+    assert "What's inside" in body
+    assert "Why lead scoring matters" in body
+    assert "Known limitations" in body
+    assert "intermediate_instructor/" not in body  # tree-diagram rewrite fired
+    assert "](../" not in body  # parent-link rewrite fired
+    assert packager.GITHUB_BLOB_BASE in body
+
+
+# ---------------------------------------------------------------------------
+# Instructor companion (G12.4)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _INSTRUCTOR_BUNDLE_PRESENT, reason="instructor bundle not present")
+def test_run_packager_instructor_variant_packages_independently(tmp_path: Path) -> None:
+    """G12.4 — the instructor variant builds a separate upload tree
+    pointing at ``release/intermediate_instructor/``, with one config
+    (``intermediate``) and ``default: true`` set."""
+
+    upload_dir = tmp_path / "huggingface-instructor"
+    outcome = packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=upload_dir,
+        variant="instructor",
+        cover_image=_COMMITTED_COVER,
+    )
+    assert outcome.errors == ()
+    assert len(outcome.card.configs) == 1
+    only = outcome.card.configs[0]
+    assert only.config_name == "intermediate"
+    assert only.default is True
+    # The intermediate_instructor source dir is flattened to
+    # ``intermediate/`` in the upload tree.
+    for df in only.data_files:
+        assert (upload_dir / df.path).is_file()
+    # Instructor body uses the instructor tree block.
+    body = outcome.card.body
+    assert "metadata/" in body  # instructor metadata sibling visible
+    assert "9 full-horizon tables" in body
+
+
+# ---------------------------------------------------------------------------
+# load_dataset() round-trip (G12.3 + G12.4)
+#
+# Gated on the optional ``datasets`` library — the leadforge dev
+# install does not pull it in (the transitive dependency tail is
+# heavy).  When present, this test exercises the actual HF loader
+# against the assembled upload tree per config and asserts the row
+# counts match the manifest.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+@pytest.mark.parametrize("config_name", ["intro", "intermediate", "advanced"])
+def test_load_dataset_round_trip_per_config(tmp_path: Path, config_name: str) -> None:
+    """G12.3 — ``load_dataset(local_path, name=<config>)`` succeeds for
+    every config and returns the canonical row counts."""
+
+    datasets = pytest.importorskip("datasets", reason="datasets SDK not installed")
+
+    upload_dir = tmp_path / "huggingface"
+    packager.run_packager(_RELEASE_DIR, huggingface_dir=upload_dir, cover_image=_COMMITTED_COVER)
+
+    ds = datasets.load_dataset(str(upload_dir), name=config_name)
+    for hf_split, expected_rows in _EXPECTED_ROWS.items():
+        assert hf_split in ds, f"split {hf_split!r} missing from {config_name!r}"
+        assert len(ds[hf_split]) == expected_rows, (
+            f"{config_name}/{hf_split}: expected {expected_rows} rows, got {len(ds[hf_split])}"
+        )
+
+
+@pytest.mark.skipif(not _INSTRUCTOR_BUNDLE_PRESENT, reason="instructor bundle not present")
+def test_load_dataset_round_trip_instructor(tmp_path: Path) -> None:
+    """G12.4 — instructor companion loads via ``load_dataset()`` for
+    its single config (``intermediate``)."""
+
+    datasets = pytest.importorskip("datasets", reason="datasets SDK not installed")
+
+    upload_dir = tmp_path / "huggingface-instructor"
+    packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=upload_dir,
+        variant="instructor",
+        cover_image=_COMMITTED_COVER,
+    )
+
+    ds = datasets.load_dataset(str(upload_dir), name="intermediate")
+    for hf_split, expected_rows in _EXPECTED_ROWS.items():
+        assert hf_split in ds
+        assert len(ds[hf_split]) == expected_rows
diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py
index 276d01e..351950b 100644
--- a/tests/scripts/test_package_kaggle_release.py
+++ b/tests/scripts/test_package_kaggle_release.py
@@ -175,7 +175,7 @@ def test_validate_cover_image_rejects_too_small_image(tmp_path: Path) -> None:
     errors = packager.validate_cover_image(tiny)
     assert errors
     assert errors[0].field == "cover_image"
-    assert "below Kaggle minimum" in errors[0].message
+    assert "below minimum" in errors[0].message
 
 
 def test_validate_cover_image_reports_missing_file(tmp_path: Path) -> None:

From f2fc4a2861e9db799decdb58ad0b8a37b6fb9131 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Wed, 6 May 2026 23:37:18 +0300
Subject: [PATCH 2/3] PR 5.2 self-review fixes (Kaggle + HF packagers)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fold the brutal self-review's findings back into the PR before review.

Bugs:
- (#1) run_packager validate→write order — both packagers wrote
  README/metadata on validation failure, leaving corrupt artifacts on
  disk that would silently get committed.  Gated on `errors == ()`;
  added no-write tests for both packagers.
- (#2) Instructor README inlined the public 3-tier README into a
  1-tier dataset card.  Replaced with a dedicated `INSTRUCTOR_BODY`
  constant that links to the public dataset and describes only the
  instructor-specific additions (full-horizon tables, hidden DAG,
  latent registry, mechanism summary).
- (#3) validate_upload_dir_safe also blocks strict descendants of
  release_dir; `--huggingface-dir release/intro` would otherwise
  rmtree the intro bundle.

Architecture:
- (#5) Finished shared-primitives extraction: SOURCE_TREE_BLOCK,
  validate_readme_substitution, replace_file, replace_dir,
  load_manifest now live in scripts/_release_common.py.  Both
  packagers reduced to imports.
- (#6) Replaced 60-line hand-rolled YAML renderer with yaml.safe_dump
  + a 4-line _IndentedDumper subclass.
- (#7) Removed dead --owner / --dataset-slug CLI flags.
- (#8) assemble_upload_dir now takes rendered_readme and writes it.
- (#9) build_config_for_tier made pure (no I/O); cheap manifest-stat
  preflight via _assert_tier_dir_exists.
- (#10) --default-config with --variant=instructor errors loudly.

CI:
- (#4) Added [publish] extra (datasets>=2.14, kaggle>=1.6) so the
  gated G12.3 / G12.4 / G11.3 tests install in one line.

Cleanups: visual cruft (#13–#16), test cruft (#17 — unused tmp_path,
dead tag_lines), em-dash YAML round-trip parametrised for the
instructor pretty_name.

Verification: 1223 tests pass + 5 gated skips; ruff + mypy clean;
hash determinism PASS 67/67; leakage probes 0/3 reconstruct on every
tier; validate_release_candidate --no-rebuild exits 0.
release/{kaggle,huggingface,huggingface-instructor}/dataset-metadata
.json|README.md regenerated; audit-artifact-sync tests guard them.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .agent-plan.md                               |   2 +-
 pyproject.toml                               |   9 +
 release/huggingface-instructor/README.md     | 283 +++------
 release/huggingface/README.md                |   2 +-
 scripts/_release_common.py                   | 175 +++++-
 scripts/package_hf_release.py                | 603 ++++++++++---------
 scripts/package_kaggle_release.py            | 180 ++----
 tests/scripts/test_package_hf_release.py     | 211 +++++--
 tests/scripts/test_package_kaggle_release.py |  43 +-
 9 files changed, 847 insertions(+), 661 deletions(-)

diff --git a/.agent-plan.md b/.agent-plan.md
index 874e7be..6e8c359 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -54,7 +54,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family
 - [x] PR 5.2: `scripts/_release_common.py` (new) — extracted shared release-packaging primitives so the Kaggle + HF packagers share one source of truth: `GITHUB_BLOB_BASE`, the `](../foo)` regex, the validation-report blob-URL substitution, `validate_cover_image` (dimension floor; HF reuses Kaggle's 560×280), and `validate_upload_dir_safe` (refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor).  `FieldDescriptor` / `ResourceSchema` / dtype maps are intentionally NOT extracted — HF infers schema from parquet via `load_dataset` and does not need a Frictionless declaration.  `scripts/package_kaggle_release.py` refactored to import these primitives; PR 5.1's external behaviour and committed `release/kaggle/dataset-metadata.json` are byte-stable across the refactor.
 - [x] PR 5.2: Upload-dir assembly under `release/huggingface/` and `release/huggingface-instructor/` uses real-file copies (not symlinks; mirrors PR 5.1 — the `datasets` library walks the upload tree and silently skips broken symlinks in some versions).  Cover image at `release/dataset-cover-image.png` is reused for the HF thumbnail.  `release/huggingface/*` and `release/huggingface-instructor/*` are gitignored except for `README.md` — only the dataset card is committed; the upload tree is reassembled on demand.
 - [x] PR 5.2: `release/HF_DATASET_CARD.md` (legacy single-file stub) deleted — superseded by the generated `release/huggingface/README.md`.
-- [x] PR 5.2: 21 new tests (`tests/scripts/test_package_hf_release.py`): every G12.1 YAML field, G12.2 exactly-one-default invariant, YAML round-trip via `yaml.safe_load`, tags-sorted-at-render-time, README rewriting (tree + `../` + validation report links), `HF_TREE_BLOCK_SOURCE` silent-failure guard, unsafe-dir rejection (release dir / parent / `cwd`), assembled-tree resource resolution, idempotent re-assembly, byte-determinism (audit-artifact-sync), committed-README-matches-fresh-regeneration sync check, instructor-variant packaging (G12.4: single config, flattened source dir), and a parametrised `load_dataset()` round-trip per config gated on the optional `datasets` SDK with row-count assertions (3500 / 750 / 750 train / valid / test) covering G12.3 + G12.4.  1216/1216 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape).
+- [x] PR 5.2 (self-review pass): brutal review of the first revision caught real bugs the reviewer would otherwise have to call out. Fixes folded into the PR before review:  (#1) `run_packager` validate→write order — both packagers were writing the README/metadata even when validation failed; the validation gate now early-returns with `errors` populated and zero artifacts on disk; new tests exercise the no-write path on both sides.  (#2) Instructor README was inlining the public 3-tier README for a 1-tier dataset; replaced with a dedicated `INSTRUCTOR_BODY` constant that opens by linking to the public dataset, describes only the instructor-specific additions (full-horizon tables, hidden DAG, latent registry, mechanism summary), and uses the single-tier tree block.  (#3) `validate_upload_dir_safe` now also rejects strict descendants of `release_dir` (e.g. `--huggingface-dir release/intro` would otherwise rmtree the intro bundle); allow-list keeps the canonical `release/{kaggle,huggingface,huggingface-instructor}` direct-children.  (#4) `[publish]` extra in `pyproject.toml` (`datasets>=2.14`, `kaggle>=1.6`) makes the gated `load_dataset()` / Kaggle-CLI tests installable in a single command — closes the "G12.3/G12.4 untested in CI" gap to a one-line install.  (#5) Shared-primitives extraction finished: `SOURCE_TREE_BLOCK`, `validate_readme_substitution`, `replace_file`, `replace_dir`, `load_manifest` all moved to `scripts/_release_common.py`; both packagers reduced to imports.  (#6) Hand-rolled YAML renderer (60 lines + brittle quoting heuristic) replaced with `yaml.safe_dump` + a 4-line `_IndentedDumper` subclass that forces indent-2 on top-level sequences.  (#7) Dead `--owner` / `--dataset-slug` CLI flags removed (PR 7.2 will add them when actually needed).  (#8) `assemble_upload_dir` now takes `rendered_readme` as a parameter and writes it itself; the public name no longer lies about producing a complete tree.  (#9) `build_config_for_tier` made pure (no I/O); `_assert_tier_dir_exists` does the cheap manifest-stat preflight.  (#10) `--default-config` with `--variant=instructor` now errors instead of silently ignoring.  (#11) Instructor tree-diagram drops the hardcoded "9 tables" claim.  (#13–#16) Visual cleanups (duplicate divider, ruff-split imports, `COVER_IMAGE_FILENAME`-vs-`Path.name` redundancy, speculative comment about HF split rename).  (#17) Test cruft removed (unused `tmp_path`, dead `tag_lines`); em-dash YAML round-trip parametrised for the instructor `pretty_name`.  Net: 1223/1223 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape).
 
 ### Phase 6 — Notebook sequence + adversarial framing
 - [ ] `release/notebooks/{02_relational_feature_engineering,03_leakage_and_time_windows,04_lift_calibration_value_ranking}.ipynb`
diff --git a/pyproject.toml b/pyproject.toml
index 29bacba..b7f1de8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,15 @@ scripts = [
     "scikit-learn>=1.3",
     "matplotlib>=3.7",
 ]
+# Optional dependencies for the platform release packagers.  Installing
+# this extra (``pip install -e ".[publish]"``) enables the gated
+# ``load_dataset()`` / Kaggle-CLI smoke tests that verify G11.3 (Kaggle
+# package) and G12.3 / G12.4 (HF load_dataset round-trip) without
+# pulling the heavy SDKs into the default dev install.
+publish = [
+    "datasets>=2.14",
+    "kaggle>=1.6",
+]
 
 [project.scripts]
 leadforge = "leadforge.cli.main:app"
diff --git a/release/huggingface-instructor/README.md b/release/huggingface-instructor/README.md
index 3fadd54..6725379 100644
--- a/release/huggingface-instructor/README.md
+++ b/release/huggingface-instructor/README.md
@@ -1,5 +1,5 @@
 ---
-pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion"
+pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion'
 license: mit
 language:
   - en
@@ -27,32 +27,24 @@ configs:
         path: intermediate/tasks/converted_within_90_days/test.parquet
 ---
 
-# LeadForge: Synthetic B2B Lead Scoring Dataset (`leadforge-lead-scoring-v1`)
+# LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion
 
-A relational, reproducible, three-tier synthetic CRM dataset family for
-teaching lead scoring at scale. Generated by
-[leadforge](https://github.com/leadforge-dev/leadforge), an
-open-source Python framework for synthetic CRM/funnel data. The
-framework version is decoupled from the dataset version: the package
-stays at `1.x`; the dataset is published under the explicit `…-v1`
-tag.
+This is the **research / instructor companion** to the public
+[`leadforge/leadforge-lead-scoring-v1`](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1)
+dataset.  It exposes the **full-horizon** view of a single difficulty
+tier (`intermediate`) plus the **hidden causal structure** that the
+public dataset deliberately redacts: the world graph (DAG), latent
+trait registry, mechanism summary, and full-horizon relational tables
+including `customers` and `subscriptions`.
 
-## Why lead scoring matters in 2024–2026
+It exists for instructors who want to walk students through how the
+public dataset was generated, and for researchers who want to verify
+that the public redactions actually remove the leakage paths the
+dataset advertises.  **It is not a replacement for the public dataset
+in any teaching or modelling context** — students should still train
+on the public bundle.
 
-Mid-market SaaS vendors entered 2024–2026 with growth slowing and
-customer-acquisition costs rising[^macro], so predicting *which* leads
-convert within a fixed window has moved from a marketing nicety to a
-survival skill. This dataset teaches that skill on a relational
-substrate, with the realistic confusions (snapshot-window discipline,
-leakage traps, channel signal weaker than vendor blogs imply) that
-students will hit when they finally get hands on real CRM data.
-
-[^macro]: Macroeconomic framing summarised in
-[`docs/external_review/summaries/gemini_v2_summary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/external_review/summaries/gemini_v2_summary.md)
-(median public-SaaS growth 30%→25% from 2023 to 2025; New CAC Ratio
-rose materially in 2024).
-
-## What's inside
+## What this companion contains
 
 ```
 .
@@ -60,7 +52,7 @@ rose materially in 2024).
 │   ├── manifest.json                 # provenance + file hashes
 │   ├── dataset_card.md               # auto-rendered per-bundle card
 │   ├── feature_dictionary.csv        # authoritative column spec
-│   ├── tables/*.parquet              # 9 full-horizon tables (incl. customers, subscriptions)
+│   ├── tables/*.parquet              # full-horizon tables (incl. customers, subscriptions)
 │   ├── tasks/converted_within_90_days/{train,valid,test}.parquet
 │   └── metadata/                     # world_spec, graph.{graphml,json}, latent_registry, etc.
 ├── README.md                         # this file (HF dataset card)
@@ -68,183 +60,95 @@ rose materially in 2024).
 └── LICENSE
 ```
 
-`student_public` bundles ship the snapshot-safe relational view;
-`research_instructor` companions ship the full-horizon view plus the
-hidden causal structure (DAG, latent registry, mechanism summary)
-under `metadata/`. The full layout is documented in each bundle's
-`manifest.json`.
+The single ``intermediate`` config exposes the same train/valid/test
+parquet splits as the public dataset's ``intermediate`` config — same
+seeds, same row counts (3,500 / 750 / 750), same target.  The
+difference lives in the relational tables and metadata:
+
+| File | Public `intermediate` | Instructor companion |
+|---|---|---|
+| `tables/leads.parquet` | redacted (label dropped) | full (label retained) |
+| `tables/opportunities.parquet` | snapshot-filtered + redacted | full-horizon, full columns |
+| `tables/customers.parquet` | omitted (would leak label) | included |
+| `tables/subscriptions.parquet` | omitted (would leak label) | included |
+| `tables/touches.parquet` etc. | filtered to ≤ snapshot day | full 90-day horizon |
+| `metadata/world_spec.json` | absent | included (DGP + recipe) |
+| `metadata/graph.{graphml,json}` | absent | included (hidden DAG) |
+| `metadata/latent_registry.json` | absent | included (latent traits) |
+| `metadata/mechanism_summary.json` | absent | included (per-edge mechanisms) |
+
+The redaction contract is single-sourced in
+[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py)
+and re-applied by
+[`leadforge/render/relational_snapshot_safe.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/render/relational_snapshot_safe.py)
+when the public bundle is built; this companion is the unfiltered
+source view, so the two are always consistent by construction.
 
 ## Quick start
 
 ```python
-# Flat CSV
-df = pd.read_csv("intermediate/lead_scoring.csv")
-
-# Parquet task splits (recommended)
-train = pd.read_parquet("intermediate/tasks/converted_within_90_days/train.parquet")
-test  = pd.read_parquet("intermediate/tasks/converted_within_90_days/test.parquet")
+from datasets import load_dataset
 
-# Relational tables (feature engineering — example)
-leads   = pd.read_parquet("intermediate/tables/leads.parquet")
-touches = pd.read_parquet("intermediate/tables/touches.parquet")
-my_touch_count = (
-    touches.groupby("lead_id").size().rename("my_touch_count").reset_index()
+# Loads the same train/valid/test splits as the public 'intermediate'
+# config; differs only in what `tables/` and `metadata/` provide.
+ds = load_dataset(
+    "leadforge/leadforge-lead-scoring-v1-instructor",
+    name="intermediate",
+)
+train = ds["train"].to_pandas()
+
+# Full-horizon relational tables — includes customers and subscriptions
+# (omitted from the public dataset because their existence reconstructs
+# the conversion label).
+import pandas as pd
+customers = pd.read_parquet(
+    "hf://datasets/leadforge/leadforge-lead-scoring-v1-instructor/intermediate/tables/customers.parquet"
 )
-features = leads.merge(my_touch_count, on="lead_id", how="left")
-
-# Reproduce from source
-# pip install leadforge
-# leadforge generate --recipe b2b_saas_procurement_v1 --seed 42 \
-#                    --mode student_public --difficulty intermediate --out my_bundle
 ```
 
-The label `converted_within_90_days` resolves over a 90-day window;
-engagement features (`touch_count`, `session_count`, etc.) are
-computed strictly over events on days `[0, 30]`. The deliberate
-exception is `total_touches_all`, the leakage trap — flagged
-`leakage_risk=True` in `feature_dictionary.csv`. Drop it from your
-feature set unless you're demonstrating leakage detection.
-
-## Dataset summary
-
-| | Intro | Intermediate | Advanced |
-|---|---|---|---|
-| Leads | 5,000 | 5,000 | 5,000 |
-| Accounts | 1,500 | 1,500 | 1,500 |
-| Contacts | 4,200 | 4,200 | 4,200 |
-| Snapshot columns | 32 / 34* | 32 / 34* | 32 / 34* |
-| Target | `converted_within_90_days` | `converted_within_90_days` | `converted_within_90_days` |
-| Conversion rate (recipe band) | 24–61% | 12–31% | 4–12% |
-| Conversion rate (median, seeds 42–46) | 42.67% | 21.60% | 8.40% |
-| Signal strength | 0.90 | 0.70 | 0.50 |
-| Noise scale | 0.10 | 0.30 | 0.55 |
-| Missing rate | 2% | 8% | 18% |
-
-\* `student_public` / `research_instructor`. Difficulty is modulated
-by the simulation engine — signal strength on latent-trait weights,
-Gaussian noise on float features, MCAR missingness, outlier rate —
-not post-hoc label flipping.
-
-## The scenario
-
-**Veridian Technologies** is a fictional Series B startup (Austin, US)
-selling **Veridian Procure**, a procurement / AP automation SaaS, to
-mid-market firms (200–2,000 employees) in the US and UK. The funnel
-runs through inbound marketing (45%), SDR outbound (35%), and
-partner referrals (20%); four personas drive deals (VP Finance, AP
-Manager, IT Director, Procurement Manager). **Task:** predict whether
-a lead converts (`closed_won`) within 90 days. ACV bands are
-$18k–$120k. See
-[`docs/release/generation_method.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/generation_method.md)
-for the full DGP, and the deeper "what's modelled / approximate / not
-modelled" breakdown that this README only summarises.
-
-## Public vs instructor: what's redacted
-
-Filtering happens **during rendering**, not during simulation. The
-redaction contract is single-sourced in
-[`leadforge/validation/leakage_probes.py`](https://github.com/leadforge-dev/leadforge/blob/main/leadforge/validation/leakage_probes.py);
-the snapshot-safe writer and the validator import the same constants,
-so they cannot drift apart.
-
-| Source-of-truth constant | Public bundle treatment |
-|---|---|
-| `BANNED_LEAD_COLUMNS = ("converted_within_90_days", "conversion_timestamp")` | Dropped from `tables/leads.parquet` |
-| `BANNED_OPP_COLUMNS = ("close_outcome", "closed_at")` | Dropped from `tables/opportunities.parquet` |
-| `BANNED_TABLES = ("customers", "subscriptions")` | Omitted from public bundles |
-| `SNAPSHOT_FILTERED_TABLES` (touches, sessions, sales_activities, opportunities) | Filtered per-lead by `lead_created_at + snapshot_day` |
-| Snapshot redaction (`current_stage`, `is_sql`) | Stripped from `tasks/` splits and `tables/leads.parquet` |
-| `total_touches_all` (deliberate trap) | **Retained in both modes**; flagged `leakage_risk=True` |
-
-Each bundle's `manifest.json` records `relational_snapshot_safe`,
-`redacted_columns`, and `snapshot_day`, so the bundle is
-self-describing.
-
-## Calibration
-
-Every realism / calibration / difficulty claim in this README is
-backed by
-[`validation/validation_report.md`](https://github.com/leadforge-dev/leadforge/blob/main/release/validation/validation_report.md),
-regenerated by
-[`scripts/validate_release_candidate.py`](https://github.com/leadforge-dev/leadforge/blob/main/scripts/validate_release_candidate.py)
-with bands declared in
-[`docs/release/v1_acceptance_gates_bands.yaml`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/v1_acceptance_gates_bands.yaml).
-Headline cross-seed medians (seeds 42–46):
-
-| Tier | LR AUC | AP | P@100 | Brier |
-|---|---|---|---|---|
-| intro | 0.879 | 0.761 | 0.80 | 0.130 |
-| intermediate | 0.886 | 0.575 | 0.59 | 0.110 |
-| advanced | 0.886 | 0.351 | 0.34 | 0.061 |
-
-AP, P@100, conversion-rate, and lift orderings hold across the
-intended difficulty axis (intro > intermediate > advanced).
-
 ## Intended uses
 
-- Teaching baseline lead-scoring on a flat snapshot.
-- Teaching relational feature engineering against snapshot-safe tables.
-- Teaching leakage detection (the `total_touches_all` trap is
-  designed to be discoverable).
-- Teaching calibration, lift, P@K, value-aware ranking
-  (`expected_acv × P(convert)`), and cohort-shift evaluation.
-- Comparing model families under a controlled DGP.
+- Teaching the **public-vs-instructor split** itself: load both
+  datasets side-by-side, show students which columns and tables were
+  redacted, and walk through why each was a leakage path.
+- **Verifying the redaction contract:** train a model on the
+  full-horizon tables, train another on the snapshot-safe public
+  tables, compare AUC.  The gap is the redaction's effect.
+- Teaching **causal structure and DGP transparency** using
+  `metadata/world_spec.json` + `metadata/graph.json`.
+- Reproducing the public dataset from the instructor view via
+  [`leadforge`](https://github.com/leadforge-dev/leadforge/blob/main) source code.
 
 ## Out-of-scope uses
 
-- **Production lead scoring.** The company, product, and customers are
-  fictional.
-- **Vendor benchmarking / paper baselines.** Difficulty tiers are
-  calibrated for pedagogy, not cross-paper comparability.
-- **Causal-inference research that requires recovery of the true DGP.**
-  The instructor companion exposes the hidden graph for teaching, not
-  designed counterfactuals.
-- **Demographic / fairness research.** v1 does not model protected
+- **Production lead scoring.**  Same as the public dataset; the
+  company, product, and customers are fictional.
+- **Modelling with the unredacted view as a baseline.**  Models
+  trained against the full-horizon tables look strong because they're
+  directly seeing post-conversion events.  That number is not a
+  baseline; it's the ceiling.
+- **Demographic / fairness research.**  v1 does not model protected
   attributes.
 
-## Known limitations
-
-- **Difficulty signal on raw AUC is flat.** LR AUC is ~0.88 across
-  every tier. Difficulty is visible in AP, P@K, Brier, and value
-  capture. Treat AUC as a sanity check, not a difficulty signal.
-- **GBM does not consistently beat LR (gate G7.4.4).** GBM−LR AUC delta
-  is slightly negative in every tier (intro −0.0045, intermediate
-  −0.0072, advanced −0.0133); v1's snapshot is dominated by linear
-  features. v2 will inject non-linear interactions in the simulator.
-- **Channel signal is weak.** Per
-  [`docs/release/channel_signal_audit.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/channel_signal_audit.md),
-  out-of-sample univariate AUC of `lead_source` is ≈0.50–0.52 across
-  all tiers and the per-channel rate spread is ≤0.05. The simulator
-  does not encode channel-conditional probabilities; channel-conditional
-  encoding is post-v1 work.
-- **Cohort-shift degradation is small.** v1 has no time-of-year drift
-  baked in; the cohort-shift gate (G6.4) is informational and will
-  bite in v2.
-
 ## Composition
 
-- **Entities.** Accounts, contacts, leads, touches, sessions,
-  sales_activities, opportunities (public); plus customers and
-  subscriptions (instructor only). Per-row counts per bundle live in
-  `manifest.json`.
-- **Features.** 32 public columns grouped by analytical role in
-  [`docs/release/feature_dictionary.md`](https://github.com/leadforge-dev/leadforge/blob/main/docs/release/feature_dictionary.md);
-  the per-bundle `feature_dictionary.csv` is the authoritative
-  machine-readable spec.
-- **Label.** `converted_within_90_days` (boolean), event-derived from
-  the simulator. Never sampled directly.
-- **Splits.** 70/15/15 train/valid/test, deterministic given seed;
-  recorded in `tasks/converted_within_90_days/task_manifest.json`.
-- **Provenance.** Recipe `b2b_saas_procurement_v1`, seed 42, package
-  version stamped in `manifest.json`.
-
-## Maintenance, adversarial framing, license
-
-We *want* the dataset to be broken. Issue templates ship under
-`.github/ISSUE_TEMPLATE/` (Phase 6); the break-me guide lands as
-`docs/release/break_me_guide.md` (PR 6.3). Once Phase 6 ships,
-`docs/release/v2_decision_log.md` will track every accepted finding
-and the design call that came from it. File issues at
+- **Entities.**  9 relational tables (accounts, contacts, leads,
+  touches, sessions, sales_activities, opportunities, customers,
+  subscriptions); per-row counts in `manifest.json`.
+- **Splits.**  Identical to the public `intermediate` config: 70/15/15
+  train/valid/test, deterministic given seed 42, recorded in
+  `tasks/converted_within_90_days/task_manifest.json`.
+- **Provenance.**  Recipe `b2b_saas_procurement_v1`, seed 42, package
+  version stamped in `manifest.json` along with SHA-256 hashes for
+  every parquet file.
+- **Bundle schema version.**  5 (matches the public dataset).
+
+## Maintenance, license
+
+We *want* the dataset to be broken.  See the
+[public dataset card](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1)
+for the adversarial-framing pointers, the issue templates, and the
+break-me guide.  File issues at
 [leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);
 PRs welcome.
 
@@ -252,10 +156,11 @@ PRs welcome.
 |---|---|
 | Generator | leadforge `1.0.0+` |
 | Recipe | `b2b_saas_procurement_v1` |
-| Canonical seed | 42 (cross-seed sweep: 42–46) |
+| Canonical seed | 42 |
 | Bundle schema version | 5 |
-| Format | Parquet (canonical) + CSV (convenience) |
+| Format | Parquet (canonical) |
 | License | MIT — see [LICENSE](LICENSE) |
+| Public dataset | [link](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) |
 
-Verify integrity with `leadforge validate <bundle_dir>`; every file
-is hashed in `manifest.json`.
+Verify integrity with `leadforge validate <bundle_dir>`; every file is
+hashed in `manifest.json`.
diff --git a/release/huggingface/README.md b/release/huggingface/README.md
index cdbcf05..885e834 100644
--- a/release/huggingface/README.md
+++ b/release/huggingface/README.md
@@ -1,5 +1,5 @@
 ---
-pretty_name: "LeadForge: Synthetic B2B Lead Scoring (v1)"
+pretty_name: 'LeadForge: Synthetic B2B Lead Scoring (v1)'
 license: mit
 language:
   - en
diff --git a/scripts/_release_common.py b/scripts/_release_common.py
index b1b61f5..c57e71e 100644
--- a/scripts/_release_common.py
+++ b/scripts/_release_common.py
@@ -10,13 +10,17 @@
 * rewrite the ``](validation/validation_report.md)`` link to a GitHub
   blob URL — the validation report does not ship with either upload
   tree;
-* refuse to assemble into ``cwd`` / the release dir / its parent / the
-  filesystem anchor (the assembler rmtrees children of the upload dir);
+* refuse to assemble into ``cwd`` / the release dir / its parent / a
+  descendant of the release dir / the filesystem anchor (the assembler
+  rmtrees children of the upload dir);
 * validate the dataset cover image's dimensions against Kaggle's floor
-  (HF reuses the same PNG for its thumbnail).
+  (HF reuses the same PNG for its thumbnail);
+* perform idempotent file/dir replacement during upload-tree assembly;
+* read tier ``manifest.json`` payloads;
+* guard the source-side tree diagram in ``release/README.md`` against
+  drifting away from the platform packagers' ``replace()`` substring
+  (silent-failure trap).
 
-Lifting the four to one module keeps the rules in one place — if Kaggle
-ever extends the dimension floor, both packagers pick it up.
 ``FieldDescriptor`` / ``ResourceSchema`` / dtype maps are deliberately
 NOT extracted: HF infers schema from parquet via ``load_dataset`` and
 does not need a Frictionless-shaped declaration.
@@ -24,10 +28,12 @@
 
 from __future__ import annotations
 
+import json
 import re
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Final
+from typing import Any, Final
 
 from PIL import Image
 
@@ -62,15 +68,9 @@ def rewrite_release_links(text: str) -> str:
 
 
 # ---------------------------------------------------------------------------
-# Cover-image validation (G11.2; HF reuses the same asset)
+# Validation error type — declared before any validator that returns it
 # ---------------------------------------------------------------------------
 
-#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with
-#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable
-#: dimension for thumbnails so the Kaggle floor is the binding one.
-COVER_IMAGE_MIN_WIDTH: Final[int] = 560
-COVER_IMAGE_MIN_HEIGHT: Final[int] = 280
-
 
 @dataclass(frozen=True)
 class ValidationError:
@@ -80,6 +80,72 @@ class ValidationError:
     message: str
 
 
+# ---------------------------------------------------------------------------
+# Source-side tree-diagram guard
+#
+# ``release/README.md`` (PR 4.1) contains a "What's inside" tree
+# diagram describing the SOURCE-REPO layout. Each packager substitutes
+# this block for its own upload-tree diagram via ``str.replace``.
+# Keeping the source-side block as a single shared constant means the
+# README's tree diagram drifting away from the constant is caught
+# loudly by the same guard regardless of which packager runs first.
+# ---------------------------------------------------------------------------
+
+SOURCE_TREE_BLOCK: Final[str] = """```
+release/
+├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
+│   ├── manifest.json                 # provenance + file hashes
+│   ├── dataset_card.md               # auto-rendered per-bundle card
+│   ├── feature_dictionary.csv        # authoritative column spec
+│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
+│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
+│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
+├── intermediate_instructor/          # research companion: full-horizon tables + metadata/
+├── notebooks/01_baseline_lead_scoring.ipynb
+└── validation/                       # validation_report.{json,md} + figures
+```"""
+
+
+def validate_readme_substitution(release_dir: Path, *, packager_name: str) -> list[ValidationError]:
+    """Guard against silent drift between the README's tree diagram
+    and ``SOURCE_TREE_BLOCK``.
+
+    Plain string ``replace()`` silently no-ops when the substring is
+    not found, which would publish the source-repo tree diagram on the
+    target platform. ``packager_name`` is folded into the error message
+    so the trace says which packager noticed the drift first.
+    """
+
+    readme = release_dir / "README.md"
+    if not readme.exists():
+        return []  # No README is itself a release-day issue, but not this validator's concern.
+    if SOURCE_TREE_BLOCK not in readme.read_text(encoding="utf-8"):
+        return [
+            ValidationError(
+                field="release/README.md",
+                message=(
+                    f"SOURCE_TREE_BLOCK not found verbatim in release/README.md; "
+                    f"the source-repo tree diagram in the README has drifted from "
+                    f"the constant in scripts/_release_common.py — the "
+                    f"{packager_name} README rewrite will silently no-op until "
+                    f"the README and SOURCE_TREE_BLOCK are reconciled."
+                ),
+            )
+        ]
+    return []
+
+
+# ---------------------------------------------------------------------------
+# Cover-image validation (G11.2; HF reuses the same asset)
+# ---------------------------------------------------------------------------
+
+#: Cover-image minimum dimensions per G11.2: 560 × 280 minimum, with
+#: 2:1 header / 1:1 thumbnail crops in mind. HF accepts any reasonable
+#: dimension for thumbnails so the Kaggle floor is the binding one.
+COVER_IMAGE_MIN_WIDTH: Final[int] = 560
+COVER_IMAGE_MIN_HEIGHT: Final[int] = 280
+
+
 def validate_cover_image(path: Path) -> list[ValidationError]:
     """Validate that ``path`` exists and meets the dimension floor."""
 
@@ -116,18 +182,89 @@ def validate_upload_dir_safe(upload_dir: Path, release_dir: Path, *, kind: str)
     """Refuse to assemble into a path that aliases something dangerous.
 
     The packagers replace children of ``upload_dir`` (rmtree + recopy)
-    so pointing it at ``cwd`` / ``release_dir`` / their parents / the
-    filesystem anchor would clobber unrelated content. ``kind`` is
-    folded into the error message so the trace says which packager
-    refused (``kaggle`` / ``huggingface`` / ``huggingface-instructor``).
+    so pointing it at any of the following would clobber unrelated
+    content:
+
+    * ``cwd`` or its anchor (``/`` on POSIX);
+    * ``release_dir`` itself;
+    * a parent of ``release_dir``;
+    * a descendant of ``release_dir`` other than the canonical platform
+      output dirs (e.g. ``release/intro`` would rmtree the intro
+      bundle).
+
+    The safe location for an upload tree is a sibling of ``release/``
+    or a fresh subdirectory of ``release/`` that the packager owns
+    (e.g. ``release/kaggle/``, ``release/huggingface/``).  We allow
+    those by checking the resolved path against the explicit blocklist
+    and against descendant containment in ``release_dir``.
+
+    ``kind`` is folded into the error message so the trace says which
+    packager refused (``kaggle`` / ``huggingface`` /
+    ``huggingface-instructor``).
     """
 
     resolved = upload_dir.resolve()
+    release_resolved = release_dir.resolve()
     blocked = {
         Path(resolved.anchor),
         Path.cwd().resolve(),
-        release_dir.resolve(),
-        release_dir.resolve().parent,
+        release_resolved,
+        release_resolved.parent,
     }
     if resolved in blocked:
         raise ValueError(f"refusing to assemble into unsafe --{kind}-dir: {upload_dir}")
+    # Disallow descendants of release_dir other than direct children
+    # owned by a packager. ``resolved.parent == release_resolved`` is
+    # the canonical ok-case (release/kaggle, release/huggingface,
+    # release/huggingface-instructor); deeper nesting would alias a
+    # tier bundle dir.
+    if resolved.is_relative_to(release_resolved) and resolved.parent != release_resolved:
+        raise ValueError(
+            f"refusing to assemble into unsafe --{kind}-dir: {upload_dir} "
+            f"(would alias contents inside {release_dir})"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Filesystem helpers — idempotent file / dir replacement
+# ---------------------------------------------------------------------------
+
+
+def replace_file(src: Path, dst: Path) -> None:
+    """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``."""
+
+    if dst.is_symlink() or dst.is_file():
+        dst.unlink()
+    elif dst.exists() and dst.is_dir():
+        shutil.rmtree(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dst)
+
+
+def replace_dir(src: Path, dst: Path) -> None:
+    """Copy directory ``src`` → ``dst``, replacing any existing entry."""
+
+    if dst.is_symlink() or dst.is_file():
+        dst.unlink()
+    elif dst.exists() and dst.is_dir():
+        shutil.rmtree(dst)
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copytree(src, dst)
+
+
+# ---------------------------------------------------------------------------
+# Manifest reading
+# ---------------------------------------------------------------------------
+
+
+def load_manifest(path: Path) -> dict[str, Any]:
+    """Read a tier's ``manifest.json`` and return the parsed dict.
+
+    Raises ``ValueError`` when the JSON parses to anything other than a
+    top-level object (the packagers index into it expecting a dict).
+    """
+
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"manifest.json at {path} is not a JSON object")
+    return payload
diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py
index 17a72db..382bed5 100644
--- a/scripts/package_hf_release.py
+++ b/scripts/package_hf_release.py
@@ -4,8 +4,8 @@
 PR 5.2 — second of two PRs in Phase 5 (Platform packaging) of the v1
 release roadmap.  This script:
 
-1. Reads each public tier's ``manifest.json`` + flat CSV header under
-   ``release/`` and assembles a Hugging Face dataset card
+1. Reads each public tier's ``manifest.json`` under ``release/`` and
+   assembles a Hugging Face dataset card
    (``release/huggingface/README.md``) whose YAML frontmatter satisfies
    G12.1 of ``docs/release/v1_acceptance_gates.md``: ``pretty_name``,
    ``license: mit``, ``language: en``, ``task_categories:
@@ -26,38 +26,46 @@
    ``leadforge-lead-scoring-v1-instructor`` companion repo (G12.4)
    from ``release/intermediate_instructor/`` into a separate
    ``release/huggingface-instructor/`` tree with one config
-   (``intermediate``).  Same code path; different defaults.
+   (``intermediate``).  The instructor variant ships its OWN dataset
+   card body (``INSTRUCTOR_BODY``) — inlining the public README would
+   leak three-tier prose into a single-tier dataset.
 
 The actual ``huggingface_hub.HfApi().upload_folder()`` call lives in
 PR 7.2; this script is intentionally publish-free.  ``--dry-run``
 writes the README only and skips upload-tree assembly, useful for
 README iteration.
 
-Failed validation exits with rc=1; pre-flight errors (missing release
-dir, missing tier, unsafe ``--huggingface-dir``) exit with rc=2.
+Validation discipline: ``run_packager`` runs every check first and
+returns ``errors`` WITHOUT writing any artifact when validation fails.
+The CLI converts ``errors`` into rc=1.  Pre-flight problems (missing
+release dir, unsafe ``--huggingface-dir``) raise and exit with rc=2
+without touching disk.
 """
 
 from __future__ import annotations
 
 import argparse
-import json
-import shutil
 import sys
 from collections.abc import Sequence
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Final
 
+import yaml
+
 # Make ``scripts/`` importable regardless of how this file is loaded.
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 
-# ``GITHUB_BLOB_BASE`` is re-exported for tests and downstream callers
-# (mirrors the symbol surface of ``scripts/package_kaggle_release.py``).
 from _release_common import (  # noqa: E402,F401 — must follow sys.path insert
     GITHUB_BLOB_BASE,
+    SOURCE_TREE_BLOCK,
     ValidationError,
+    load_manifest,
+    replace_dir,
+    replace_file,
     rewrite_release_links,
     validate_cover_image,
+    validate_readme_substitution,
     validate_upload_dir_safe,
 )
 
@@ -66,17 +74,18 @@
 # https://huggingface.co/docs/hub/datasets-cards)
 # ---------------------------------------------------------------------------
 
-#: Allowed ``size_categories`` tokens (HF taxonomy).  Each public tier
-#: has 5,000 leads (3500 train / 750 valid / 750 test) so ``1K<n<10K``
-#: is the right bucket; constants are kept here so ``--variant=instructor``
-#: can pick the right bucket too if the tier sizes ever change.
+#: Allowed ``size_categories`` token (HF taxonomy).  Each public tier
+#: ships task splits of 3500 / 750 / 750 rows; HF computes the size
+#: bucket from ``data_files``, so the largest split (3.5K) lands the
+#: dataset in the ``1K<n<10K`` bucket.  Pinned manually here rather
+#: than derived from the manifest because (a) the bucket is stable
+#: across the v1 family and (b) deriving from manifest row counts adds
+#: I/O and a bucket-mapping helper for no real benefit at v1 scale.
 HF_SIZE_BUCKET_5K: Final[str] = "1K<n<10K"
 
 #: Public-tier configs land under ``intro``/``intermediate``/``advanced``;
 #: ``intermediate`` is the recommended entry point per G12.2 (the
-#: difficulty band sits between the two extremes).  Picking
-#: ``intermediate`` as default mirrors ``release/HF_DATASET_CARD.md``
-#: (the legacy stub PR 5.2 supersedes).
+#: difficulty band sits between the two extremes).
 DEFAULT_DEFAULT_CONFIG: Final[str] = "intermediate"
 
 #: Allowed HF dataset-card ``task_categories`` token for tabular
@@ -87,10 +96,6 @@
 # Release-specific defaults
 # ---------------------------------------------------------------------------
 
-DEFAULT_OWNER: Final[str] = "leadforge"
-DEFAULT_DATASET_SLUG: Final[str] = "leadforge-lead-scoring-v1"
-DEFAULT_INSTRUCTOR_DATASET_SLUG: Final[str] = "leadforge-lead-scoring-v1-instructor"
-
 DEFAULT_PRETTY_NAME: Final[str] = "LeadForge: Synthetic B2B Lead Scoring (v1)"
 DEFAULT_INSTRUCTOR_PRETTY_NAME: Final[str] = (
     "LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion"
@@ -112,12 +117,10 @@
 DEFAULT_LANGUAGE: Final[str] = "en"
 
 DEFAULT_PUBLIC_TIERS: Final[tuple[str, ...]] = ("intro", "intermediate", "advanced")
-DEFAULT_INSTRUCTOR_TIERS: Final[tuple[str, ...]] = ("intermediate_instructor",)
 
-#: HF expects the validation split to be called ``validation`` (HF
-#: convention; ``valid`` would still load but the viewer labels it
-#: differently).  The committed parquet is ``valid.parquet``; the
-#: ``configs[*].data_files`` mapping bridges the two.
+#: HF expects the validation split to be called ``validation``.  The
+#: committed parquet is ``valid.parquet``; the ``configs[*].data_files``
+#: mapping bridges the two.
 DEFAULT_TASK: Final[str] = "converted_within_90_days"
 HF_SPLIT_NAMES: Final[tuple[tuple[str, str], ...]] = (
     ("train", "train"),
@@ -130,32 +133,10 @@
 DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR: Final[Path] = Path("release/huggingface-instructor")
 DEFAULT_COVER_IMAGE: Final[Path] = Path("release/dataset-cover-image.png")
 
-#: HF dataset cards do not require a cover image, but the field is
-#: harmless and keeps the published thumbnail aligned with Kaggle's.
-COVER_IMAGE_FILENAME: Final[str] = "dataset-cover-image.png"
-
 # ---------------------------------------------------------------------------
-# README rewriting — HF-specific tree block
+# README rewriting — HF-specific tree blocks
 # ---------------------------------------------------------------------------
 
-#: Source-repo tree diagram from ``release/README.md`` — must match the
-#: PR 4.1 README byte-for-byte; the silent-failure guard
-#: (:func:`_validate_readme_substitution`) fires when the two drift
-#: apart (mirrors PR 5.1's ``KAGGLE_TREE_BLOCK`` validator).
-HF_TREE_BLOCK_SOURCE: Final[str] = """```
-release/
-├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
-│   ├── manifest.json                 # provenance + file hashes
-│   ├── dataset_card.md               # auto-rendered per-bundle card
-│   ├── feature_dictionary.csv        # authoritative column spec
-│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
-│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
-│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
-├── intermediate_instructor/          # research companion: full-horizon tables + metadata/
-├── notebooks/01_baseline_lead_scoring.ipynb
-└── validation/                       # validation_report.{json,md} + figures
-```"""
-
 HF_UPLOAD_TREE_BLOCK: Final[str] = """```
 .
 ├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
@@ -176,7 +157,7 @@
 │   ├── manifest.json                 # provenance + file hashes
 │   ├── dataset_card.md               # auto-rendered per-bundle card
 │   ├── feature_dictionary.csv        # authoritative column spec
-│   ├── tables/*.parquet              # 9 full-horizon tables (incl. customers, subscriptions)
+│   ├── tables/*.parquet              # full-horizon tables (incl. customers, subscriptions)
 │   ├── tasks/converted_within_90_days/{train,valid,test}.parquet
 │   └── metadata/                     # world_spec, graph.{graphml,json}, latent_registry, etc.
 ├── README.md                         # this file (HF dataset card)
@@ -185,53 +166,159 @@
 ```"""
 
 
-def _hf_readme_text(readme: str, *, tree_block: str = HF_UPLOAD_TREE_BLOCK) -> str:
-    """Apply the HF-specific rewrites to a copy of the release README.
-
-    Rewrites:
+def _hf_public_readme_text(readme: str) -> str:
+    """Rewrite the public release README into the HF public dataset card.
 
-    1. Source-repo tree diagram → upload-tree diagram (the published
-       README should describe what the *user* sees on HF, not the
-       source repo layout).
-    2. ``](../foo)`` → ``](GITHUB_BLOB_BASE/foo)`` and
-       ``](validation/...)`` → blob URL — both via
-       ``rewrite_release_links`` from ``_release_common``.
-
-    The instructor variant calls this with
-    ``tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK``; otherwise the
-    public upload tree is used.
+    1. Source-repo tree diagram → public upload-tree diagram.
+    2. ``](../foo)`` and ``](validation/...)`` → GitHub blob URLs
+       (delegated to ``rewrite_release_links``).
     """
 
-    text = readme.replace(HF_TREE_BLOCK_SOURCE, tree_block)
-    text = rewrite_release_links(text)
-    return text
-
+    text = readme.replace(SOURCE_TREE_BLOCK, HF_UPLOAD_TREE_BLOCK)
+    return rewrite_release_links(text)
 
-def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]:
-    """Guard against silent drift between the README's tree diagram and
-    ``HF_TREE_BLOCK_SOURCE`` (mirrors PR 5.1's Kaggle-side guard).
 
-    Plain string ``replace()`` silently no-ops when the substring is
-    not found, which would publish the source-repo tree diagram on HF.
-    """
+# ---------------------------------------------------------------------------
+# Instructor-companion body
+#
+# Inlining the public README into the instructor card published 3-tier
+# prose for a 1-tier dataset (PR 5.2 self-review #2).  The instructor
+# package serves a different audience — researchers who already read
+# the public dataset card — so it ships a focused, self-contained body
+# that links to the public dataset for shared framing.
+#
+# The constant is the WHOLE markdown body (no YAML; that comes from the
+# generated frontmatter).  It is byte-stable; the audit-artifact-sync
+# test catches any drift against the committed instructor README.
+# ---------------------------------------------------------------------------
 
-    readme = release_dir / "README.md"
-    if not readme.exists():
-        return []
-    if HF_TREE_BLOCK_SOURCE not in readme.read_text(encoding="utf-8"):
-        return [
-            ValidationError(
-                field="release/README.md",
-                message=(
-                    "HF_TREE_BLOCK_SOURCE not found verbatim in release/README.md; "
-                    "the source-repo tree diagram in the README has drifted from "
-                    "the constant in scripts/package_hf_release.py — the "
-                    "HF README rewrite will silently no-op until the README and "
-                    "HF_TREE_BLOCK_SOURCE are reconciled."
-                ),
-            )
-        ]
-    return []
+INSTRUCTOR_BODY: Final[str] = f"""\
+# LeadForge: Synthetic B2B Lead Scoring (v1) — Instructor companion
+
+This is the **research / instructor companion** to the public
+[`leadforge/leadforge-lead-scoring-v1`](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1)
+dataset.  It exposes the **full-horizon** view of a single difficulty
+tier (`intermediate`) plus the **hidden causal structure** that the
+public dataset deliberately redacts: the world graph (DAG), latent
+trait registry, mechanism summary, and full-horizon relational tables
+including `customers` and `subscriptions`.
+
+It exists for instructors who want to walk students through how the
+public dataset was generated, and for researchers who want to verify
+that the public redactions actually remove the leakage paths the
+dataset advertises.  **It is not a replacement for the public dataset
+in any teaching or modelling context** — students should still train
+on the public bundle.
+
+## What this companion contains
+
+{HF_INSTRUCTOR_UPLOAD_TREE_BLOCK}
+
+The single ``intermediate`` config exposes the same train/valid/test
+parquet splits as the public dataset's ``intermediate`` config — same
+seeds, same row counts (3,500 / 750 / 750), same target.  The
+difference lives in the relational tables and metadata:
+
+| File | Public `intermediate` | Instructor companion |
+|---|---|---|
+| `tables/leads.parquet` | redacted (label dropped) | full (label retained) |
+| `tables/opportunities.parquet` | snapshot-filtered + redacted | full-horizon, full columns |
+| `tables/customers.parquet` | omitted (would leak label) | included |
+| `tables/subscriptions.parquet` | omitted (would leak label) | included |
+| `tables/touches.parquet` etc. | filtered to ≤ snapshot day | full 90-day horizon |
+| `metadata/world_spec.json` | absent | included (DGP + recipe) |
+| `metadata/graph.{{graphml,json}}` | absent | included (hidden DAG) |
+| `metadata/latent_registry.json` | absent | included (latent traits) |
+| `metadata/mechanism_summary.json` | absent | included (per-edge mechanisms) |
+
+The redaction contract is single-sourced in
+[`leadforge/validation/leakage_probes.py`]({GITHUB_BLOB_BASE}/leadforge/validation/leakage_probes.py)
+and re-applied by
+[`leadforge/render/relational_snapshot_safe.py`]({GITHUB_BLOB_BASE}/leadforge/render/relational_snapshot_safe.py)
+when the public bundle is built; this companion is the unfiltered
+source view, so the two are always consistent by construction.
+
+## Quick start
+
+```python
+from datasets import load_dataset
+
+# Loads the same train/valid/test splits as the public 'intermediate'
+# config; differs only in what `tables/` and `metadata/` provide.
+ds = load_dataset(
+    "leadforge/leadforge-lead-scoring-v1-instructor",
+    name="intermediate",
+)
+train = ds["train"].to_pandas()
+
+# Full-horizon relational tables — includes customers and subscriptions
+# (omitted from the public dataset because their existence reconstructs
+# the conversion label).
+import pandas as pd
+customers = pd.read_parquet(
+    "hf://datasets/leadforge/leadforge-lead-scoring-v1-instructor/intermediate/tables/customers.parquet"
+)
+```
+
+## Intended uses
+
+- Teaching the **public-vs-instructor split** itself: load both
+  datasets side-by-side, show students which columns and tables were
+  redacted, and walk through why each was a leakage path.
+- **Verifying the redaction contract:** train a model on the
+  full-horizon tables, train another on the snapshot-safe public
+  tables, compare AUC.  The gap is the redaction's effect.
+- Teaching **causal structure and DGP transparency** using
+  `metadata/world_spec.json` + `metadata/graph.json`.
+- Reproducing the public dataset from the instructor view via
+  [`leadforge`]({GITHUB_BLOB_BASE}) source code.
+
+## Out-of-scope uses
+
+- **Production lead scoring.**  Same as the public dataset; the
+  company, product, and customers are fictional.
+- **Modelling with the unredacted view as a baseline.**  Models
+  trained against the full-horizon tables look strong because they're
+  directly seeing post-conversion events.  That number is not a
+  baseline; it's the ceiling.
+- **Demographic / fairness research.**  v1 does not model protected
+  attributes.
+
+## Composition
+
+- **Entities.**  9 relational tables (accounts, contacts, leads,
+  touches, sessions, sales_activities, opportunities, customers,
+  subscriptions); per-row counts in `manifest.json`.
+- **Splits.**  Identical to the public `intermediate` config: 70/15/15
+  train/valid/test, deterministic given seed 42, recorded in
+  `tasks/converted_within_90_days/task_manifest.json`.
+- **Provenance.**  Recipe `b2b_saas_procurement_v1`, seed 42, package
+  version stamped in `manifest.json` along with SHA-256 hashes for
+  every parquet file.
+- **Bundle schema version.**  5 (matches the public dataset).
+
+## Maintenance, license
+
+We *want* the dataset to be broken.  See the
+[public dataset card](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1)
+for the adversarial-framing pointers, the issue templates, and the
+break-me guide.  File issues at
+[leadforge-dev/leadforge](https://github.com/leadforge-dev/leadforge);
+PRs welcome.
+
+| Field | Value |
+|---|---|
+| Generator | leadforge `1.0.0+` |
+| Recipe | `b2b_saas_procurement_v1` |
+| Canonical seed | 42 |
+| Bundle schema version | 5 |
+| Format | Parquet (canonical) |
+| License | MIT — see [LICENSE](LICENSE) |
+| Public dataset | [link](https://huggingface.co/datasets/leadforge/leadforge-lead-scoring-v1) |
+
+Verify integrity with `leadforge validate <bundle_dir>`; every file is
+hashed in `manifest.json`.
+"""
 
 
 # ---------------------------------------------------------------------------
@@ -264,9 +351,9 @@ class ConfigEntry:
 class HuggingFaceCard:
     """Top-level HF dataset-card payload.
 
-    ``body`` is the markdown that follows the YAML frontmatter — by
-    default the rewritten ``release/README.md`` (G12.1 expects the
-    README to BE the dataset card; the YAML lives at the top).
+    ``body`` is the markdown that follows the YAML frontmatter — the
+    rewritten public README for the public variant, the
+    ``INSTRUCTOR_BODY`` constant for the instructor variant.
     """
 
     pretty_name: str
@@ -369,51 +456,27 @@ def validate_card(card: HuggingFaceCard) -> list[ValidationError]:
 
 
 # ---------------------------------------------------------------------------
-# Bundle reading + config building
+# Card construction
 # ---------------------------------------------------------------------------
 
 
-def _load_manifest(path: Path) -> dict[str, Any]:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-    if not isinstance(payload, dict):
-        raise ValueError(f"manifest.json at {path} is not a JSON object")
-    return payload
-
-
-def build_config_for_tier(
-    release_dir: Path,
+def _config_for_tier(
     *,
-    tier_dir: str,
     config_name: str,
     task: str = DEFAULT_TASK,
     default: bool = False,
 ) -> ConfigEntry:
-    """Build a single ``configs[]`` entry for one tier.
-
-    ``tier_dir`` is the directory name under the upload tree
-    (``intro`` / ``intermediate`` / ``advanced`` for the public
-    variant; ``intermediate`` for the instructor companion which
-    flattens ``intermediate_instructor/`` → ``intermediate/`` in the
-    upload tree).  ``config_name`` is what users pass to
-    ``load_dataset(..., name=...)``.
-
-    The function reads ``manifest.json`` to confirm the tier exists
-    and the task splits are declared; it does NOT fail when files are
-    missing on disk because the upload tree is reassembled by
-    :func:`assemble_upload_dir` and the manifest is the single source
-    of truth for what should be there.
+    """Pure constructor — no I/O.
+
+    Builds a ``ConfigEntry`` from the canonical task-split layout.
+    The earlier draft also opened the manifest to check the task was
+    declared, but that's I/O for a structural check that
+    :func:`assemble_upload_dir` already enforces (the parquet copies
+    fail loud if the source paths are missing).  Keeping it pure makes
+    card construction work without a fully-populated release dir,
+    which is what tests want.
     """
 
-    manifest_path = release_dir / tier_dir / "manifest.json"
-    if not manifest_path.exists():
-        raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest_path}")
-    manifest = _load_manifest(manifest_path)
-    if task not in manifest.get("tasks", {}):
-        raise ValueError(
-            f"task {task!r} not declared in manifest at {manifest_path}; "
-            f"got {sorted(manifest.get('tasks', {}))!r}"
-        )
-
     data_files = tuple(
         DataFileEntry(
             split=hf_split,
@@ -424,6 +487,20 @@ def build_config_for_tier(
     return ConfigEntry(config_name=config_name, data_files=data_files, default=default)
 
 
+def _assert_tier_dir_exists(release_dir: Path, tier_dir: str) -> None:
+    """Cheap preflight: tier dir must contain a ``manifest.json``.
+
+    Catches user typos at ``--dry-run`` time rather than waiting for
+    ``shutil.copytree`` to fail during assembly.  No JSON parse — we
+    only care that the tier exists; the schema is the assembler's
+    concern.
+    """
+
+    manifest = release_dir / tier_dir / "manifest.json"
+    if not manifest.exists():
+        raise FileNotFoundError(f"manifest.json missing for tier {tier_dir!r}: {manifest}")
+
+
 def build_card(
     release_dir: Path,
     *,
@@ -441,62 +518,40 @@ def build_card(
     """Assemble a ``HuggingFaceCard`` from the release tree.
 
     ``variant="public"`` builds the three-tier card pointing at
-    ``release/{intro,intermediate,advanced}/``;
+    ``release/{intro,intermediate,advanced}/`` and uses the rewritten
+    public README as the body.
+
     ``variant="instructor"`` builds a single-config card pointing at
-    ``release/intermediate_instructor/`` (flattened to
-    ``intermediate/`` in the upload tree).
-
-    When ``body`` is ``None`` (the default) we lift the contents of
-    ``release/README.md`` and apply the HF-specific rewrites — HF
-    renders the body as the dataset card so a full README there is
-    more useful than a curated blurb (mirrors the Kaggle packager's
-    description handling).
+    ``release/intermediate_instructor/`` (flattened to ``intermediate/``
+    in the upload tree) and uses :data:`INSTRUCTOR_BODY` as the body.
+    The instructor variant gets a focused body rather than the public
+    README to avoid publishing 3-tier prose for a 1-tier dataset.
     """
 
     if variant == "public":
-        public_tiers = (
-            ("intro", "intro"),
-            ("intermediate", "intermediate"),
-            ("advanced", "advanced"),
-        )
+        for tier in DEFAULT_PUBLIC_TIERS:
+            _assert_tier_dir_exists(release_dir, tier)
         configs = tuple(
-            build_config_for_tier(
-                release_dir,
-                tier_dir=tier_dir,
-                config_name=config_name,
+            _config_for_tier(
+                config_name=tier,
                 task=task,
-                default=(config_name == default_config),
+                default=(tier == default_config),
             )
-            for tier_dir, config_name in public_tiers
+            for tier in DEFAULT_PUBLIC_TIERS
         )
         if pretty_name is None:
             pretty_name = DEFAULT_PRETTY_NAME
         if body is None:
-            body = _hf_readme_text(
+            body = _hf_public_readme_text(
                 (release_dir / "README.md").read_text(encoding="utf-8"),
-                tree_block=HF_UPLOAD_TREE_BLOCK,
             )
     elif variant == "instructor":
-        # Companion repo flattens ``intermediate_instructor`` →
-        # ``intermediate`` in the upload tree so the HF dataset slug
-        # ``leadforge-lead-scoring-v1-instructor`` exposes the
-        # familiar config name.
-        configs = (
-            build_config_for_tier(
-                release_dir,
-                tier_dir="intermediate_instructor",
-                config_name="intermediate",
-                task=task,
-                default=True,
-            ),
-        )
+        _assert_tier_dir_exists(release_dir, "intermediate_instructor")
+        configs = (_config_for_tier(config_name="intermediate", task=task, default=True),)
         if pretty_name is None:
             pretty_name = DEFAULT_INSTRUCTOR_PRETTY_NAME
         if body is None:
-            body = _hf_readme_text(
-                (release_dir / "README.md").read_text(encoding="utf-8"),
-                tree_block=HF_INSTRUCTOR_UPLOAD_TREE_BLOCK,
-            )
+            body = INSTRUCTOR_BODY
     else:
         raise ValueError(f"unknown variant: {variant!r} (expected 'public' or 'instructor')")
 
@@ -515,74 +570,71 @@ def build_card(
 # ---------------------------------------------------------------------------
 # Rendering
 #
-# YAML is hand-rolled rather than dumped by PyYAML because the HF
-# dataset-card YAML has a precise key order and indentation style that
-# the viewer is fussy about (and PyYAML's default flow style collapses
-# the configs list into a single line).  Hand-rolling also makes the
-# determinism contract obvious in the source.
-# ---------------------------------------------------------------------------
+# YAML serialization uses ``yaml.safe_dump`` with a custom dumper that
+# forces indent-2 on top-level sequences (HF dataset-card examples in
+# the wild use this style; PyYAML's default puts top-level list items
+# at indent 0).  The earlier draft hand-rolled the renderer with a
+# brittle quoting heuristic; PR 5.2's self-review caught the hack and
+# replaced it with PyYAML.
 # ---------------------------------------------------------------------------
 
 
-def _yaml_string(value: str) -> str:
-    """Render a YAML string scalar.
+class _IndentedDumper(yaml.SafeDumper):
+    """Force indent-2 on top-level sequences.
 
-    HF parses the frontmatter with PyYAML.  We use double-quoted
-    strings only when the value contains characters that would change
-    YAML semantics (``:``, leading ``- ``, leading whitespace).  Plain
-    scalars are used otherwise so the YAML stays readable.
+    PyYAML's default ``increase_indent`` returns ``indentless=True``
+    when emitting a top-level block sequence, putting ``- item`` at
+    column 0.  HF dataset cards conventionally indent these by 2 (per
+    the examples in the HF docs), so we override the flag to ``False``.
     """
 
-    if value == "":
-        return '""'
-    needs_quoting = any(c in value for c in ":#&*!|>'\"%@`")
-    needs_quoting = needs_quoting or value.startswith(("- ", " ", "?", "[", "{", "*", "&"))
-    needs_quoting = needs_quoting or value.endswith(" ")
-    if not needs_quoting:
-        return value
-    escaped = value.replace("\\", "\\\\").replace('"', '\\"')
-    return f'"{escaped}"'
+    def increase_indent(self, flow: bool = False, indentless: bool = False) -> Any:  # noqa: ARG002
+        return super().increase_indent(flow, False)
+
 
+def _frontmatter_dict(card: HuggingFaceCard) -> dict[str, Any]:
+    """Build the dict serialised into YAML.
 
-def _yaml_list(values: Sequence[str], *, indent: int) -> list[str]:
-    """Render a YAML block-style list."""
+    Tags are sorted at render time (mirrors the keyword sort in the
+    Kaggle packager) so dataclass order doesn't leak into the rendered
+    file.  The dict is built in field order so PyYAML preserves it
+    (with ``sort_keys=False``).
+    """
 
-    pad = " " * indent
-    return [f"{pad}- {_yaml_string(v)}" for v in values]
+    return {
+        "pretty_name": card.pretty_name,
+        "license": card.license,
+        "language": list(card.language),
+        "task_categories": list(card.task_categories),
+        "size_categories": list(card.size_categories),
+        "tags": sorted(card.tags),
+        "configs": [
+            {
+                "config_name": config.config_name,
+                **({"default": True} if config.default else {}),
+                "data_files": [{"split": df.split, "path": df.path} for df in config.data_files],
+            }
+            for config in card.configs
+        ],
+    }
 
 
 def render_yaml_frontmatter(card: HuggingFaceCard) -> str:
     """Render the YAML frontmatter as a deterministic string.
 
-    Order: ``pretty_name``, ``license``, ``language``,
-    ``task_categories``, ``size_categories``, ``tags``, ``configs``.
-    Tags are sorted at render time (mirrors the keyword sort in the
-    Kaggle packager) so order on the dataclass doesn't leak into the
-    rendered file.
+    The leading and trailing ``---`` markers are included so callers
+    can concatenate this directly to the body.
     """
 
-    lines: list[str] = ["---"]
-    lines.append(f"pretty_name: {_yaml_string(card.pretty_name)}")
-    lines.append(f"license: {_yaml_string(card.license)}")
-    lines.append("language:")
-    lines.extend(_yaml_list(card.language, indent=2))
-    lines.append("task_categories:")
-    lines.extend(_yaml_list(card.task_categories, indent=2))
-    lines.append("size_categories:")
-    lines.extend(_yaml_list(card.size_categories, indent=2))
-    lines.append("tags:")
-    lines.extend(_yaml_list(sorted(card.tags), indent=2))
-    lines.append("configs:")
-    for config in card.configs:
-        lines.append(f"  - config_name: {_yaml_string(config.config_name)}")
-        if config.default:
-            lines.append("    default: true")
-        lines.append("    data_files:")
-        for df in config.data_files:
-            lines.append(f"      - split: {_yaml_string(df.split)}")
-            lines.append(f"        path: {_yaml_string(df.path)}")
-    lines.append("---")
-    return "\n".join(lines) + "\n"
+    payload = yaml.dump(
+        _frontmatter_dict(card),
+        Dumper=_IndentedDumper,
+        default_flow_style=False,
+        sort_keys=False,
+        allow_unicode=True,
+        width=200,
+    )
+    return f"---\n{payload}---\n"
 
 
 def render_card(card: HuggingFaceCard) -> str:
@@ -599,41 +651,23 @@ def render_card(card: HuggingFaceCard) -> str:
 # ---------------------------------------------------------------------------
 
 
-def _replace_file(src: Path, dst: Path) -> None:
-    """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``."""
-
-    if dst.is_symlink() or dst.is_file():
-        dst.unlink()
-    elif dst.exists() and dst.is_dir():
-        shutil.rmtree(dst)
-    dst.parent.mkdir(parents=True, exist_ok=True)
-    shutil.copy2(src, dst)
-
-
-def _replace_dir(src: Path, dst: Path) -> None:
-    """Copy directory ``src`` → ``dst``, replacing any existing entry."""
-
-    if dst.is_symlink() or dst.is_file():
-        dst.unlink()
-    elif dst.exists() and dst.is_dir():
-        shutil.rmtree(dst)
-    dst.parent.mkdir(parents=True, exist_ok=True)
-    shutil.copytree(src, dst)
-
-
 def assemble_upload_dir(
     release_dir: Path,
     upload_dir: Path,
     *,
     variant: str = "public",
+    rendered_readme: str,
     cover_image: Path = DEFAULT_COVER_IMAGE,
 ) -> None:
     """Assemble ``upload_dir`` for ``huggingface_hub.upload_folder()``.
 
-    Mirrors PR 5.1's Kaggle assembler: real-file copies of the per-
-    tier bundles + cover image + LICENSE + the rewritten README.  No
-    symlinks (the ``datasets`` library walks the upload tree and silent
-    skips broken symlinks in some versions).
+    Writes the README, copies the cover image and LICENSE, and copies
+    each tier bundle as a real-file copy (no symlinks; ``datasets``
+    walks the tree and silently skips broken symlinks in some
+    versions).  ``rendered_readme`` is the already-validated and
+    rendered card text — passing it in (rather than reading it back
+    from ``run_packager``) means this function produces a complete
+    upload tree on its own.
 
     For ``variant="instructor"``, the source directory
     ``intermediate_instructor/`` is flattened to ``intermediate/`` in
@@ -645,25 +679,31 @@ def assemble_upload_dir(
     validate_upload_dir_safe(upload_dir, release_dir, kind=kind)
     upload_dir.mkdir(parents=True, exist_ok=True)
 
+    # README — written from the validated card text.
+    readme_path = upload_dir / "README.md"
+    if readme_path.is_symlink() or readme_path.is_file():
+        readme_path.unlink()
+    readme_path.write_text(rendered_readme, encoding="utf-8")
+
     # Cover image — reuse the one ``release/`` already ships.
     cover_src = release_dir / cover_image.name
     if not cover_src.exists():
         cover_src = cover_image
     if cover_src.exists():
-        _replace_file(cover_src, upload_dir / COVER_IMAGE_FILENAME)
+        replace_file(cover_src, upload_dir / cover_image.name)
 
     # LICENSE — straight copy, no rewriting.
     license_src = release_dir / "LICENSE"
     if license_src.exists():
-        _replace_file(license_src, upload_dir / "LICENSE")
+        replace_file(license_src, upload_dir / "LICENSE")
 
     # Per-tier bundles — full directory copies.  The instructor variant
     # flattens its source dir name.
     if variant == "public":
         for tier in DEFAULT_PUBLIC_TIERS:
-            _replace_dir(release_dir / tier, upload_dir / tier)
+            replace_dir(release_dir / tier, upload_dir / tier)
     elif variant == "instructor":
-        _replace_dir(
+        replace_dir(
             release_dir / "intermediate_instructor",
             upload_dir / "intermediate",
         )
@@ -699,6 +739,11 @@ def run_packager(
 ) -> PackagerOutcome:
     """Build, validate, and write the HF dataset card.
 
+    Validation discipline: build → validate → ONLY THEN write.  If
+    validation fails the function returns an outcome with populated
+    ``errors`` and **no artifact on disk**.  The CLI converts errors
+    to rc=1.
+
     With ``dry_run=False`` (the default) the packager additionally
     assembles the HF-loadable upload directory under
     ``huggingface_dir`` (real-file copies of the per-tier bundles +
@@ -720,24 +765,43 @@ def run_packager(
     errors: list[ValidationError] = []
     errors.extend(validate_card(card))
     errors.extend(validate_cover_image(cover_image))
-    errors.extend(_validate_readme_substitution(release_dir))
+    if variant == "public":
+        # The instructor body doesn't substitute SOURCE_TREE_BLOCK —
+        # it's a self-contained markdown — so the substitution guard
+        # only applies to the public variant.
+        errors.extend(validate_readme_substitution(release_dir, packager_name="HF"))
 
     readme_path = huggingface_dir / "README.md"
-    readme_path.parent.mkdir(parents=True, exist_ok=True)
-    readme_path.write_text(render_card(card), encoding="utf-8")
 
-    if not dry_run:
+    # Validation gate: don't leave broken artifacts on disk.
+    if errors:
+        return PackagerOutcome(
+            card=card,
+            readme_path=readme_path,
+            errors=tuple(errors),
+            assembled=False,
+        )
+
+    rendered = render_card(card)
+    huggingface_dir.mkdir(parents=True, exist_ok=True)
+
+    if dry_run:
+        # README-only — fast iteration path.  Write directly, don't
+        # invoke the assembler.
+        readme_path.write_text(rendered, encoding="utf-8")
+    else:
         assemble_upload_dir(
             release_dir,
             huggingface_dir,
             variant=variant,
+            rendered_readme=rendered,
             cover_image=cover_image,
         )
 
     return PackagerOutcome(
         card=card,
         readme_path=readme_path,
-        errors=tuple(errors),
+        errors=(),
         assembled=not dry_run,
     )
 
@@ -778,25 +842,8 @@ def _parse_args(argv: Sequence[str] | None) -> argparse.Namespace:
         default=DEFAULT_DEFAULT_CONFIG,
         help=(
             "config that carries default: true (G12.2; "
-            "default: %(default)s for public; ignored for instructor)"
-        ),
-    )
-    parser.add_argument(
-        "--owner",
-        default=DEFAULT_OWNER,
-        help=(
-            "HF dataset owner — currently informational; PR 7.2 will "
-            "consume it via ``huggingface_hub.HfApi().upload_folder``. "
-            "default: %(default)s"
-        ),
-    )
-    parser.add_argument(
-        "--dataset-slug",
-        default=None,
-        help=(
-            "HF dataset slug — currently informational; PR 7.2 will "
-            "consume it. defaults: leadforge-lead-scoring-v1 (public) / "
-            "leadforge-lead-scoring-v1-instructor (instructor)"
+            "default: %(default)s; rejected for --variant=instructor "
+            "which has only one config)"
         ),
     )
     parser.add_argument(
@@ -820,6 +867,16 @@ def main(argv: Sequence[str] | None = None) -> int:
         DEFAULT_HUGGINGFACE_DIR if variant == "public" else DEFAULT_HUGGINGFACE_INSTRUCTOR_DIR
     )
 
+    # Reject silent misconfiguration — instructor variant has one
+    # config and ``--default-config`` is meaningless for it.
+    if variant == "instructor" and args.default_config != DEFAULT_DEFAULT_CONFIG:
+        print(
+            f"error: --default-config={args.default_config!r} is not valid with "
+            f"--variant=instructor (instructor has a single 'intermediate' config)",
+            file=sys.stderr,
+        )
+        return 2
+
     try:
         outcome = run_packager(
             args.release_dir,
diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py
index b8221c0..c83d5c9 100644
--- a/scripts/package_kaggle_release.py
+++ b/scripts/package_kaggle_release.py
@@ -43,7 +43,6 @@
 import argparse
 import json
 import re
-import shutil
 import sys
 from collections.abc import Sequence
 from dataclasses import dataclass, field
@@ -59,24 +58,25 @@
 # ``importlib.util.spec_from_file_location`` from the test suite).
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 
-# Shared release-packaging primitives extracted in PR 5.2 so the HF
-# packager can reuse the link-rewriter, the dir-safety guard, and the
-# cover-image validator without duplicating logic.
-from _release_common import (  # noqa: E402 — must follow sys.path insert
+# Shared release-packaging primitives.  PR 5.2 extracted these so the
+# Kaggle and HF packagers share one source of truth for link rewriting,
+# upload-dir safety, cover-image validation, source-tree-block drift
+# detection, idempotent file/dir replacement, and manifest reading.
+# ``GITHUB_BLOB_BASE`` is re-exported for tests asserting blob URLs in
+# rewritten README content; its presence here is a public-symbol
+# contract, not a local consumer.
+from _release_common import (  # noqa: E402,F401 — must follow sys.path insert
     GITHUB_BLOB_BASE,
+    SOURCE_TREE_BLOCK,
     ValidationError,
+    load_manifest,
+    replace_dir,
+    replace_file,
+    rewrite_release_links,
     validate_cover_image,
+    validate_readme_substitution,
     validate_upload_dir_safe,
 )
-from _release_common import (
-    PARENT_RELATIVE_LINK_RE as _PARENT_RELATIVE_LINK,
-)
-from _release_common import (
-    VALIDATION_REPORT_LINK as _VALIDATION_REPORT_LINK,
-)
-from _release_common import (
-    VALIDATION_REPORT_URL as _VALIDATION_REPORT_URL,
-)
 
 # ---------------------------------------------------------------------------
 # Kaggle field constraints (chatgpt v2 §19, verified from official docs)
@@ -210,30 +210,14 @@ class UserSource:
 # ---------------------------------------------------------------------------
 #
 # ``GITHUB_BLOB_BASE``, the ``](../foo)`` regex, and the validation-
-# report link rewrite live in ``_release_common`` (PR 5.2). Tree-block
-# substitution stays local because the upload tree differs per
-# platform (Kaggle ships ``dataset-metadata.json`` + cover image at
-# the top; HF does not).
+# report link rewrite live in ``_release_common`` (PR 5.2), as does the
+# source-side tree-diagram constant (``SOURCE_TREE_BLOCK``).  Only the
+# Kaggle-specific upload-tree diagram lives here.
 
-#: The "What's inside" tree diagram in ``release/README.md``.  The
-#: published README on Kaggle should describe the *upload* layout
+#: The published README on Kaggle should describe the *upload* layout
 #: (which has dataset-metadata.json + cover image at the top, no
 #: instructor companion, no notebooks/validation siblings), not the
 #: source-repo layout — we substitute the block on the way out.
-KAGGLE_TREE_BLOCK: Final[str] = """```
-release/
-├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
-│   ├── manifest.json                 # provenance + file hashes
-│   ├── dataset_card.md               # auto-rendered per-bundle card
-│   ├── feature_dictionary.csv        # authoritative column spec
-│   ├── lead_scoring.csv              # flat convenience CSV (all splits)
-│   ├── tables/*.parquet              # 7 snapshot-safe relational tables
-│   └── tasks/converted_within_90_days/{train,valid,test}.parquet
-├── intermediate_instructor/          # research companion: full-horizon tables + metadata/
-├── notebooks/01_baseline_lead_scoring.ipynb
-└── validation/                       # validation_report.{json,md} + figures
-```"""
-
 KAGGLE_UPLOAD_TREE_BLOCK: Final[str] = """```
 .
 ├── intro/ intermediate/ advanced/    # student_public bundles, one per difficulty tier
@@ -255,20 +239,15 @@ def _kaggle_readme_text(readme: str) -> str:
 
     Rewrites:
 
-    1. Source-repo tree diagram → upload-tree diagram (the published
-       README should describe what the *user* sees on Kaggle, not the
-       source repo layout).
-    2. ``](../foo)`` → ``]({GITHUB_BLOB_BASE}/foo)`` (markdown links
-       that escape the bundle root resolve to the source repo on
-       GitHub).
-    3. ``](validation/validation_report.md)`` → blob URL (the
-       validation report does not ship to Kaggle; readers click
-       through to GitHub).
+    1. ``SOURCE_TREE_BLOCK`` → ``KAGGLE_UPLOAD_TREE_BLOCK`` (the
+       published README should describe what the *user* sees on
+       Kaggle, not the source repo layout).
+    2. ``](../foo)`` and ``](validation/validation_report.md)`` →
+       GitHub blob URLs (delegated to ``rewrite_release_links``).
     """
 
-    text = readme.replace(KAGGLE_TREE_BLOCK, KAGGLE_UPLOAD_TREE_BLOCK)
-    text = _PARENT_RELATIVE_LINK.sub(rf"]({GITHUB_BLOB_BASE}/\1)", text)
-    text = text.replace(_VALIDATION_REPORT_LINK, _VALIDATION_REPORT_URL)
+    text = readme.replace(SOURCE_TREE_BLOCK, KAGGLE_UPLOAD_TREE_BLOCK)
+    text = rewrite_release_links(text)
     return text
 
 
@@ -553,11 +532,7 @@ def fields_from_parquet(path: Path) -> tuple[FieldDescriptor, ...]:
     return tuple(FieldDescriptor(name=f.name, type=_kaggle_type_from_arrow(f.type)) for f in schema)
 
 
-def _load_manifest(path: Path) -> dict[str, Any]:
-    payload = json.loads(path.read_text(encoding="utf-8"))
-    if not isinstance(payload, dict):
-        raise ValueError(f"manifest.json at {path} is not a JSON object")
-    return payload
+# ``_load_manifest`` is now ``load_manifest`` in ``_release_common``.
 
 
 def build_tier_resources(
@@ -583,7 +558,7 @@ def build_tier_resources(
     flat_csv_path = tier_dir / "lead_scoring.csv"
     fields = _flat_csv_fields(flat_csv_path, feature_dict)
 
-    manifest = _load_manifest(tier_dir / "manifest.json")
+    manifest = load_manifest(tier_dir / "manifest.json")
     table_inventory = manifest.get("tables", {})
     snapshot_day = manifest.get("snapshot_day")
 
@@ -785,38 +760,8 @@ def render_metadata_json(metadata: DatasetMetadata) -> str:
 # ---------------------------------------------------------------------------
 
 
-def _validate_kaggle_dir_safe(kaggle_dir: Path, release_dir: Path) -> None:
-    """Refuse to assemble into a path that aliases something dangerous.
-
-    Thin wrapper that forwards to ``_release_common.validate_upload_dir_safe``
-    with ``kind="kaggle"`` so the error message identifies which packager
-    refused.  The standalone function is kept for the test in
-    ``tests/scripts/test_package_kaggle_release.py`` that imports it directly.
-    """
-
-    validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle")
-
-
-def _replace_file(src: Path, dst: Path) -> None:
-    """Copy ``src`` → ``dst``, replacing any existing entry at ``dst``."""
-
-    if dst.is_symlink() or dst.is_file():
-        dst.unlink()
-    elif dst.exists() and dst.is_dir():
-        shutil.rmtree(dst)
-    dst.parent.mkdir(parents=True, exist_ok=True)
-    shutil.copy2(src, dst)
-
-
-def _replace_dir(src: Path, dst: Path) -> None:
-    """Copy directory ``src`` → ``dst``, replacing any existing entry."""
-
-    if dst.is_symlink() or dst.is_file():
-        dst.unlink()
-    elif dst.exists() and dst.is_dir():
-        shutil.rmtree(dst)
-    dst.parent.mkdir(parents=True, exist_ok=True)
-    shutil.copytree(src, dst)
+# ``_validate_kaggle_dir_safe``, ``_replace_file``, ``_replace_dir``
+# now live in ``_release_common`` (PR 5.2) under their public names.
 
 
 def assemble_upload_dir(
@@ -836,25 +781,26 @@ def assemble_upload_dir(
     children — switching to copies removes that fragility at the cost
     of ~15 MB of disk per assembly run, which is gitignored anyway.
 
-    Re-running the assembly is idempotent: ``_replace_file`` and
-    ``_replace_dir`` rmtree-then-copy any existing entry.  The README
-    is the one file rewritten on the way in (tree diagram + ``../``
-    links).  ``--dry-run`` skips this whole function.
+    Re-running the assembly is idempotent: ``replace_file`` and
+    ``replace_dir`` (from ``_release_common``) rmtree-then-copy any
+    existing entry.  The README is the one file rewritten on the way
+    in (tree diagram + ``../`` links).  ``--dry-run`` skips this
+    whole function.
     """
 
-    _validate_kaggle_dir_safe(kaggle_dir, release_dir)
+    validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle")
     kaggle_dir.mkdir(parents=True, exist_ok=True)
 
     # Cover image.
     cover_src = release_dir / cover_image.name
     if not cover_src.exists():
         cover_src = cover_image
-    _replace_file(cover_src, kaggle_dir / cover_image.name)
+    replace_file(cover_src, kaggle_dir / cover_image.name)
 
     # LICENSE — straight copy, no rewriting.
     license_src = release_dir / "LICENSE"
     if license_src.exists():
-        _replace_file(license_src, kaggle_dir / "LICENSE")
+        replace_file(license_src, kaggle_dir / "LICENSE")
 
     # README.md — real copy with link rewriting so ``../`` links and
     # the directory diagram resolve correctly on the Kaggle dataset
@@ -873,7 +819,7 @@ def assemble_upload_dir(
     # Per-tier bundles — full directory copies.
     for tier in tiers:
         tier_src = release_dir / tier
-        _replace_dir(tier_src, kaggle_dir / tier)
+        replace_dir(tier_src, kaggle_dir / tier)
 
 
 # ---------------------------------------------------------------------------
@@ -941,9 +887,24 @@ def run_packager(
     errors: list[ValidationError] = []
     errors.extend(validate_metadata(metadata))
     errors.extend(validate_cover_image(cover_image))
-    errors.extend(_validate_readme_substitution(release_dir))
+    errors.extend(validate_readme_substitution(release_dir, packager_name="Kaggle"))
 
     metadata_path = kaggle_dir / "dataset-metadata.json"
+
+    # Validation gate: if anything failed, return errors WITHOUT writing
+    # any artifact.  Writing on validation failure leaves a corrupt
+    # metadata file on disk that can be accidentally committed; the
+    # earlier draft did this and PR 5.2's self-review caught it.  The
+    # CLI converts ``errors`` into rc=1; callers who want a record of
+    # the corrupt artifact can dry-run after fixing inputs.
+    if errors:
+        return PackagerOutcome(
+            metadata=metadata,
+            metadata_path=metadata_path,
+            errors=tuple(errors),
+            assembled=False,
+        )
+
     metadata_path.parent.mkdir(parents=True, exist_ok=True)
     metadata_path.write_text(render_metadata_json(metadata), encoding="utf-8")
 
@@ -953,42 +914,11 @@ def run_packager(
     return PackagerOutcome(
         metadata=metadata,
         metadata_path=metadata_path,
-        errors=tuple(errors),
+        errors=(),
         assembled=not dry_run,
     )
 
 
-def _validate_readme_substitution(release_dir: Path) -> list[ValidationError]:
-    """Guard against silent drift between the README's tree diagram
-    and ``KAGGLE_TREE_BLOCK``.
-
-    ``_kaggle_readme_text`` substitutes the source-repo tree diagram
-    for the upload-tree diagram via plain string replace.  If the
-    README's tree changes by even one whitespace character, the
-    substitution silently no-ops and the published Kaggle dataset
-    card shows the source-repo tree (with ``intermediate_instructor/``,
-    ``notebooks/``, ``validation/``).  We catch that case here.
-    """
-
-    readme = release_dir / "README.md"
-    if not readme.exists():
-        return []  # No README is itself a release-day issue, but not this validator's concern.
-    if KAGGLE_TREE_BLOCK not in readme.read_text(encoding="utf-8"):
-        return [
-            ValidationError(
-                field="release/README.md",
-                message=(
-                    "KAGGLE_TREE_BLOCK not found verbatim in release/README.md; "
-                    "the source-repo tree diagram in the README has drifted from "
-                    "the constant in scripts/package_kaggle_release.py — the "
-                    "Kaggle description rewrite will silently no-op until the "
-                    "README and KAGGLE_TREE_BLOCK are reconciled."
-                ),
-            )
-        ]
-    return []
-
-
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py
index 1328c8d..0d0a9ea 100644
--- a/tests/scripts/test_package_hf_release.py
+++ b/tests/scripts/test_package_hf_release.py
@@ -14,7 +14,7 @@
 * the README link-rewriting that lets the published dataset card on
   HF keep working ``../`` links (rewritten to GitHub blob URLs) and a
   directory diagram that reflects the upload layout, plus a guard
-  that the source ``HF_TREE_BLOCK_SOURCE`` is still present verbatim
+  that the source ``SOURCE_TREE_BLOCK`` (in ``_release_common``) is still present verbatim
   in the README (silent-failure trap; mirrors PR 5.1's KAGGLE block)
 * the assembled upload tree resolves every declared resource path
 * the safety net that refuses to assemble into ``cwd`` /
@@ -47,6 +47,7 @@
 _RELEASE_BUNDLES_PRESENT = (_RELEASE_DIR / "intro" / "manifest.json").exists()
 _INSTRUCTOR_BUNDLE_PRESENT = (_RELEASE_DIR / "intermediate_instructor" / "manifest.json").exists()
 _COMMITTED_README = _REPO_ROOT / "release" / "huggingface" / "README.md"
+_COMMITTED_INSTRUCTOR_README = _REPO_ROOT / "release" / "huggingface-instructor" / "README.md"
 _COMMITTED_COVER = _REPO_ROOT / "release" / "dataset-cover-image.png"
 
 
@@ -120,7 +121,7 @@ def test_validate_card_reports_every_field_violation() -> None:
     assert "configs" in fields
 
 
-def test_validate_card_requires_exactly_one_default(tmp_path: Path) -> None:
+def test_validate_card_requires_exactly_one_default() -> None:
     """G12.2 — exactly one config carries ``default: true``."""
 
     base = _minimal_card()
@@ -184,21 +185,33 @@ def test_validate_card_flags_data_files_without_split_or_path() -> None:
 # ---------------------------------------------------------------------------
 
 
-def test_render_yaml_frontmatter_round_trips_through_pyyaml() -> None:
+@pytest.mark.parametrize(
+    "pretty_name",
+    [
+        packager.DEFAULT_PRETTY_NAME,
+        # Em-dash flavor — matches the instructor variant; locks down
+        # PyYAML round-tripping for non-ASCII content (regression
+        # guard for the PR 5.2 self-review note about the hand-rolled
+        # renderer's incomplete coverage).
+        packager.DEFAULT_INSTRUCTOR_PRETTY_NAME,
+    ],
+)
+def test_render_yaml_frontmatter_round_trips_through_pyyaml(pretty_name: str) -> None:
     """The rendered YAML must parse cleanly via ``yaml.safe_load``.
 
-    HF parses dataset-card frontmatter with PyYAML; if our hand-rolled
-    renderer drifts from valid YAML, the dataset card silently drops
-    its metadata on the HF page.
+    HF parses dataset-card frontmatter with PyYAML; if our renderer
+    drifts from valid YAML, the dataset card silently drops its
+    metadata on the HF page.
     """
 
-    card = _minimal_card()
+    base = _minimal_card()
+    card = packager.HuggingFaceCard(**{**base.__dict__, "pretty_name": pretty_name})
     yaml_text = packager.render_yaml_frontmatter(card)
     # ``render_yaml_frontmatter`` includes the leading and trailing
     # ``---`` markers.  Strip them before feeding to safe_load.
     inner = yaml_text.strip().strip("-").strip()
     parsed = yaml.safe_load(inner)
-    assert parsed["pretty_name"] == card.pretty_name
+    assert parsed["pretty_name"] == pretty_name
     assert parsed["license"] == "mit"
     assert parsed["language"] == ["en"]
     assert parsed["task_categories"] == [packager.HF_TASK_CATEGORY]
@@ -216,22 +229,8 @@ def test_render_yaml_tags_sorted_at_render_time() -> None:
         **{**base.__dict__, "tags": ("zebra", "alpha", "mango")},
     )
     rendered = packager.render_yaml_frontmatter(shuffled)
-    # Tag block lines, in the rendered order.
-    tag_lines = [
-        line.strip().lstrip("- ") for line in rendered.splitlines() if line.startswith("  - ")
-    ]
-    # The first three list-item lines are language, task_categories,
-    # size_categories — find tag block by position after ``tags:``.
-    lines = rendered.splitlines()
-    tags_idx = lines.index("tags:")
-    block: list[str] = []
-    for line in lines[tags_idx + 1 :]:
-        if line.startswith("  - "):
-            block.append(line[4:].strip())
-        else:
-            break
-    assert block == ["alpha", "mango", "zebra"]
-    _ = tag_lines  # silence linter
+    parsed = yaml.safe_load(rendered.strip().strip("-").strip())
+    assert parsed["tags"] == ["alpha", "mango", "zebra"]
 
 
 # ---------------------------------------------------------------------------
@@ -240,9 +239,9 @@ def test_render_yaml_tags_sorted_at_render_time() -> None:
 
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
-def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None:
+def test_hf_public_readme_text_rewrites_links_and_tree_diagram() -> None:
     readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8")
-    rewritten = packager._hf_readme_text(readme)
+    rewritten = packager._hf_public_readme_text(readme)
 
     # Source-repo tree → upload tree.
     assert "intermediate_instructor/" not in rewritten
@@ -260,10 +259,10 @@ def test_hf_readme_text_rewrites_links_and_tree_diagram() -> None:
 
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
-def test_hf_tree_block_source_present_in_release_readme() -> None:
+def test_source_tree_block_is_present_in_release_readme() -> None:
     """Silent-failure guard.
 
-    ``_hf_readme_text`` substitutes ``HF_TREE_BLOCK_SOURCE`` →
+    ``_hf_public_readme_text`` substitutes ``SOURCE_TREE_BLOCK`` →
     ``HF_UPLOAD_TREE_BLOCK`` via plain string replace.  If anyone
     tweaks the README's tree diagram by even one whitespace
     character, the substitution silently no-ops and the published HF
@@ -272,28 +271,28 @@ def test_hf_tree_block_source_present_in_release_readme() -> None:
     """
 
     readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8")
-    assert packager.HF_TREE_BLOCK_SOURCE in readme, (
-        "scripts/package_hf_release.py HF_TREE_BLOCK_SOURCE no longer matches "
+    assert packager.SOURCE_TREE_BLOCK in readme, (
+        "scripts/_release_common.py SOURCE_TREE_BLOCK no longer matches "
         "the tree diagram in release/README.md — reconcile the two before "
         "the next HF README regeneration."
     )
 
 
 def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None:
-    """``_validate_readme_substitution`` is wired into the run-time
-    validator, not just the static guard above."""
+    """``validate_readme_substitution`` (now in ``_release_common``) is
+    wired into the run-time validator, not just the static guard above."""
 
     fake_release = tmp_path / "release"
     fake_release.mkdir()
     (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8")
-    errors = packager._validate_readme_substitution(fake_release)
+    errors = packager.validate_readme_substitution(fake_release, packager_name="HF")
     assert errors
     assert errors[0].field == "release/README.md"
-    assert "HF_TREE_BLOCK_SOURCE" in errors[0].message
+    assert "SOURCE_TREE_BLOCK" in errors[0].message
 
     if _RELEASE_BUNDLES_PRESENT:
         # Sanity: the real release README does NOT trigger the validator.
-        assert packager._validate_readme_substitution(_RELEASE_DIR) == []
+        assert packager.validate_readme_substitution(_RELEASE_DIR, packager_name="HF") == []
 
 
 # ---------------------------------------------------------------------------
@@ -307,9 +306,9 @@ def test_assemble_upload_dir_rejects_unsafe_huggingface_dir(tmp_path: Path) -> N
     fake_release = tmp_path / "release"
     fake_release.mkdir()
     with pytest.raises(ValueError, match="unsafe"):
-        packager.assemble_upload_dir(fake_release, fake_release)
+        packager.assemble_upload_dir(fake_release, fake_release, rendered_readme="")
     with pytest.raises(ValueError, match="unsafe"):
-        packager.assemble_upload_dir(fake_release, fake_release.parent)
+        packager.assemble_upload_dir(fake_release, fake_release.parent, rendered_readme="")
 
 
 def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd(
@@ -328,7 +327,48 @@ def test_assemble_upload_dir_rejects_huggingface_dir_equal_to_cwd(
     cwd.mkdir()
     monkeypatch.chdir(cwd)
     with pytest.raises(ValueError, match="unsafe"):
-        packager.assemble_upload_dir(fake_release, cwd)
+        packager.assemble_upload_dir(fake_release, cwd, rendered_readme="")
+
+
+def test_assemble_upload_dir_rejects_descendant_of_release_dir(tmp_path: Path) -> None:
+    """Refuse to assemble into a strict descendant of ``release_dir``.
+
+    Per PR 5.2 self-review #3: ``--huggingface-dir release/intro``
+    would otherwise rmtree the intro tier bundle.  Only direct
+    children of ``release_dir`` are allowed (the canonical
+    ``release/huggingface``, ``release/huggingface-instructor``,
+    ``release/kaggle`` shapes).
+    """
+
+    fake_release = tmp_path / "release"
+    (fake_release / "intro" / "tables").mkdir(parents=True)
+    deep = fake_release / "intro" / "tables"  # 2 levels under release_dir
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.assemble_upload_dir(fake_release, deep, rendered_readme="")
+
+
+def test_assemble_upload_dir_allows_canonical_child(tmp_path: Path) -> None:
+    """Direct-child of release_dir IS the canonical safe location.
+
+    ``release/huggingface`` is allowed; only deeper nesting trips the
+    descendant guard.
+    """
+
+    fake_release = tmp_path / "release"
+    fake_release.mkdir()
+    safe_child = fake_release / "huggingface"
+    # Should not raise.  The function may still fail later because the
+    # source bundles aren't present, but the safety guard must let it
+    # through.
+    try:
+        packager.assemble_upload_dir(fake_release, safe_child, rendered_readme="")
+    except ValueError as exc:
+        if "unsafe" in str(exc):
+            raise AssertionError(f"safety guard rejected canonical child: {exc}") from exc
+    except FileNotFoundError:
+        # Expected — fake_release has no tier bundles, so the copytree
+        # call fails after the safety check.  That's the right shape.
+        pass
 
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
@@ -358,7 +398,7 @@ def test_assembled_upload_dir_resolves_every_declared_data_file(tmp_path: Path)
 
     # Top-level required artefacts.
     assert (upload_dir / "README.md").is_file()
-    assert (upload_dir / packager.COVER_IMAGE_FILENAME).is_file()
+    assert (upload_dir / _COMMITTED_COVER.name).is_file()
     assert (upload_dir / "LICENSE").is_file()
 
 
@@ -402,6 +442,56 @@ def test_main_reports_missing_release_dir(
     assert "release directory not found" in captured.err
 
 
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None:
+    """Validation failure must NOT leave a corrupt README on disk
+    (PR 5.2 self-review #1).
+
+    Forces a validation failure by passing a cover image that is too
+    small.  Asserts the README path doesn't materialise and
+    ``outcome.errors`` is populated.
+    """
+
+    tiny_cover = tmp_path / "tiny.png"
+    from PIL import Image
+
+    Image.new("RGB", (10, 10), (0, 0, 0)).save(tiny_cover)
+    out_dir = tmp_path / "huggingface"
+    outcome = packager.run_packager(_RELEASE_DIR, huggingface_dir=out_dir, cover_image=tiny_cover)
+    assert outcome.errors, "expected at least one validation error"
+    assert not (out_dir / "README.md").exists(), "README must not be written when validation fails"
+    assert outcome.assembled is False
+
+
+def test_main_rejects_default_config_with_instructor_variant(
+    tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """``--default-config`` is meaningless for ``--variant=instructor``
+    (only one config); silently accepting it is a misconfiguration
+    (PR 5.2 self-review #10).
+    """
+
+    rc = packager.main(
+        [
+            "--release-dir",
+            str(_RELEASE_DIR),
+            "--huggingface-dir",
+            str(tmp_path / "hf"),
+            "--variant",
+            "instructor",
+            "--default-config",
+            "advanced",
+            "--cover-image",
+            str(_COMMITTED_COVER),
+            "--dry-run",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert rc == 2
+    assert "--default-config" in captured.err
+    assert "instructor" in captured.err
+
+
 # ---------------------------------------------------------------------------
 # Determinism + sync with committed artefact
 # ---------------------------------------------------------------------------
@@ -473,6 +563,32 @@ def test_committed_hf_readme_matches_fresh_regeneration(tmp_path: Path) -> None:
     assert packager.GITHUB_BLOB_BASE in body
 
 
+@pytest.mark.skipif(
+    not (_INSTRUCTOR_BUNDLE_PRESENT and _COMMITTED_INSTRUCTOR_README.exists()),
+    reason="instructor bundle or committed instructor README missing",
+)
+def test_committed_instructor_readme_matches_fresh_regeneration(tmp_path: Path) -> None:
+    """Audit-artifact-sync for the instructor companion.
+
+    A fresh instructor regeneration must match
+    ``release/huggingface-instructor/README.md`` byte-for-byte.  Locks
+    down the dedicated ``INSTRUCTOR_BODY`` constant introduced in PR
+    5.2 self-review #2.
+    """
+
+    fresh_dir = tmp_path / "huggingface-instructor"
+    packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=fresh_dir,
+        variant="instructor",
+        cover_image=_COMMITTED_COVER,
+        dry_run=True,
+    )
+    fresh_bytes = (fresh_dir / "README.md").read_bytes()
+    committed_bytes = _COMMITTED_INSTRUCTOR_README.read_bytes()
+    assert fresh_bytes == committed_bytes
+
+
 # ---------------------------------------------------------------------------
 # Instructor companion (G12.4)
 # ---------------------------------------------------------------------------
@@ -500,10 +616,21 @@ def test_run_packager_instructor_variant_packages_independently(tmp_path: Path)
     # ``intermediate/`` in the upload tree.
     for df in only.data_files:
         assert (upload_dir / df.path).is_file()
-    # Instructor body uses the instructor tree block.
+    # Instructor body is the dedicated INSTRUCTOR_BODY constant, not
+    # the public README inlined verbatim — locks down PR 5.2 self-
+    # review fix #2 (3-tier prose was leaking into the 1-tier card).
     body = outcome.card.body
-    assert "metadata/" in body  # instructor metadata sibling visible
-    assert "9 full-horizon tables" in body
+    assert body is packager.INSTRUCTOR_BODY
+    # Public-tier names must NOT appear in the instructor body.
+    assert "intro" not in body.lower().split()  # word boundary, not substring
+    # The instructor body talks about the instructor companion role.
+    assert "Instructor companion" in body
+    assert "redaction" in body.lower()
+    assert "metadata/world_spec" in body
+    # Tree block reflects the instructor (single-tier) layout.
+    assert "intermediate/" in body
+    # No leakage of public tree elements.
+    assert "intro/ intermediate/ advanced/" not in body
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py
index 351950b..3790b5f 100644
--- a/tests/scripts/test_package_kaggle_release.py
+++ b/tests/scripts/test_package_kaggle_release.py
@@ -7,8 +7,8 @@
 * the README link-rewriting that lets the published dataset card on
   Kaggle keep working ``../`` links (rewritten to GitHub blob URLs)
   and a directory diagram that reflects the upload layout, plus a
-  guard that the source ``KAGGLE_TREE_BLOCK`` is still present
-  verbatim in the README (silent-failure trap)
+  guard that the source ``SOURCE_TREE_BLOCK`` (in ``_release_common``)
+  is still present verbatim in the README (silent-failure trap)
 * the assembled upload tree resolves every declared resource path
   (so ``kaggle datasets create`` can find each file)
 * the safety net that refuses to assemble into ``cwd`` /
@@ -224,10 +224,10 @@ def test_kaggle_readme_text_rewrites_links_and_tree_diagram() -> None:
 
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
-def test_kaggle_tree_block_is_present_in_release_readme() -> None:
+def test_source_tree_block_is_present_in_release_readme() -> None:
     """Silent-failure guard.
 
-    ``_kaggle_readme_text`` substitutes ``KAGGLE_TREE_BLOCK`` →
+    ``_kaggle_readme_text`` substitutes ``SOURCE_TREE_BLOCK`` →
     ``KAGGLE_UPLOAD_TREE_BLOCK`` via plain string replace.  If anyone
     tweaks the README's tree diagram by even one whitespace
     character, the substitution silently no-ops and the published
@@ -236,8 +236,8 @@ def test_kaggle_tree_block_is_present_in_release_readme() -> None:
     """
 
     readme = (_RELEASE_DIR / "README.md").read_text(encoding="utf-8")
-    assert packager.KAGGLE_TREE_BLOCK in readme, (
-        "scripts/package_kaggle_release.py KAGGLE_TREE_BLOCK no longer matches "
+    assert packager.SOURCE_TREE_BLOCK in readme, (
+        "scripts/_release_common.py SOURCE_TREE_BLOCK no longer matches "
         "the tree diagram in release/README.md — reconcile the two before "
         "the next release-metadata regeneration."
     )
@@ -245,19 +245,19 @@ def test_kaggle_tree_block_is_present_in_release_readme() -> None:
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
 def test_validate_readme_substitution_flags_drift(tmp_path: Path) -> None:
-    """``_validate_readme_substitution`` is wired into the run-time
-    validator, not just the static guard above."""
+    """``validate_readme_substitution`` (now in ``_release_common``) is
+    wired into the run-time validator, not just the static guard above."""
 
     fake_release = tmp_path / "release"
     fake_release.mkdir()
     (fake_release / "README.md").write_text("# Some unrelated README\n", encoding="utf-8")
-    errors = packager._validate_readme_substitution(fake_release)
+    errors = packager.validate_readme_substitution(fake_release, packager_name="Kaggle")
     assert errors
     assert errors[0].field == "release/README.md"
-    assert "KAGGLE_TREE_BLOCK" in errors[0].message
+    assert "SOURCE_TREE_BLOCK" in errors[0].message
 
     # Sanity: the real release README does NOT trigger the validator.
-    assert packager._validate_readme_substitution(_RELEASE_DIR) == []
+    assert packager.validate_readme_substitution(_RELEASE_DIR, packager_name="Kaggle") == []
 
 
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
@@ -368,6 +368,27 @@ def test_assemble_upload_dir_idempotent_against_existing_tree(tmp_path: Path) ->
 # ---------------------------------------------------------------------------
 
 
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None:
+    """A failed validation must NOT leave a corrupt metadata file on
+    disk (PR 5.2 self-review fix).
+
+    Forces a validation failure by passing a cover image that is too
+    small for Kaggle's 560×280 floor.  Asserts the metadata path
+    doesn't materialise and ``outcome.errors`` is populated.
+    """
+
+    tiny_cover = tmp_path / "tiny.png"
+    Image.new("RGB", (10, 10), (0, 0, 0)).save(tiny_cover)
+    out_dir = tmp_path / "kaggle"
+    outcome = packager.run_packager(_RELEASE_DIR, kaggle_dir=out_dir, cover_image=tiny_cover)
+    assert outcome.errors, "expected at least one validation error"
+    assert not (out_dir / "dataset-metadata.json").exists(), (
+        "metadata file must not be written when validation fails"
+    )
+    assert outcome.assembled is False
+
+
 def test_main_reports_missing_release_dir(
     tmp_path: Path, capsys: pytest.CaptureFixture[str]
 ) -> None:

From 42260a80e9dbea4d12ae725e7912741a89355c6a Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Thu, 7 May 2026 00:03:32 +0300
Subject: [PATCH 3/3] PR 5.2 Copilot-review fixes (Kaggle + HF packagers)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fold Copilot's two real findings on the self-review revision back in.

COPILOT-1 — validate_upload_dir_safe was only invoked inside
assemble_upload_dir, which --dry-run skips.  A dry-run with
--huggingface-dir release (or .) would write the README into the
unsafe path BEFORE the safety net fired.  Hoist the check into
run_packager (both packagers) so it runs before any mkdir or write;
the inner assemble_upload_dir call stays as defence-in-depth for
direct callers.  New tests: dry-run with unsafe upload-dir raises
without writing; the same path through main() returns rc=2.

COPILOT-2 — Cover-image path resolution was inconsistent:
validate_cover_image used cover_image as passed, while
assemble_upload_dir did a separate ``release_dir / cover_image.name``
fallback.  Diverged for bare-basename inputs (false validation
failures) and two-paths-sharing-a-basename (assembler shadowing the
explicit path).  Added resolve_cover_image_path() to
_release_common.py (explicit-wins, release-dir fallback);
run_packager calls it once and threads the resolved path through
validation, the metadata's image field, and assembly.  New
tests/scripts/test_release_common.py covers the four resolution
branches; new packager-side tests confirm bare-basename success +
metadata field plumbing.

COPILOT-3 — outdated; already addressed by self-review fix #8 in
commit f2fc4a2.  Resolved as already treated; no code change.

Verification: 1232/1232 tests pass + 5 gated skips; ruff + mypy
clean; hash determinism PASS 67/67; leakage probes rc=0 on every
tier; validate_release_candidate --no-rebuild exits 0;
BUNDLE_SCHEMA_VERSION unchanged at 5.
release/{kaggle,huggingface,huggingface-instructor}/* artifacts
regenerated byte-identically.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .agent-plan.md                               |   1 +
 scripts/_release_common.py                   |  31 ++++++
 scripts/package_hf_release.py                |  31 ++++--
 scripts/package_kaggle_release.py            |  31 ++++--
 tests/scripts/test_package_hf_release.py     |  89 +++++++++++++++++
 tests/scripts/test_package_kaggle_release.py |  54 ++++++++++
 tests/scripts/test_release_common.py         | 100 +++++++++++++++++++
 7 files changed, 321 insertions(+), 16 deletions(-)
 create mode 100644 tests/scripts/test_release_common.py

diff --git a/.agent-plan.md b/.agent-plan.md
index 6e8c359..6ee4d2d 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -54,6 +54,7 @@ Goal: ship a best-in-class educational synthetic CRM lead-scoring dataset family
 - [x] PR 5.2: `scripts/_release_common.py` (new) — extracted shared release-packaging primitives so the Kaggle + HF packagers share one source of truth: `GITHUB_BLOB_BASE`, the `](../foo)` regex, the validation-report blob-URL substitution, `validate_cover_image` (dimension floor; HF reuses Kaggle's 560×280), and `validate_upload_dir_safe` (refuses to assemble into `cwd` / `release_dir` / its parent / the filesystem anchor).  `FieldDescriptor` / `ResourceSchema` / dtype maps are intentionally NOT extracted — HF infers schema from parquet via `load_dataset` and does not need a Frictionless declaration.  `scripts/package_kaggle_release.py` refactored to import these primitives; PR 5.1's external behaviour and committed `release/kaggle/dataset-metadata.json` are byte-stable across the refactor.
 - [x] PR 5.2: Upload-dir assembly under `release/huggingface/` and `release/huggingface-instructor/` uses real-file copies (not symlinks; mirrors PR 5.1 — the `datasets` library walks the upload tree and silently skips broken symlinks in some versions).  Cover image at `release/dataset-cover-image.png` is reused for the HF thumbnail.  `release/huggingface/*` and `release/huggingface-instructor/*` are gitignored except for `README.md` — only the dataset card is committed; the upload tree is reassembled on demand.
 - [x] PR 5.2: `release/HF_DATASET_CARD.md` (legacy single-file stub) deleted — superseded by the generated `release/huggingface/README.md`.
+- [x] PR 5.2 (Copilot review pass): folded Copilot's two real findings on the self-review revision back in before requesting human review.  (COPILOT-1) `validate_upload_dir_safe` was only called inside `assemble_upload_dir`, which `--dry-run` skips — a user passing `--huggingface-dir release` (or `.`, etc.) in dry-run mode would write a README into the unsafe path before the safety net fired.  Hoisted the check into `run_packager` (both packagers) so it runs BEFORE any mkdir or write; the inner `assemble_upload_dir` call stays as defence-in-depth for callers that bypass `run_packager`.  (COPILOT-2) Cover-image path resolution was inconsistent: `validate_cover_image` used `cover_image` as passed, while `assemble_upload_dir` did its own ``release_dir / cover_image.name`` fallback — diverged for bare-basename inputs (false validation failures) and for two-paths-sharing-a-basename inputs (assembler could shadow the explicit path).  Added `resolve_cover_image_path` to `scripts/_release_common.py` (explicit-wins, with release-dir fallback for bare basenames); `run_packager` calls it once and threads the resolved path through validation, metadata's `image` field, and assembly so every consumer agrees.  (COPILOT-3) outdated docstring: `assemble_upload_dir` no longer claims to write the README it doesn't write — already addressed by self-review fix #8 in commit f2fc4a2; resolved as already treated.  Net: 1232/1232 tests pass + 5 gated skips (4 + new resolver coverage in `tests/scripts/test_release_common.py`); ruff + mypy clean; hash determinism PASS 67/67; leakage probes 0/3 reconstruct on every tier; `validate_release_candidate --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5.
 - [x] PR 5.2 (self-review pass): brutal review of the first revision caught real bugs the reviewer would otherwise have to call out. Fixes folded into the PR before review:  (#1) `run_packager` validate→write order — both packagers were writing the README/metadata even when validation failed; the validation gate now early-returns with `errors` populated and zero artifacts on disk; new tests exercise the no-write path on both sides.  (#2) Instructor README was inlining the public 3-tier README for a 1-tier dataset; replaced with a dedicated `INSTRUCTOR_BODY` constant that opens by linking to the public dataset, describes only the instructor-specific additions (full-horizon tables, hidden DAG, latent registry, mechanism summary), and uses the single-tier tree block.  (#3) `validate_upload_dir_safe` now also rejects strict descendants of `release_dir` (e.g. `--huggingface-dir release/intro` would otherwise rmtree the intro bundle); allow-list keeps the canonical `release/{kaggle,huggingface,huggingface-instructor}` direct-children.  (#4) `[publish]` extra in `pyproject.toml` (`datasets>=2.14`, `kaggle>=1.6`) makes the gated `load_dataset()` / Kaggle-CLI tests installable in a single command — closes the "G12.3/G12.4 untested in CI" gap to a one-line install.  (#5) Shared-primitives extraction finished: `SOURCE_TREE_BLOCK`, `validate_readme_substitution`, `replace_file`, `replace_dir`, `load_manifest` all moved to `scripts/_release_common.py`; both packagers reduced to imports.  (#6) Hand-rolled YAML renderer (60 lines + brittle quoting heuristic) replaced with `yaml.safe_dump` + a 4-line `_IndentedDumper` subclass that forces indent-2 on top-level sequences.  (#7) Dead `--owner` / `--dataset-slug` CLI flags removed (PR 7.2 will add them when actually needed).  (#8) `assemble_upload_dir` now takes `rendered_readme` as a parameter and writes it itself; the public name no longer lies about producing a complete tree.  (#9) `build_config_for_tier` made pure (no I/O); `_assert_tier_dir_exists` does the cheap manifest-stat preflight.  (#10) `--default-config` with `--variant=instructor` now errors instead of silently ignoring.  (#11) Instructor tree-diagram drops the hardcoded "9 tables" claim.  (#13–#16) Visual cleanups (duplicate divider, ruff-split imports, `COVER_IMAGE_FILENAME`-vs-`Path.name` redundancy, speculative comment about HF split rename).  (#17) Test cruft removed (unused `tmp_path`, dead `tag_lines`); em-dash YAML round-trip parametrised for the instructor `pretty_name`.  Net: 1223/1223 tests pass + 5 gated skips (4 `datasets`-SDK round-trip + 1 Kaggle `kaggle`-SDK from PR 5.1); ruff + mypy clean; `scripts/probe_relational_leakage.py release/{intro,intermediate,advanced} --max-accuracy 0.65` exits 0 on every public tier; `scripts/verify_hash_determinism.py` PASS 67/67; `scripts/validate_release_candidate.py --no-rebuild` exits 0; `BUNDLE_SCHEMA_VERSION` unchanged at 5 (this PR doesn't touch the bundle shape).
 
 ### Phase 6 — Notebook sequence + adversarial framing
diff --git a/scripts/_release_common.py b/scripts/_release_common.py
index c57e71e..3a59858 100644
--- a/scripts/_release_common.py
+++ b/scripts/_release_common.py
@@ -146,6 +146,37 @@ def validate_readme_substitution(release_dir: Path, *, packager_name: str) -> li
 COVER_IMAGE_MIN_HEIGHT: Final[int] = 280
 
 
+def resolve_cover_image_path(cover_image: Path, release_dir: Path) -> Path:
+    """Resolve the effective cover-image path used by the packagers.
+
+    The packagers run a validator against the cover image and a copy
+    step against the same image; the earlier draft did the path
+    resolution in the assembler only, which meant validate + assemble
+    could disagree about WHICH file to use (Copilot review on PR #72,
+    addressed in self-review pass #2).
+
+    Resolution rule — explicit-wins, with a release-dir fallback:
+
+    1. If ``cover_image`` exists at the path given, return it as-is.
+    2. Otherwise, if ``release_dir / cover_image.name`` exists, return
+       that — lets users pass a bare basename when the file lives
+       under ``release/``.
+    3. Otherwise return ``cover_image`` unchanged so the cover-image
+       validator surfaces a clean ``not found at <path>`` error.
+
+    Two paths sharing a basename DO NOT cause the release-dir copy to
+    silently shadow the explicit path — the explicit path's existence
+    is the first check and wins.
+    """
+
+    if cover_image.exists():
+        return cover_image
+    fallback = release_dir / cover_image.name
+    if fallback.exists():
+        return fallback
+    return cover_image
+
+
 def validate_cover_image(path: Path) -> list[ValidationError]:
     """Validate that ``path`` exists and meets the dimension floor."""
 
diff --git a/scripts/package_hf_release.py b/scripts/package_hf_release.py
index 382bed5..31ca3fd 100644
--- a/scripts/package_hf_release.py
+++ b/scripts/package_hf_release.py
@@ -63,6 +63,7 @@
     load_manifest,
     replace_dir,
     replace_file,
+    resolve_cover_image_path,
     rewrite_release_links,
     validate_cover_image,
     validate_readme_substitution,
@@ -685,12 +686,11 @@ def assemble_upload_dir(
         readme_path.unlink()
     readme_path.write_text(rendered_readme, encoding="utf-8")
 
-    # Cover image — reuse the one ``release/`` already ships.
-    cover_src = release_dir / cover_image.name
-    if not cover_src.exists():
-        cover_src = cover_image
-    if cover_src.exists():
-        replace_file(cover_src, upload_dir / cover_image.name)
+    # Cover image — copy the resolved path as-is.  Path resolution
+    # happens once in ``run_packager`` via ``resolve_cover_image_path``
+    # so the validator and the assembler agree on which file to use.
+    if cover_image.exists():
+        replace_file(cover_image, upload_dir / cover_image.name)
 
     # LICENSE — straight copy, no rewriting.
     license_src = release_dir / "LICENSE"
@@ -754,6 +754,21 @@ def run_packager(
     if not release_dir.exists():
         raise FileNotFoundError(f"release directory not found: {release_dir}")
 
+    # Hoist the upload-dir safety check BEFORE any mkdir or write —
+    # including the dry-run path (Copilot review on PR #72).  The
+    # earlier draft only checked inside ``assemble_upload_dir``, so a
+    # dry run with ``--huggingface-dir .`` would happily write a
+    # README into ``cwd`` before the safety guard ever ran.
+    # ``assemble_upload_dir`` retains its own call as defence-in-depth
+    # for callers that bypass ``run_packager``.
+    kind = "huggingface" if variant == "public" else "huggingface-instructor"
+    validate_upload_dir_safe(huggingface_dir, release_dir, kind=kind)
+
+    # Resolve cover image once — the same path is used for validation
+    # and assembly so the two cannot disagree (Copilot review on
+    # PR #72).
+    resolved_cover = resolve_cover_image_path(cover_image, release_dir)
+
     card = build_card(
         release_dir,
         variant=variant,
@@ -764,7 +779,7 @@ def run_packager(
 
     errors: list[ValidationError] = []
     errors.extend(validate_card(card))
-    errors.extend(validate_cover_image(cover_image))
+    errors.extend(validate_cover_image(resolved_cover))
     if variant == "public":
         # The instructor body doesn't substitute SOURCE_TREE_BLOCK —
         # it's a self-contained markdown — so the substitution guard
@@ -795,7 +810,7 @@ def run_packager(
             huggingface_dir,
             variant=variant,
             rendered_readme=rendered,
-            cover_image=cover_image,
+            cover_image=resolved_cover,
         )
 
     return PackagerOutcome(
diff --git a/scripts/package_kaggle_release.py b/scripts/package_kaggle_release.py
index c83d5c9..2de9401 100644
--- a/scripts/package_kaggle_release.py
+++ b/scripts/package_kaggle_release.py
@@ -72,6 +72,7 @@
     load_manifest,
     replace_dir,
     replace_file,
+    resolve_cover_image_path,
     rewrite_release_links,
     validate_cover_image,
     validate_readme_substitution,
@@ -791,11 +792,11 @@ def assemble_upload_dir(
     validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle")
     kaggle_dir.mkdir(parents=True, exist_ok=True)
 
-    # Cover image.
-    cover_src = release_dir / cover_image.name
-    if not cover_src.exists():
-        cover_src = cover_image
-    replace_file(cover_src, kaggle_dir / cover_image.name)
+    # Cover image — copy the resolved path as-is.  Path resolution
+    # happens once in ``run_packager`` via ``resolve_cover_image_path``
+    # so the validator and the assembler agree on which file to use.
+    if cover_image.exists():
+        replace_file(cover_image, kaggle_dir / cover_image.name)
 
     # LICENSE — straight copy, no rewriting.
     license_src = release_dir / "LICENSE"
@@ -868,6 +869,20 @@ def run_packager(
     if not release_dir.exists():
         raise FileNotFoundError(f"release directory not found: {release_dir}")
 
+    # Hoist the upload-dir safety check BEFORE any mkdir or write —
+    # including the dry-run path (Copilot review on PR #72).  The
+    # earlier draft only checked inside ``assemble_upload_dir``, so a
+    # dry run with ``--kaggle-dir .`` would happily write
+    # ``dataset-metadata.json`` into ``cwd`` before the safety guard
+    # ever ran.  ``assemble_upload_dir`` retains its own call as
+    # defence-in-depth for callers that bypass ``run_packager``.
+    validate_upload_dir_safe(kaggle_dir, release_dir, kind="kaggle")
+
+    # Resolve cover image once — the same path is used for validation,
+    # the metadata's ``image`` field, and the upload-tree copy so they
+    # cannot disagree (Copilot review on PR #72).
+    resolved_cover = resolve_cover_image_path(cover_image, release_dir)
+
     metadata = build_metadata(
         release_dir,
         tiers=tiers,
@@ -881,12 +896,12 @@ def run_packager(
         license_name=license_name,
         update_frequency=update_frequency,
         user_sources=user_sources,
-        cover_image=cover_image,
+        cover_image=resolved_cover,
     )
 
     errors: list[ValidationError] = []
     errors.extend(validate_metadata(metadata))
-    errors.extend(validate_cover_image(cover_image))
+    errors.extend(validate_cover_image(resolved_cover))
     errors.extend(validate_readme_substitution(release_dir, packager_name="Kaggle"))
 
     metadata_path = kaggle_dir / "dataset-metadata.json"
@@ -909,7 +924,7 @@ def run_packager(
     metadata_path.write_text(render_metadata_json(metadata), encoding="utf-8")
 
     if not dry_run:
-        assemble_upload_dir(release_dir, kaggle_dir, tiers=tiers, cover_image=cover_image)
+        assemble_upload_dir(release_dir, kaggle_dir, tiers=tiers, cover_image=resolved_cover)
 
     return PackagerOutcome(
         metadata=metadata,
diff --git a/tests/scripts/test_package_hf_release.py b/tests/scripts/test_package_hf_release.py
index 0d0a9ea..f5bfc2a 100644
--- a/tests/scripts/test_package_hf_release.py
+++ b/tests/scripts/test_package_hf_release.py
@@ -442,6 +442,95 @@ def test_main_reports_missing_release_dir(
     assert "release directory not found" in captured.err
 
 
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_rejects_unsafe_huggingface_dir_in_dry_run(tmp_path: Path) -> None:
+    """Copilot review on PR #72 item #1.
+
+    The earlier draft only checked ``--huggingface-dir`` safety
+    inside ``assemble_upload_dir``, which the dry-run path skips.
+    A user passing ``--huggingface-dir release`` (i.e. ``release_dir``
+    itself) in dry-run mode would write a README into ``release/``
+    before the safety net fired.
+
+    With the hoisted check, dry-run mode also raises ``ValueError``
+    BEFORE any mkdir or write.
+    """
+
+    sentinel = _RELEASE_DIR / "README.md"
+    pre = sentinel.read_bytes() if sentinel.exists() else None
+
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.run_packager(
+            _RELEASE_DIR,
+            huggingface_dir=_RELEASE_DIR,  # release_dir itself — unsafe
+            cover_image=_COMMITTED_COVER,
+            dry_run=True,
+        )
+
+    # The release README must not have been clobbered by a write that
+    # raced past the (previously absent) safety check.
+    if pre is not None:
+        assert sentinel.read_bytes() == pre
+
+
+def test_main_rejects_unsafe_huggingface_dir_via_cli(
+    tmp_path: Path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """Same as the test above but via the ``main()`` CLI entry point.
+
+    Confirms the CLI returns rc=2 for an unsafe ``--huggingface-dir``
+    in dry-run mode (the FileNotFoundError / ValueError handler in
+    ``main`` catches the hoisted ValueError).
+    """
+
+    fake_release = tmp_path / "release"
+    fake_release.mkdir()
+    rc = packager.main(
+        [
+            "--release-dir",
+            str(fake_release),
+            "--huggingface-dir",
+            str(fake_release),  # release_dir itself
+            "--cover-image",
+            str(_COMMITTED_COVER) if _COMMITTED_COVER.exists() else str(tmp_path / "cover.png"),
+            "--dry-run",
+        ]
+    )
+    captured = capsys.readouterr()
+    assert rc == 2
+    assert "unsafe" in captured.err
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_resolves_cover_image_via_release_fallback(tmp_path: Path) -> None:
+    """Copilot review on PR #72 item #2.
+
+    A user passing a bare basename that exists under ``release/`` but
+    not at cwd should succeed: ``resolve_cover_image_path`` falls back
+    to the release-dir sibling, and the validator + assembler both
+    use that resolved path.
+    """
+
+    upload_dir = tmp_path / "huggingface"
+    bare = Path(_COMMITTED_COVER.name)  # bare basename
+
+    # Sanity: the bare path doesn't exist at cwd (or wherever pytest
+    # runs from); it MUST resolve via the release-dir fallback.
+    assert not bare.is_absolute()
+
+    outcome = packager.run_packager(
+        _RELEASE_DIR,
+        huggingface_dir=upload_dir,
+        cover_image=bare,
+        dry_run=True,
+    )
+    assert outcome.errors == ()
+    # The README mentions the cover image filename — a smoke check
+    # that the resolved path went through the rest of the pipeline.
+    written = (upload_dir / "README.md").read_text(encoding="utf-8")
+    assert _COMMITTED_COVER.name in written
+
+
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
 def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None:
     """Validation failure must NOT leave a corrupt README on disk
diff --git a/tests/scripts/test_package_kaggle_release.py b/tests/scripts/test_package_kaggle_release.py
index 3790b5f..8140c23 100644
--- a/tests/scripts/test_package_kaggle_release.py
+++ b/tests/scripts/test_package_kaggle_release.py
@@ -368,6 +368,60 @@ def test_assemble_upload_dir_idempotent_against_existing_tree(tmp_path: Path) ->
 # ---------------------------------------------------------------------------
 
 
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_rejects_unsafe_kaggle_dir_in_dry_run(tmp_path: Path) -> None:
+    """Copilot review on PR #72 item #1.
+
+    The earlier draft only checked ``--kaggle-dir`` safety inside
+    ``assemble_upload_dir``, which dry-run skips.  A user passing
+    ``--kaggle-dir release`` (i.e. ``release_dir`` itself) in dry-run
+    would write ``dataset-metadata.json`` into ``release/`` before
+    the safety net fired.  With the hoisted check, dry-run also
+    raises ``ValueError`` BEFORE any mkdir or write.
+    """
+
+    cover = tmp_path / "cover.png"
+    Image.new("RGB", (1280, 640), (0, 0, 0)).save(cover)
+
+    with pytest.raises(ValueError, match="unsafe"):
+        packager.run_packager(
+            _RELEASE_DIR,
+            kaggle_dir=_RELEASE_DIR,  # release_dir itself
+            cover_image=cover,
+            dry_run=True,
+        )
+
+    # ``release/dataset-metadata.json`` must not exist at the top
+    # level (it's gitignored anyway, but a stray write would still
+    # show up in ``git status``).
+    assert not (_RELEASE_DIR / "dataset-metadata.json").exists()
+
+
+@pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
+def test_run_packager_resolves_cover_image_via_release_fallback(tmp_path: Path) -> None:
+    """Copilot review on PR #72 item #2.
+
+    A bare-basename ``--cover-image`` that exists under ``release/``
+    must validate (via ``resolve_cover_image_path`` fallback) and
+    materialise into the assembled metadata's ``image`` field.
+    """
+
+    out_dir = tmp_path / "kaggle"
+    bare = Path(_COMMITTED_COVER.name)
+    assert not bare.is_absolute()
+
+    outcome = packager.run_packager(
+        _RELEASE_DIR,
+        kaggle_dir=out_dir,
+        cover_image=bare,
+        dry_run=True,
+    )
+    assert outcome.errors == ()
+    # Metadata's ``image`` field carries the resolved filename.
+    parsed = json.loads((out_dir / "dataset-metadata.json").read_text(encoding="utf-8"))
+    assert parsed["image"] == _COMMITTED_COVER.name
+
+
 @pytest.mark.skipif(not _RELEASE_BUNDLES_PRESENT, reason="release bundles not present")
 def test_run_packager_does_not_write_on_validation_failure(tmp_path: Path) -> None:
     """A failed validation must NOT leave a corrupt metadata file on
diff --git a/tests/scripts/test_release_common.py b/tests/scripts/test_release_common.py
new file mode 100644
index 0000000..23bbbbf
--- /dev/null
+++ b/tests/scripts/test_release_common.py
@@ -0,0 +1,100 @@
+"""Tests for ``scripts/_release_common.py`` shared release primitives.
+
+The link-rewriter, source-tree-block guard, cover-image validator,
+and upload-dir safety check have packager-side coverage in
+``test_package_kaggle_release.py`` and ``test_package_hf_release.py``.
+This file covers shared helpers that don't have a natural packager-
+side home — currently the cover-image path resolver introduced in
+response to the Copilot review on PR #72.
+"""
+
+from __future__ import annotations
+
+import importlib.util
+import sys
+from pathlib import Path
+
+from PIL import Image
+
+_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "_release_common.py"
+_spec = importlib.util.spec_from_file_location("_release_common", _SCRIPT_PATH)
+assert _spec is not None
+assert _spec.loader is not None
+common = importlib.util.module_from_spec(_spec)
+sys.modules["_release_common"] = common
+_spec.loader.exec_module(common)
+
+
+def _make_cover(path: Path) -> None:
+    """Write a minimum-Kaggle-acceptable cover image at ``path``."""
+
+    Image.new("RGB", (1280, 640), (0, 0, 0)).save(path)
+
+
+# ---------------------------------------------------------------------------
+# resolve_cover_image_path — Copilot review on PR #72, item #2
+# ---------------------------------------------------------------------------
+
+
+def test_resolve_cover_image_path_returns_explicit_when_it_exists(tmp_path: Path) -> None:
+    """An explicit path that exists is returned unchanged."""
+
+    release = tmp_path / "release"
+    release.mkdir()
+    explicit = tmp_path / "explicit.png"
+    _make_cover(explicit)
+
+    resolved = common.resolve_cover_image_path(explicit, release)
+    assert resolved == explicit
+
+
+def test_resolve_cover_image_path_falls_back_to_release_dir(tmp_path: Path) -> None:
+    """A bare basename that doesn't exist resolves to ``release_dir/<name>``."""
+
+    release = tmp_path / "release"
+    release.mkdir()
+    cover_in_release = release / "dataset-cover-image.png"
+    _make_cover(cover_in_release)
+
+    resolved = common.resolve_cover_image_path(Path("dataset-cover-image.png"), release)
+    assert resolved == cover_in_release
+
+
+def test_resolve_cover_image_path_prefers_explicit_over_release_sibling(
+    tmp_path: Path,
+) -> None:
+    """When BOTH the explicit path and a release-dir sibling exist, the
+    explicit path wins.  Locks down the contract that two paths sharing
+    a basename do not silently shadow each other (Copilot review #2).
+    """
+
+    release = tmp_path / "release"
+    release.mkdir()
+    explicit = tmp_path / "explicit.png"
+    _make_cover(explicit)
+    decoy = release / "explicit.png"
+    _make_cover(decoy)
+
+    resolved = common.resolve_cover_image_path(explicit, release)
+    assert resolved == explicit
+    assert resolved != decoy
+
+
+def test_resolve_cover_image_path_returns_input_when_neither_exists(
+    tmp_path: Path,
+) -> None:
+    """When neither location resolves, return the input unchanged so
+    the cover-image validator can surface a clean ``not found at <path>``
+    error (rather than this helper raising)."""
+
+    release = tmp_path / "release"
+    release.mkdir()
+    missing = tmp_path / "nope.png"
+
+    resolved = common.resolve_cover_image_path(missing, release)
+    assert resolved == missing
+    # Sanity: the validator's not-found path actually fires for this
+    # input.
+    errors = common.validate_cover_image(resolved)
+    assert errors
+    assert "not found" in errors[0].message