From ab35483f157c6718b5a3970f3b7127b26b08f5fe Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Sat, 30 May 2026 18:26:42 +0300
Subject: [PATCH] Improve dataset repo presentability

---
 .github/workflows/validate.yml |  22 ++++++
 CHANGELOG.md                   |  11 +++
 LICENSE                        |  21 ++++++
 README.md                      |  50 ++++++++++++--
 scripts/validate_release.py    | 119 +++++++++++++++++++++++++++++++++
 5 files changed, 219 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/validate.yml
 create mode 100644 LICENSE
 create mode 100644 scripts/validate_release.py

diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
new file mode 100644
index 0000000..315737e
--- /dev/null
+++ b/.github/workflows/validate.yml
@@ -0,0 +1,22 @@
+name: Validate dataset release
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+jobs:
+  release-structure:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Validate release structure
+        run: python scripts/validate_release.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25e0435..0c18c7b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,17 @@ Format inspired by [Keep a Changelog](https://keepachangelog.com/).
 
 ---
 
+## Unreleased
+
+### Added
+
+- Root `LICENSE` file so GitHub and first-time users can detect the
+  repository's reuse terms without opening a release subdirectory.
+- README badges, license/reuse guidance, and a compact public CSV preview for
+  faster first-time orientation.
+- Lightweight GitHub Actions validation for the committed alpha release
+  structure and public flat-file contract.
+
 ## v0.1.0-alpha — 2026-05-05
 
 First populated release.  Intended for review and roadmap discussion by
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..acaffc3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 leadforge-dev
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index 56dd29c..bfda4fb 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # leadforge-datasets
 
+[![Validate dataset release](https://github.com/leadforge-dev/leadforge-datasets/actions/workflows/validate.yml/badge.svg)](https://github.com/leadforge-dev/leadforge-datasets/actions/workflows/validate.yml)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+
 Public dataset bundles produced by the [leadforge](https://github.com/leadforge-dev/leadforge)
 synthetic CRM/funnel data generator.
 
@@ -7,6 +10,11 @@ synthetic CRM/funnel data generator.
 > roadmap discussion, not production use.  Bundle layout, manifest schema,
 > and column sets may change before the first stable release.
 
+Use this repository when you want ready-made synthetic CRM and lead-scoring
+datasets for teaching, demos, model benchmarking, or release review.  Use
+[`leadforge`](https://github.com/leadforge-dev/leadforge) itself when you want
+to generate new worlds or recipes.
+
 ---
 
 ## Latest release
@@ -33,6 +41,37 @@ Pre-computed companion artifacts in [`releases/v0.1.0-alpha/`](releases/v0.1.0-a
 
 ---
 
+## Dataset preview
+
+The public flat CSV is intentionally small enough to inspect directly while
+still preserving a realistic relational CRM shape.  A few representative
+columns from `intro/lead_scoring.csv` look like this:
+
+| split | industry | region | employee_band | revenue_band | role_function | seniority | buyer_role | lead_source | touch_count | session_count | converted_within_90_days |
+|---|---|---|---|---|---|---|---|---|---:|---:|---:|
+| train | logistics | UK | 200-499 | $50M-$200M | procurement_manager | vp | end_user | inbound_marketing | 9 | 3 | true |
+| train | logistics | UK | 500-999 | $10M-$50M | it_director | c_suite | technical_evaluator | inbound_marketing | 7 | 1 | true |
+| train | logistics | US | 200-499 | $1M-$10M | ap_manager | director | champion | partner_referral | 13 | 5 | true |
+
+The release also includes a relational version of the same world: accounts,
+contacts, leads, touches, sessions, sales activities, opportunities, customers,
+and subscriptions are available as Parquet tables under each bundle's
+`tables/` directory.
+
+---
+
+## License and reuse
+
+The repository-level license is [MIT](LICENSE).  The current release also keeps
+the same license text inside
+[`releases/v0.1.0-alpha/LICENSE`](releases/v0.1.0-alpha/LICENSE) so copied or
+downloaded release directories remain self-contained.
+
+The datasets are synthetic.  They are generated from a simulated commercial
+world, not from customer records or private CRM exports.
+
+---
+
 ## Quick start
 
 Inspect a bundle:
@@ -164,9 +203,11 @@ If you want to understand the package output in fifteen minutes:
 - **`tiny_demo` conversion rate is noisy.**  N=70 in the train split
   yields a wide CI; the 42.9% figure should be read as "consistent with
   the intro target of 41.5%, not a separate measurement."
-- **No CI on this repo.**  `validation.log` is captured by `build.sh` at
-  build time but not enforced on every push.  Acceptable for an alpha
-  with one publisher; revisit for v0.2.
+- **CI validates repository structure, not generator semantics.**  The
+  lightweight GitHub Actions workflow checks that the committed release has the
+  expected manifest, file layout, redaction, and sample flat-file contract.
+  Full semantic validation still comes from `leadforge validate` and the
+  captured [`validation.log`](releases/v0.1.0-alpha/validation.log).
 
 ---
 
@@ -177,5 +218,6 @@ If you want to understand the package output in fifteen minutes:
 - Recipe: `b2b_saas_procurement_v1` (mid-market B2B SaaS procurement vertical)
 - Seed: `42`
 - Bundle schema: `v4`
-- License: [MIT](releases/v0.1.0-alpha/LICENSE)
+- License: [MIT](LICENSE); release copy at
+  [`releases/v0.1.0-alpha/LICENSE`](releases/v0.1.0-alpha/LICENSE)
 - Full provenance: [`releases/v0.1.0-alpha/provenance.json`](releases/v0.1.0-alpha/provenance.json)
diff --git a/scripts/validate_release.py b/scripts/validate_release.py
new file mode 100644
index 0000000..06dcac2
--- /dev/null
+++ b/scripts/validate_release.py
@@ -0,0 +1,119 @@
+"""Validate the public leadforge dataset release structure.
+
+This intentionally uses only the Python standard library so the repository can
+run a lightweight CI check without installing the full generator stack.
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]  # repository root: this script lives in scripts/
+RELEASE = ROOT / "releases" / "v0.1.0-alpha"  # the single release directory this script validates
+PUBLIC_BUNDLES = ("intro", "intermediate", "advanced", "tiny_demo")  # validated with instructor=False
+PUBLIC_FLAT_CSV_BUNDLES = ("intro", "intermediate", "advanced")  # public bundles that must ship lead_scoring.csv
+INSTRUCTOR_BUNDLES = ("intermediate_instructor",)  # validated with instructor=True (metadata/ files required)
+EXPECTED_TABLES = {  # exact set of *.parquet file names required in every bundle's tables/ directory
+    "accounts.parquet",
+    "contacts.parquet",
+    "customers.parquet",
+    "leads.parquet",
+    "opportunities.parquet",
+    "sales_activities.parquet",
+    "sessions.parquet",
+    "subscriptions.parquet",
+    "touches.parquet",
+}
+REQUIRED_FLAT_COLUMNS = {  # header names that must all be present in a public lead_scoring.csv
+    "split",
+    "account_id",
+    "contact_id",
+    "lead_id",
+    "lead_created_at",
+    "touch_count",
+    "session_count",
+    "converted_within_90_days",
+}
+FORBIDDEN_PUBLIC_COLUMNS = {"current_stage", "is_sql"}  # must never appear in a public lead_scoring.csv header
+
+
+def require(condition: bool, message: str) -> None:  # explicit raise, so unlike `assert` it still runs under `python -O`
+    if not condition:
+        raise AssertionError(message)
+
+
+def load_json(path: Path) -> dict:
+    """Return the parsed JSON at *path*, decoded as UTF-8 (not the locale default); a missing file fails via ``require``."""
+    require(path.exists(), f"Missing JSON file: {path}")
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def validate_bundle(bundle: Path, *, instructor: bool = False) -> None:  # structural checks for one bundle dir; first failure raises AssertionError
+    manifest = load_json(bundle / "manifest.json")
+    require(manifest.get("bundle_schema_version") == "4", f"{bundle}: expected schema v4")
+    require(manifest.get("snapshot_day") == 30, f"{bundle}: expected snapshot_day=30")
+    require((bundle / "dataset_card.md").exists(), f"{bundle}: missing dataset_card.md")
+    require((bundle / "feature_dictionary.csv").exists(), f"{bundle}: missing feature_dictionary.csv")
+
+    table_dir = bundle / "tables"
+    require(table_dir.exists(), f"{bundle}: missing tables directory")
+    actual_tables = {path.name for path in table_dir.glob("*.parquet")}
+    require(actual_tables == EXPECTED_TABLES, f"{bundle}: unexpected table set: {sorted(actual_tables)}")
+
+    task_dir = bundle / "tasks" / "converted_within_90_days"
+    require(task_dir.exists(), f"{bundle}: missing converted_within_90_days task")
+    for split in ("train", "valid", "test"):
+        require((task_dir / f"{split}.parquet").exists(), f"{bundle}: missing {split}.parquet")
+    require((task_dir / "task_manifest.json").exists(), f"{bundle}: missing task_manifest.json")
+
+    if instructor:  # instructor bundles are checked for metadata/ files instead of a flat CSV
+        metadata_dir = bundle / "metadata"
+        require(metadata_dir.exists(), f"{bundle}: missing instructor metadata")
+        for filename in (
+            "world_spec.json",
+            "graph.graphml",
+            "graph.json",
+            "mechanism_summary.json",
+            "latent_registry.json",
+        ):
+            require((metadata_dir / filename).exists(), f"{bundle}: missing metadata/{filename}")
+    elif bundle.name in PUBLIC_FLAT_CSV_BUNDLES:
+        flat_csv = bundle / "lead_scoring.csv"
+        require(flat_csv.exists(), f"{bundle}: missing lead_scoring.csv")
+        with flat_csv.open(newline="", encoding="utf-8") as f:  # explicit UTF-8: header parsing must not depend on the locale
+            reader = csv.DictReader(f)
+            columns = set(reader.fieldnames or [])
+            first_row = next(reader, None)  # only the first data row is sampled
+        require(REQUIRED_FLAT_COLUMNS <= columns, f"{bundle}: missing required flat columns")
+        require(not (FORBIDDEN_PUBLIC_COLUMNS & columns), f"{bundle}: leaked redacted columns")
+        require(first_row is not None, f"{bundle}: empty lead_scoring.csv")
+        require(first_row.get("split") in {"train", "valid", "test"}, f"{bundle}: invalid split value")
+    else:  # the only public bundle allowed to ship without a flat CSV
+        require(bundle.name == "tiny_demo", f"{bundle}: unexpected public bundle without flat CSV")
+
+
+def main() -> None:  # validate the whole release tree; the first failed check raises AssertionError
+    require((ROOT / "LICENSE").exists(), "Repository root LICENSE is required")
+    require(RELEASE.exists(), f"Missing release directory: {RELEASE}")
+    for filename in (  # companion artifacts expected directly inside the release directory
+        "BASELINES.md",
+        "EXPOSURE_DELTA.md",
+        "LICENSE",
+        "build.sh",
+        "provenance.json",
+        "validation.log",
+        "baselines.py",
+    ):
+        require((RELEASE / filename).exists(), f"Missing release companion artifact: {filename}")
+    for bundle_name in PUBLIC_BUNDLES:
+        validate_bundle(RELEASE / bundle_name)
+    for bundle_name in INSTRUCTOR_BUNDLES:
+        validate_bundle(RELEASE / bundle_name, instructor=True)
+    print("leadforge-datasets release structure looks valid")  # reached only when every check above passed
+
+
+if __name__ == "__main__":
+    main()