From ab35483f157c6718b5a3970f3b7127b26b08f5fe Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sat, 30 May 2026 18:26:42 +0300 Subject: [PATCH] Improve dataset repo presentability --- .github/workflows/validate.yml | 22 ++++++ CHANGELOG.md | 11 +++ LICENSE | 21 ++++++ README.md | 50 ++++++++++++-- scripts/validate_release.py | 119 +++++++++++++++++++++++++++++++++ 5 files changed, 219 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/validate.yml create mode 100644 LICENSE create mode 100644 scripts/validate_release.py diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 0000000..315737e --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,22 @@ +name: Validate dataset release + +on: + pull_request: + push: + branches: + - main + +jobs: + release-structure: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Validate release structure + run: python scripts/validate_release.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 25e0435..0c18c7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ Format inspired by [Keep a Changelog](https://keepachangelog.com/). --- +## Unreleased + +### Added + +- Root `LICENSE` file so GitHub and first-time users can detect the + repository's reuse terms without opening a release subdirectory. +- README badges, license/reuse guidance, and a compact public CSV preview for + faster first-time orientation. +- Lightweight GitHub Actions validation for the committed alpha release + structure and public flat-file contract. + ## v0.1.0-alpha — 2026-05-05 First populated release. 
Intended for review and roadmap discussion by diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..acaffc3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 leadforge-dev + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 56dd29c..bfda4fb 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # leadforge-datasets +[![Validate dataset release](https://github.com/leadforge-dev/leadforge-datasets/actions/workflows/validate.yml/badge.svg)](https://github.com/leadforge-dev/leadforge-datasets/actions/workflows/validate.yml) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) + Public dataset bundles produced by the [leadforge](https://github.com/leadforge-dev/leadforge) synthetic CRM/funnel data generator. @@ -7,6 +10,11 @@ synthetic CRM/funnel data generator. > roadmap discussion, not production use. 
Bundle layout, manifest schema, > and column sets may change before the first stable release. +Use this repository when you want ready-made synthetic CRM and lead-scoring +datasets for teaching, demos, model benchmarking, or release review. Use +[`leadforge`](https://github.com/leadforge-dev/leadforge) itself when you want +to generate new worlds or recipes. + --- ## Latest release @@ -33,6 +41,37 @@ Pre-computed companion artifacts in [`releases/v0.1.0-alpha/`](releases/v0.1.0-a --- +## Dataset preview + +The public flat CSV is intentionally small enough to inspect directly while +still preserving a realistic relational CRM shape. A few representative +columns from `intro/lead_scoring.csv` look like this: + +| split | industry | region | employee_band | revenue_band | role_function | seniority | buyer_role | lead_source | touch_count | session_count | converted_within_90_days | +|---|---|---|---|---|---|---|---|---|---:|---:|---:| +| train | logistics | UK | 200-499 | $50M-$200M | procurement_manager | vp | end_user | inbound_marketing | 9 | 3 | true | +| train | logistics | UK | 500-999 | $10M-$50M | it_director | c_suite | technical_evaluator | inbound_marketing | 7 | 1 | true | +| train | logistics | US | 200-499 | $1M-$10M | ap_manager | director | champion | partner_referral | 13 | 5 | true | + +The release also includes a relational version of the same world: accounts, +contacts, leads, touches, sessions, sales activities, opportunities, customers, +and subscriptions are available as Parquet tables under each bundle's +`tables/` directory. + +--- + +## License and reuse + +The repository-level license is [MIT](LICENSE). The current release also keeps +the same license text inside +[`releases/v0.1.0-alpha/LICENSE`](releases/v0.1.0-alpha/LICENSE) so copied or +downloaded release directories remain self-contained. + +The datasets are synthetic. They are generated from a simulated commercial +world, not from customer records or private CRM exports. 
+ +--- + ## Quick start Inspect a bundle: @@ -164,9 +203,11 @@ If you want to understand the package output in fifteen minutes: - **`tiny_demo` conversion rate is noisy.** N=70 in the train split yields a wide CI; the 42.9% figure should be read as "consistent with the intro target of 41.5%, not a separate measurement." -- **No CI on this repo.** `validation.log` is captured by `build.sh` at - build time but not enforced on every push. Acceptable for an alpha - with one publisher; revisit for v0.2. +- **CI validates repository structure, not generator semantics.** The + lightweight GitHub Actions workflow checks that the committed release has the + expected manifest, file layout, redaction, and sample flat-file contract. + Full semantic validation still comes from `leadforge validate` and the + captured [`validation.log`](releases/v0.1.0-alpha/validation.log). --- @@ -177,5 +218,6 @@ If you want to understand the package output in fifteen minutes: - Recipe: `b2b_saas_procurement_v1` (mid-market B2B SaaS procurement vertical) - Seed: `42` - Bundle schema: `v4` -- License: [MIT](releases/v0.1.0-alpha/LICENSE) +- License: [MIT](LICENSE); release copy at + [`releases/v0.1.0-alpha/LICENSE`](releases/v0.1.0-alpha/LICENSE) - Full provenance: [`releases/v0.1.0-alpha/provenance.json`](releases/v0.1.0-alpha/provenance.json) diff --git a/scripts/validate_release.py b/scripts/validate_release.py new file mode 100644 index 0000000..06dcac2 --- /dev/null +++ b/scripts/validate_release.py @@ -0,0 +1,119 @@ +"""Validate the public leadforge dataset release structure. + +This intentionally uses only the Python standard library so the repository can +run a lightweight CI check without installing the full generator stack. 
+""" + +from __future__ import annotations + +import csv +import json +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +RELEASE = ROOT / "releases" / "v0.1.0-alpha" +PUBLIC_BUNDLES = ("intro", "intermediate", "advanced", "tiny_demo") +PUBLIC_FLAT_CSV_BUNDLES = ("intro", "intermediate", "advanced") +INSTRUCTOR_BUNDLES = ("intermediate_instructor",) +EXPECTED_TABLES = { + "accounts.parquet", + "contacts.parquet", + "customers.parquet", + "leads.parquet", + "opportunities.parquet", + "sales_activities.parquet", + "sessions.parquet", + "subscriptions.parquet", + "touches.parquet", +} +REQUIRED_FLAT_COLUMNS = { + "split", + "account_id", + "contact_id", + "lead_id", + "lead_created_at", + "touch_count", + "session_count", + "converted_within_90_days", +} +FORBIDDEN_PUBLIC_COLUMNS = {"current_stage", "is_sql"} + + +def require(condition: bool, message: str) -> None: + if not condition: + raise AssertionError(message) + + +def load_json(path: Path) -> dict: + require(path.exists(), f"Missing JSON file: {path}") + with path.open() as f: + return json.load(f) + + +def validate_bundle(bundle: Path, *, instructor: bool = False) -> None: + manifest = load_json(bundle / "manifest.json") + require(manifest.get("bundle_schema_version") == "4", f"{bundle}: expected schema v4") + require(manifest.get("snapshot_day") == 30, f"{bundle}: expected snapshot_day=30") + require((bundle / "dataset_card.md").exists(), f"{bundle}: missing dataset_card.md") + require((bundle / "feature_dictionary.csv").exists(), f"{bundle}: missing feature_dictionary.csv") + + table_dir = bundle / "tables" + require(table_dir.exists(), f"{bundle}: missing tables directory") + actual_tables = {path.name for path in table_dir.glob("*.parquet")} + require(actual_tables == EXPECTED_TABLES, f"{bundle}: unexpected table set: {sorted(actual_tables)}") + + task_dir = bundle / "tasks" / "converted_within_90_days" + require(task_dir.exists(), f"{bundle}: missing converted_within_90_days task") 
+ for split in ("train", "valid", "test"): + require((task_dir / f"{split}.parquet").exists(), f"{bundle}: missing {split}.parquet") + require((task_dir / "task_manifest.json").exists(), f"{bundle}: missing task_manifest.json") + + if instructor: + metadata_dir = bundle / "metadata" + require(metadata_dir.exists(), f"{bundle}: missing instructor metadata") + for filename in ( + "world_spec.json", + "graph.graphml", + "graph.json", + "mechanism_summary.json", + "latent_registry.json", + ): + require((metadata_dir / filename).exists(), f"{bundle}: missing metadata/{filename}") + elif bundle.name in PUBLIC_FLAT_CSV_BUNDLES: + flat_csv = bundle / "lead_scoring.csv" + require(flat_csv.exists(), f"{bundle}: missing lead_scoring.csv") + with flat_csv.open(newline="") as f: + reader = csv.DictReader(f) + columns = set(reader.fieldnames or []) + first_row = next(reader, None) + require(REQUIRED_FLAT_COLUMNS <= columns, f"{bundle}: missing required flat columns") + require(not (FORBIDDEN_PUBLIC_COLUMNS & columns), f"{bundle}: leaked redacted columns") + require(first_row is not None, f"{bundle}: empty lead_scoring.csv") + require(first_row.get("split") in {"train", "valid", "test"}, f"{bundle}: invalid split value") + else: + require(bundle.name == "tiny_demo", f"{bundle}: unexpected public bundle without flat CSV") + + +def main() -> None: + require((ROOT / "LICENSE").exists(), "Repository root LICENSE is required") + require(RELEASE.exists(), f"Missing release directory: {RELEASE}") + for filename in ( + "BASELINES.md", + "EXPOSURE_DELTA.md", + "LICENSE", + "build.sh", + "provenance.json", + "validation.log", + "baselines.py", + ): + require((RELEASE / filename).exists(), f"Missing release companion artifact: {filename}") + for bundle_name in PUBLIC_BUNDLES: + validate_bundle(RELEASE / bundle_name) + for bundle_name in INSTRUCTOR_BUNDLES: + validate_bundle(RELEASE / bundle_name, instructor=True) + print("leadforge-datasets release structure looks valid") + + +if __name__ 
== "__main__": + main()