Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 25 additions & 16 deletions .agent-plan.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,47 @@

## Current System State

**v0.4.0 in progress — Milestones 7–9 complete (PR open).** Full simulation engine + render/bundle
layer + exposure filtering implemented. 545 tests passing.
**v0.4.0 complete — Milestones 7–10 done.** Full simulation engine + render/bundle layer + exposure filtering + CLI commands implemented. 562 tests passing.

---

## Next Up — Milestone 10: CLI `generate` command + `inspect` / `validate` stubs (v0.4.0)
## Next Up — Milestone 11: Validation harness (v0.5.0)

Goal: Wire `leadforge generate` CLI command end-to-end; implement `inspect` and `validate` output.
Goal: Implement comprehensive bundle validation — invariant checks, realism heuristics, difficulty drift detection.

- [ ] `cli/commands/generate.py` — parse flags, call `Generator.from_recipe().generate()`, call `.save()`
- [ ] `cli/commands/inspect.py` — print manifest summary for a written bundle
- [ ] `cli/commands/validate.py` — basic schema / FK / leakage checks on a written bundle
- [ ] Tests for each command
- [ ] `validation/invariants.py` — DAG acyclicity, FK integrity, determinism, exposure monotonicity
- [ ] `validation/artifact_checks.py` — file presence, hash verification, schema conformance
- [ ] `validation/realism.py` — distributional sanity checks (conversion rates, feature ranges)
- [ ] `validation/difficulty.py` — difficulty profile adherence checks
- [ ] `validation/drift.py` — cross-seed stability / drift detection
- [ ] Wire into `cli/commands/validate.py` with richer output
- [ ] Tests for each validation module

---

## Context Pointers

- Milestone 8 scope: `docs/leadforge_implementation_plan.md` §10 "Milestone 8"
- Render layer: `leadforge/render/` (snapshots, relational, tasks, manifests)
- Bundle writer: `leadforge/api/bundle.py`
- Milestone 11 scope: `docs/leadforge_implementation_plan.md` §10 "Milestone 11"
- Current validate CLI: `leadforge/cli/commands/validate.py` (basic checks implemented in M10)
- FK constraints: `leadforge/schema/relationships.py`
- Feature spec: `leadforge/schema/features.py`

---

## Completed Phases

### Milestone 9 — Exposure Filtering ✓ (v0.4.0 in PR)
- `exposure/filters.py`: `BundleFilter` frozen dataclass; `FILTERS` dict keyed by `ExposureMode`; `get_filter()`
- `exposure/redaction.py`: `write_metadata_dir()` — writes `metadata/` with `graph.json`, `graph.graphml`, `world_spec.json`, `latent_registry.json`, `mechanism_summary.json`
- `exposure/modes.py`: `apply_exposure(bundle, root, mode)` — dispatch; skips `metadata/` for `student_public`
### Milestone 10 — CLI Commands ✓ (v0.4.0)
- `cli/commands/generate.py`: fully wired — parses all flags, calls `Generator.from_recipe().generate()`, writes bundle via `.save()`
- `cli/commands/inspect.py`: reads `manifest.json` and prints summary (recipe, seed, mode, tables with row counts, task splits, metadata presence)
- `cli/commands/validate.py`: checks manifest presence, required files, table row counts, SHA-256 hashes, task split integrity, FK constraints, leakage (unexpected columns)
- 22 CLI tests (smoke, generate integration, inspect output, validate pass/fail/corrupt/missing); total 562 passing

### Milestone 9 — Exposure Filtering ✓ (v0.4.0)
- `exposure/filters.py`: `BundleFilter` frozen dataclass; `FILTERS` dict keyed by `ExposureMode`; `get_filter()` accepts `str | ExposureMode`
- `exposure/metadata.py`: `write_metadata_dir()` — writes `metadata/` with `graph.json`, `graph.graphml`, `world_spec.json`, `latent_registry.json`, `mechanism_summary.json`
- `exposure/modes.py`: `apply_exposure(bundle, root, mode)` — dispatch; removes stale `metadata/` for `student_public`
- Wired into `api/bundle.py` between dataset card and manifest steps
- 24 new tests; total 545 passing
- 22 exposure tests; total 547 passing

### Milestone 8 — Render / Bundle Layer ✓ (v0.4.0)
- `render/relational.py`: `to_dataframes()` — 9-table dict of typed DataFrames from SimulationResult + PopulationResult
Expand Down
61 changes: 56 additions & 5 deletions leadforge/cli/commands/generate.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
"""leadforge generate command."""

from __future__ import annotations

from pathlib import Path

import typer

from leadforge.core.exceptions import LeadforgeError


def generate(
recipe: str = typer.Option(..., "--recipe", "-r", help="Recipe ID to use."),
Expand All @@ -28,8 +34,53 @@ def generate(
),
) -> None:
"""Generate a synthetic CRM dataset bundle from a recipe."""
typer.echo(
"The 'generate' command is not yet implemented. Coming in v0.2.0.",
err=True,
)
raise typer.Exit(1)
from leadforge.api.generator import Generator
from leadforge.core.serialization import load_yaml

override_dict: dict | None = None
if override is not None:
override_path = Path(override)
if not override_path.exists():
typer.echo(f"Error: override file not found: {override_path}", err=True)
raise typer.Exit(1)
try:
loaded = load_yaml(override_path)
except LeadforgeError as exc:
typer.echo(f"Error: {exc}", err=True)
raise typer.Exit(1) from None
if loaded is not None and not isinstance(loaded, dict):
typer.echo(
"Error: override file must contain a YAML mapping at the top level.",
err=True,
)
raise typer.Exit(1)
override_dict = loaded

try:
gen = Generator.from_recipe(
recipe,
seed=seed,
exposure_mode=mode,
difficulty=difficulty,
n_accounts=n_accounts,
n_contacts=n_contacts,
n_leads=n_leads,
horizon_days=horizon_days,
override=override_dict,
)
except (LeadforgeError, ValueError) as exc:
typer.echo(f"Error: {exc}", err=True)
raise typer.Exit(1) from None

typer.echo(f"Generating bundle with recipe '{recipe}', seed={seed}, mode={mode} ...")

try:
bundle = gen.generate()
except (LeadforgeError, RuntimeError) as exc:
typer.echo(f"Error during generation: {exc}", err=True)
raise typer.Exit(1) from None

typer.echo(f"Writing bundle to {out} ...")
bundle.save(out)

typer.echo(f"Done. Bundle written to {out}")
77 changes: 72 additions & 5 deletions leadforge/cli/commands/inspect.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,81 @@
"""leadforge inspect command."""

from __future__ import annotations

from pathlib import Path
from typing import Any

import typer

from leadforge.core.exceptions import LeadforgeError
from leadforge.core.serialization import load_json


def inspect(
    bundle_path: str = typer.Argument(..., help="Path to a generated bundle directory."),
) -> None:
    """Inspect a generated dataset bundle and print a summary.

    Reads ``manifest.json`` from *bundle_path* and echoes the header fields
    (recipe, seed, mode, ...), the per-table row counts, the per-task split
    sizes, and whether a ``metadata/`` directory is present.

    Exits with status 1 (message on stderr) when the path is missing, is not
    a directory, has no ``manifest.json``, the manifest cannot be loaded, or
    the manifest is not a JSON object.
    """
    root = Path(bundle_path)

    if not root.exists():
        typer.echo(f"Error: path does not exist: {root}", err=True)
        raise typer.Exit(1)
    if not root.is_dir():
        typer.echo(f"Error: not a directory (expected a bundle dir): {root}", err=True)
        raise typer.Exit(1)

    manifest_path = root / "manifest.json"
    if not manifest_path.exists():
        typer.echo(f"Error: no manifest.json found in {root}", err=True)
        raise typer.Exit(1)

    try:
        manifest = load_json(manifest_path)
    except LeadforgeError as exc:
        typer.echo(f"Error: {exc}", err=True)
        raise typer.Exit(1) from None

    if not isinstance(manifest, dict):
        typer.echo("Error: manifest.json is not a JSON object", err=True)
        raise typer.Exit(1)

    typer.echo(f"Bundle: {root}")
    typer.echo(f" Recipe: {manifest.get('recipe_id', '?')}")
    typer.echo(f" Seed: {manifest.get('seed', '?')}")
    typer.echo(f" Mode: {manifest.get('exposure_mode', '?')}")
    typer.echo(f" Difficulty: {manifest.get('difficulty', '?')}")
    typer.echo(f" Horizon days: {manifest.get('horizon_days', '?')}")
    typer.echo(f" Generated at: {manifest.get('generation_timestamp', '?')}")
    typer.echo(f" Package: leadforge {manifest.get('package_version', '?')}")
    typer.echo(f" Schema ver: {manifest.get('bundle_schema_version', '?')}")
    typer.echo(f" Motif family: {manifest.get('motif_family', '?')}")

    typer.echo("")
    typer.echo("Tables:")
    tables = manifest.get("tables", {})
    if isinstance(tables, dict):
        for name, info in tables.items():
            # Coerce to str before applying the width spec: a partially
            # corrupt manifest may hold null / a list / a dict here, and
            # those types raise TypeError on a ">8" format spec.
            row_count = str(_safe_get(info, "row_count", "?"))
            typer.echo(f" {name:25s} {row_count:>8} rows")

    tasks = manifest.get("tasks", {})
    if isinstance(tasks, dict) and tasks:
        typer.echo("")
        typer.echo("Tasks:")
        for task_id, info in tasks.items():
            train = _safe_get(info, "train_rows", "?")
            valid = _safe_get(info, "valid_rows", "?")
            test = _safe_get(info, "test_rows", "?")
            typer.echo(f" {task_id}")
            typer.echo(f" train={train} valid={valid} test={test}")

    has_metadata = (root / "metadata").is_dir()
    typer.echo("")
    typer.echo(f"Metadata dir: {'present' if has_metadata else 'absent'}")


def _safe_get(obj: Any, key: str, default: str = "?") -> Any:
    """Look up *key* in *obj* when it is a dict; otherwise yield *default*.

    A dict that lacks *key* also yields *default*.
    """
    return obj.get(key, default) if isinstance(obj, dict) else default
38 changes: 33 additions & 5 deletions leadforge/cli/commands/validate.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,42 @@
"""leadforge validate command."""

from __future__ import annotations

from pathlib import Path

import typer

from leadforge.core.exceptions import LeadforgeError


def validate(
    bundle_path: str = typer.Argument(..., help="Path to a generated bundle directory."),
) -> None:
    """Validate a generated bundle directory and report the outcome.

    Prints ``OK: ...`` on stdout when every check passes. Otherwise prints
    ``FAIL: ...`` on stderr (one line per reported error) and exits with
    status 1.
    """
    from leadforge.validation.bundle_checks import validate_bundle

    root = Path(bundle_path)

    # Cheap structural preconditions, checked in order; the first one that
    # fails is reported.
    if not root.exists():
        precondition_failure = f"FAIL: path does not exist: {root}"
    elif not root.is_dir():
        precondition_failure = f"FAIL: not a directory: {root}"
    elif not (root / "manifest.json").exists():
        precondition_failure = f"FAIL: no manifest.json in {root}"
    else:
        precondition_failure = None

    if precondition_failure is not None:
        typer.echo(precondition_failure, err=True)
        raise typer.Exit(1)

    try:
        problems = validate_bundle(root)
    except LeadforgeError as exc:
        typer.echo(f"FAIL: {exc}", err=True)
        raise typer.Exit(1) from None

    if not problems:
        typer.echo(f"OK: bundle at {root} passed all checks.")
        return

    typer.echo(f"FAIL: {len(problems)} validation error(s):", err=True)
    for problem in problems:
        typer.echo(f" - {problem}", err=True)
    raise typer.Exit(1)
15 changes: 14 additions & 1 deletion leadforge/core/hashing.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""Deterministic config hashing for manifest identity.
"""Deterministic config hashing and file digest helpers.

A config hash uniquely identifies a (recipe, config, seed, version) tuple and
is embedded in every generated manifest so that bundles can be traced back to
the exact parameters that produced them.

:func:`file_sha256` provides a reusable SHA-256 file digest used by the
manifest builder and the bundle validator.
"""

import hashlib
import json
from dataclasses import asdict
from pathlib import Path
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
Expand All @@ -26,6 +30,15 @@ def _canonical(obj: Any) -> Any:
return obj


def file_sha256(path: Path) -> str:
    """Compute the SHA-256 of the file at *path* and return it as a hex string.

    The file is read in 64 KiB chunks, so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block_bytes := stream.read(65536):
            digest.update(block_bytes)
    return digest.hexdigest()


def hash_config(config: "GenerationConfig") -> str:
"""Return a stable hex-encoded SHA-256 digest of *config*.

Expand Down
16 changes: 4 additions & 12 deletions leadforge/render/manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

from __future__ import annotations

import hashlib
import json
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING, Any

from leadforge.core.hashing import file_sha256

if TYPE_CHECKING:
from leadforge.core.models import GenerationConfig
from leadforge.structure.graph import WorldGraph
Expand Down Expand Up @@ -55,7 +56,7 @@ def build_manifest(
for table_name, row_count in table_row_counts.items():
rel_path = f"tables/{table_name}.parquet"
abs_path = bundle_root / rel_path
sha = _sha256(abs_path)
sha = file_sha256(abs_path)
tables[table_name] = {"row_count": row_count, "file": rel_path, "sha256": sha}

# Build task entries.
Expand All @@ -65,7 +66,7 @@ def build_manifest(
for split_name, row_count in split_counts.items():
rel_path = f"tasks/{task_id}/{split_name}.parquet"
abs_path = bundle_root / rel_path
sha = _sha256(abs_path)
sha = file_sha256(abs_path)
entry[f"{split_name}_rows"] = row_count
entry[f"{split_name}_sha256"] = sha
tasks[task_id] = entry
Expand Down Expand Up @@ -93,12 +94,3 @@ def write_manifest(manifest: dict[str, Any], bundle_root: Path) -> Path:
path = bundle_root / "manifest.json"
path.write_text(json.dumps(manifest, indent=2))
return path


def _sha256(path: Path) -> str:
    """Hash the file at *path* with SHA-256 and return the hex digest."""
    hasher = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            piece = handle.read(65536)
            if piece == b"":
                break
            hasher.update(piece)
    return hasher.hexdigest()
Loading
Loading