From 4e821c738d0906ff17ca96129fef2ecc5a48b83b Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 08:19:02 +0300 Subject: [PATCH 1/3] refactor(schema): split lead-scoring schema into schemes/lead_scoring/ [LTV-Pg.2] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the M2 physical reorg (LTV-Pg). Pulls the lead-scoring-specific schema definitions out of the shared `schema/` package, leaving only genuine cross-scheme primitives behind. New files (schemes/lead_scoring/): - entities.py — ContactRow … SubscriptionRow + ALL_ROW_TYPES / TABLE_NAMES - relationships.py — ALL_CONSTRAINTS - features.py — LEAD_SNAPSHOT_FEATURES + redacted_columns_for - tasks.py — CONVERTED_WITHIN_90_DAYS + task_manifest_for_config Shared schema/ after the split (only primitives remain): - entities.py — EntityRowProtocol, make_empty_dataframe, AccountRow - features.py — FeatureSpec - relationships.py — FKConstraint, FKViolationError, validate_fk - tasks.py — SplitSpec, TaskManifest All callers updated (36 files): multi-line from-import blocks rewritten via perl; 4 mixed imports (tests/exposure/test_redaction.py, tests/schema/ test_relationships.py, tests/render/test_render.py, tests/narrative/ test_dataset_card.py) fixed manually where FeatureSpec/validate_fk/TaskManifest stay shared but co-imported with moved symbols. tests/schemes/test_module_layout.py: 3 new tests for Pg.2 — primitives-stay, scheme-specifics-in-scheme, removed-from-shared-schema. CHANGELOG, CLAUDE.md (canonical layout), roadmap (Pg.2 ✓), agent-plan updated. Verified byte-identical to pre-reorg main (14/14 files); full suite 1537 passed / 51 skipped; ruff + mypy clean (96 source files). 
Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 5 +- CLAUDE.md | 4 +- docs/ltv/roadmap.md | 2 +- leadforge/narrative/dataset_card.py | 2 +- leadforge/schema/dictionaries.py | 2 +- leadforge/schema/entities.py | 354 +----------------- leadforge/schema/features.py | 274 +------------- leadforge/schema/relationships.py | 35 +- leadforge/schema/tasks.py | 69 +--- leadforge/schemes/lead_scoring/__init__.py | 7 +- leadforge/schemes/lead_scoring/entities.py | 345 +++++++++++++++++ leadforge/schemes/lead_scoring/features.py | 271 ++++++++++++++ .../schemes/lead_scoring/relationships.py | 23 ++ .../schemes/lead_scoring/render/relational.py | 2 +- .../schemes/lead_scoring/render/snapshots.py | 4 +- .../schemes/lead_scoring/render/tasks.py | 2 +- .../schemes/lead_scoring/simulation/engine.py | 10 +- .../lead_scoring/simulation/population.py | 2 +- .../schemes/lead_scoring/simulation/state.py | 2 +- leadforge/schemes/lead_scoring/tasks.py | 63 ++++ leadforge/schemes/lifecycle/entities.py | 4 +- leadforge/validation/bundle_checks.py | 4 +- leadforge/validation/invariants.py | 2 +- leadforge/validation/realism.py | 4 +- leadforge/validation/release_quality.py | 2 +- tests/exposure/test_redaction.py | 5 +- tests/narrative/test_dataset_card.py | 3 +- tests/render/test_render.py | 7 +- tests/schema/test_entities.py | 4 +- tests/schema/test_features.py | 9 +- tests/schema/test_relationships.py | 2 +- tests/schema/test_tasks.py | 2 +- tests/schemes/lifecycle/test_entities.py | 4 +- tests/schemes/test_module_layout.py | 39 ++ tests/scripts/test_build_v7_snapshot.py | 10 +- tests/simulation/test_engine.py | 2 +- tests/test_primary_task_threading.py | 2 +- tests/validation/test_realism.py | 2 +- 38 files changed, 839 insertions(+), 746 deletions(-) create mode 100644 leadforge/schemes/lead_scoring/entities.py create mode 100644 leadforge/schemes/lead_scoring/features.py create mode 100644 leadforge/schemes/lead_scoring/relationships.py create mode 100644 
leadforge/schemes/lead_scoring/tasks.py diff --git a/.agent-plan.md b/.agent-plan.md index 4d20020..1850a8d 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -48,8 +48,9 @@ snapshots/relational_snapshot_safe/tasks moved + relational.py split so the shared write_relational_tables stays in the envelope) merged (#110). `LTV-Pg` (schema reorg) split: Pg.1 (scaffold `schemes/lifecycle/` — lifecycle rows + registries + LIFECYCLE_CONSTRAINTS moved there, stub LifecycleScheme registered) -opened as **#111**; Pg.2 (split the lead-scoring schema out of shared `schema/`) -pending. All M2 moves byte-identical. Sibling `leadforge-datasets-private` +merged (#111); Pg.2 (split lead-scoring schema: entity rows/ALL_ROW_TYPES/ +ALL_CONSTRAINTS/LEAD_SNAPSHOT_FEATURES/CONVERTED_WITHIN_90_DAYS moved to +`schemes/lead_scoring/`; shared primitives stay in `schema/`) opened as **#NNN**. All M2 moves byte-identical. Sibling `leadforge-datasets-private` consumes bundle files, not internals — no lockstep update needed (heads-up issue #8). Next: `LTV-Pg.2`, then `LTV-Pc` (pLTV feature/task specs, authored in `schemes/lifecycle/`), then `LTV-M3` (lifecycle population). diff --git a/CLAUDE.md b/CLAUDE.md index 83f0ac1..d9b6464 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,7 +156,9 @@ leadforge/ cli/ main.py, commands/{generate,list_recipes,inspect,validate}.py core/ rng.py, ids.py, time.py, enums.py, models.py, exceptions.py, ... 
narrative/ spec.py, company.py, product.py, personas.py, market.py, funnel.py, dataset_card.py - schema/ entities.py, relationships.py, events.py, features.py, tasks.py, dictionaries.py + schema/ entities.py (EntityRowProtocol, make_empty_dataframe, AccountRow — shared primitives), + features.py (FeatureSpec), relationships.py (FKConstraint, validate_fk), + tasks.py (SplitSpec, TaskManifest), dictionaries.py, tables.py schemes/ base.py (GenerationScheme protocol + SCHEME_REGISTRY); lead_scoring/ — the lead-scoring scheme: __init__.py (build_world/ write_bundle) + simulation/, mechanisms/, structure/, render/ diff --git a/docs/ltv/roadmap.md b/docs/ltv/roadmap.md index 1476ba4..77b89e6 100644 --- a/docs/ltv/roadmap.md +++ b/docs/ltv/roadmap.md @@ -142,7 +142,7 @@ Total: ~19 PRs across 9 milestones. Shared primitives (`EntityRowProtocol`, `_empty_df`, `AccountRow`, `FKConstraint`) stay in `schema/` and are imported. Byte-identical; full suite green. (**PR #111**) - - [ ] **`LTV-Pg.2`** — split the **lead-scoring** schema: move the + - [x] **`LTV-Pg.2`** — split the **lead-scoring** schema (**PR #NNN**): move the lead-scoring entity rows / `ALL_ROW_TYPES` / `ALL_CONSTRAINTS` / `LEAD_SNAPSHOT_FEATURES` / task specs into `schemes/lead_scoring/`, leaving only genuinely shared primitives in `schema/`. 
(The lifecycle `LTV-Pc` diff --git a/leadforge/narrative/dataset_card.py b/leadforge/narrative/dataset_card.py index 7363119..a2d9dae 100644 --- a/leadforge/narrative/dataset_card.py +++ b/leadforge/narrative/dataset_card.py @@ -12,7 +12,7 @@ from collections import Counter from typing import TYPE_CHECKING -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec if TYPE_CHECKING: from leadforge.core.models import WorldSpec diff --git a/leadforge/schema/dictionaries.py b/leadforge/schema/dictionaries.py index 9f1b32b..fc46b30 100644 --- a/leadforge/schema/dictionaries.py +++ b/leadforge/schema/dictionaries.py @@ -11,7 +11,7 @@ import pandas as pd -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec _COLUMNS = ("name", "dtype", "description", "category", "is_target", "leakage_risk") diff --git a/leadforge/schema/entities.py b/leadforge/schema/entities.py index 21ebe42..c599311 100644 --- a/leadforge/schema/entities.py +++ b/leadforge/schema/entities.py @@ -1,18 +1,20 @@ -"""Typed row contracts for all v1 relational tables. +"""Shared entity-row primitives. -Each class represents one row in a Parquet table. Fields map directly to -the column specifications in §16 of the architecture spec. Optional columns -(nullable in the output) use ``... | None`` typing. +This module exposes only the **scheme-agnostic** building blocks: -All row classes expose: +- :class:`EntityRowProtocol` — the structural protocol all entity row + dataclasses satisfy. +- :func:`make_empty_dataframe` — construct a zero-row DataFrame with the right + column types from a ``DTYPE_MAP``. +- :class:`AccountRow` — the ``accounts`` entity, shared between the + lead-scoring and lifecycle schemes (accounts are the same real-world entity + in both). 
-- ``TABLE_NAME`` — the canonical Parquet table name (no extension). -- ``DTYPE_MAP`` — ``{column: pandas-dtype-string}`` used to build empty - DataFrames with the right schema. -- ``to_dict()`` — returns a plain ``dict`` suitable for ``pd.DataFrame([...])`` - or JSON serialization. -- ``empty_dataframe()`` — class method returning a zero-row ``pd.DataFrame`` - with the correct columns and nullable dtypes. +Lead-scoring entity rows (``ContactRow``, ``LeadRow``, …) and the lead-scoring +catalog (``ALL_ROW_TYPES``, ``TABLE_NAMES``) live in +:mod:`leadforge.schemes.lead_scoring.entities`. + +Lifecycle entity rows live in :mod:`leadforge.schemes.lifecycle.entities`. """ from __future__ import annotations @@ -44,7 +46,7 @@ def make_empty_dataframe(dtype_map: dict[str, str]) -> pd.DataFrame: # --------------------------------------------------------------------------- -# accounts +# accounts (shared entity — present in both lead-scoring and lifecycle bundles) # --------------------------------------------------------------------------- @@ -79,329 +81,3 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# contacts -# --------------------------------------------------------------------------- - - -@dataclass -class ContactRow: - """One row in the ``contacts`` table.""" - - TABLE_NAME: ClassVar[str] = "contacts" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "contact_id": "string", - "account_id": "string", - "job_title": "string", - "role_function": "string", - "seniority": "string", - "buyer_role": "string", - "email_domain_type": "string", - "created_at": "string", - } - - contact_id: str - account_id: str - job_title: str - role_function: str - seniority: str - buyer_role: str - email_domain_type: str - created_at: str - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f 
in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# leads -# --------------------------------------------------------------------------- - - -@dataclass -class LeadRow: - """One row in the ``leads`` table. - - .. note:: The ``converted_within_90_days`` field name is retained for - schema stability, but its value is derived using - ``GenerationConfig.label_window_days`` (which defaults to 90). A - lead is marked ``True`` only if its conversion event occurred before - ``label_window_days`` from lead creation — **not** necessarily within - the full ``horizon_days`` simulation window. - - Consequently, ``conversion_timestamp`` may be set (non-``None``) - while ``converted_within_90_days`` is ``False``, indicating the lead - converted after the label observation window closed. - """ - - TABLE_NAME: ClassVar[str] = "leads" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "lead_id": "string", - "contact_id": "string", - "account_id": "string", - "lead_created_at": "string", - "lead_source": "string", - "first_touch_channel": "string", - "current_stage": "string", - "owner_rep_id": "string", - "is_sql": "boolean", - "converted_within_90_days": "boolean", - "conversion_timestamp": "string", - } - - # ``is_mql`` was removed in bundle schema v3 (issue #57). Every lead - # is initialised at MQL stage in ``simulation/population.py``, so the - # field was constant ``True`` and zero-variance across all bundles. 
- - lead_id: str - contact_id: str - account_id: str - lead_created_at: str - lead_source: str - first_touch_channel: str - current_stage: str - owner_rep_id: str - is_sql: bool - converted_within_90_days: bool - conversion_timestamp: str | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# touches -# --------------------------------------------------------------------------- - - -@dataclass -class TouchRow: - """One row in the ``touches`` table.""" - - TABLE_NAME: ClassVar[str] = "touches" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "touch_id": "string", - "lead_id": "string", - "touch_timestamp": "string", - "touch_type": "string", - "touch_channel": "string", - "touch_direction": "string", - "campaign_id": "string", - } - - touch_id: str - lead_id: str - touch_timestamp: str - touch_type: str - touch_channel: str - touch_direction: str - campaign_id: str | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# sessions -# --------------------------------------------------------------------------- - - -@dataclass -class SessionRow: - """One row in the ``sessions`` table.""" - - TABLE_NAME: ClassVar[str] = "sessions" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "session_id": "string", - "lead_id": "string", - "session_timestamp": "string", - "session_type": "string", - "page_views": "Int64", - "pricing_page_views": "Int64", - "demo_page_views": "Int64", - "session_duration_seconds": "Int64", - } - - session_id: str - lead_id: str - session_timestamp: str - 
session_type: str - page_views: int - pricing_page_views: int - demo_page_views: int - session_duration_seconds: int - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# sales_activities -# --------------------------------------------------------------------------- - - -@dataclass -class SalesActivityRow: - """One row in the ``sales_activities`` table.""" - - TABLE_NAME: ClassVar[str] = "sales_activities" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "activity_id": "string", - "lead_id": "string", - "rep_id": "string", - "activity_timestamp": "string", - "activity_type": "string", - "activity_outcome": "string", - } - - activity_id: str - lead_id: str - rep_id: str - activity_timestamp: str - activity_type: str - activity_outcome: str - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# opportunities -# --------------------------------------------------------------------------- - - -@dataclass -class OpportunityRow: - """One row in the ``opportunities`` table.""" - - TABLE_NAME: ClassVar[str] = "opportunities" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "opportunity_id": "string", - "lead_id": "string", - "created_at": "string", - "stage": "string", - "estimated_acv": "Int64", - "close_outcome": "string", - "closed_at": "string", - } - - opportunity_id: str - lead_id: str - created_at: str - stage: str - estimated_acv: int - close_outcome: str | None = None - closed_at: str | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} 
- - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# customers -# --------------------------------------------------------------------------- - - -@dataclass -class CustomerRow: - """One row in the ``customers`` table.""" - - TABLE_NAME: ClassVar[str] = "customers" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "customer_id": "string", - "opportunity_id": "string", - "account_id": "string", - "customer_start_at": "string", - } - - customer_id: str - opportunity_id: str - account_id: str - customer_start_at: str - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# subscriptions -# --------------------------------------------------------------------------- - - -@dataclass -class SubscriptionRow: - """One row in the ``subscriptions`` table.""" - - TABLE_NAME: ClassVar[str] = "subscriptions" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "subscription_id": "string", - "customer_id": "string", - "plan_name": "string", - "subscription_start_at": "string", - "subscription_status": "string", - } - - subscription_id: str - customer_id: str - plan_name: str - subscription_start_at: str - subscription_status: str - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return make_empty_dataframe(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# Registry (lead-scoring catalog) -# --------------------------------------------------------------------------- -# Lifecycle (b2b_saas_ltv_v1) entity rows + their LIFECYCLE_ROW_TYPES registry -# now live in 
``leadforge.schemes.lifecycle.entities`` (moved in LTV-Pg). -# ``AccountRow`` above is shared and reused by the lifecycle scheme. - -ALL_ROW_TYPES: tuple[type[EntityRowProtocol], ...] = ( - AccountRow, - ContactRow, - LeadRow, - TouchRow, - SessionRow, - SalesActivityRow, - OpportunityRow, - CustomerRow, - SubscriptionRow, -) - -TABLE_NAMES: tuple[str, ...] = tuple(cls.TABLE_NAME for cls in ALL_ROW_TYPES) diff --git a/leadforge/schema/features.py b/leadforge/schema/features.py index 1f29969..6143341 100644 --- a/leadforge/schema/features.py +++ b/leadforge/schema/features.py @@ -1,9 +1,9 @@ -"""Feature specification for the lead snapshot task table. +"""Shared feature-spec primitive. -:data:`LEAD_SNAPSHOT_FEATURES` is the canonical ordered list of features -present in the primary task export (``tasks/converted_within_90_days/``). -Every feature here is anchored at or before the snapshot date — no -post-anchor data is included (leakage rule, §4 of the architecture spec). +:class:`FeatureSpec` is a scheme-agnostic dataclass used by every scheme's +feature catalog. The lead-scoring catalog +(:data:`~leadforge.schemes.lead_scoring.features.LEAD_SNAPSHOT_FEATURES`) +lives in :mod:`leadforge.schemes.lead_scoring.features`. """ from __future__ import annotations @@ -15,7 +15,7 @@ @dataclass(frozen=True) class FeatureSpec: - """Metadata for one column in the lead snapshot table. + """Metadata for one column in a scheme's snapshot table. Two concerns are kept deliberately separate: @@ -60,265 +60,3 @@ class FeatureSpec: leakage_risk: bool = False non_negative: bool = False redact_in_modes: frozenset[ExposureMode] = field(default_factory=frozenset) - - -# --------------------------------------------------------------------------- -# Canonical feature list — lead snapshot -# --------------------------------------------------------------------------- - -LEAD_SNAPSHOT_FEATURES: tuple[FeatureSpec, ...] 
= ( - # -- Account features -- - FeatureSpec("account_id", "string", "Opaque account identifier.", "account"), - FeatureSpec( - "industry", - "string", - "Industry vertical of the buying organization.", - "account", - ), - FeatureSpec( - "region", - "string", - "Geographic region of the account's headquarters.", - "account", - ), - FeatureSpec( - "employee_band", - "string", - "Banded employee headcount of the account.", - "account", - ), - FeatureSpec( - "estimated_revenue_band", - "string", - "Banded estimated annual revenue of the account.", - "account", - ), - FeatureSpec( - "process_maturity_band", - "string", - "Banded internal process maturity score (latent).", - "account", - leakage_risk=False, - ), - # -- Contact features -- - FeatureSpec("contact_id", "string", "Opaque contact identifier.", "contact"), - FeatureSpec( - "role_function", - "string", - "Functional area of the primary contact (e.g. finance, ops).", - "contact", - ), - FeatureSpec( - "seniority", - "string", - "Seniority band of the primary contact.", - "contact", - ), - FeatureSpec( - "buyer_role", - "string", - "Buyer role classification (economic_buyer, champion, etc.).", - "contact", - ), - # -- Lead metadata features -- - FeatureSpec("lead_id", "string", "Opaque lead identifier.", "lead_meta"), - FeatureSpec( - "lead_created_at", - "string", - "ISO-8601 timestamp when the lead was created.", - "lead_meta", - ), - FeatureSpec( - "lead_source", - "string", - "Origination source of the lead (e.g. inbound_form, sdr_outbound).", - "lead_meta", - ), - # Note: ``first_touch_channel`` is absent from this list. In v1 the - # simulation sets it to the same value as ``lead_source`` (both derive - # from the channel drawn during lead creation), making it byte-identical - # and zero-information. It is retained in the relational ``leads`` - # table for completeness; it is excluded from the flat snapshot because - # a duplicate column would be actively misleading in a teaching dataset. 
- FeatureSpec( - "current_stage", - "string", - "Funnel stage at snapshot anchor date. WARNING: at full-horizon " - "(90-day) snapshots this contains terminal stages (closed_won / " - "closed_lost) that encode the label. Exclude from modeling or use " - "a windowed snapshot.", - "lead_meta", - leakage_risk=True, - redact_in_modes=frozenset({ExposureMode.student_public}), - ), - # Note: ``is_mql`` was removed from the canonical feature list (issue #57) - # because every lead is initialised at MQL stage in - # ``leadforge/simulation/population.py``, making the column constant - # ``True`` and zero-variance. The underlying ``LeadRow.is_mql`` field - # still lives on the relational ``leads.parquet`` table. - FeatureSpec( - "is_sql", - "boolean", - "Whether the lead had achieved SQL status at snapshot date. " - "Strongly correlated with the label: the simulator only converts " - "non-SQL leads via a rare direct-conversion path, so " - "is_sql=False predicts non-conversion with very high probability " - "(P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across difficulty " - "tiers). 
Redacted from student_public bundles.", - "lead_meta", - leakage_risk=True, - redact_in_modes=frozenset({ExposureMode.student_public}), - ), - # -- Engagement features -- - FeatureSpec( - "touch_count", - "Int64", - "Total number of marketing/sales touches recorded before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "inbound_touch_count", - "Int64", - "Number of inbound touches before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "outbound_touch_count", - "Int64", - "Number of outbound touches before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "session_count", - "Int64", - "Number of web/trial sessions recorded before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "pricing_page_views", - "Int64", - "Cumulative pricing page views across all sessions before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "demo_page_views", - "Int64", - "Cumulative demo page views across all sessions before snapshot.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "total_session_duration_seconds", - "Int64", - "Sum of session durations (seconds) before snapshot.", - "engagement", - non_negative=True, - ), - # -- Momentum features -- - FeatureSpec( - "touches_days_0_7", - "Int64", - "Number of touches in days 0–7 (inclusive) after lead creation.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "touches_last_7_days", - "Int64", - "Number of touches in the last 7 days before snapshot cutoff.", - "engagement", - non_negative=True, - ), - FeatureSpec( - "days_since_first_touch", - "Float64", - "Days between first touch and snapshot cutoff (NaN if no touches).", - "engagement", - non_negative=True, - ), - # -- Sales activity features -- - FeatureSpec( - "activity_count", - "Int64", - "Number of sales activities logged before snapshot.", - "sales", - non_negative=True, - ), - FeatureSpec( - "days_since_last_touch", - "Float64", - "Days elapsed 
between most recent touch and snapshot cutoff.", - "sales", - non_negative=True, - ), - FeatureSpec( - "opportunity_created", - "boolean", - "Whether any opportunity was created by snapshot date (open or closed).", - "sales", - ), - FeatureSpec( - "has_open_opportunity", - "boolean", - "Whether an open opportunity existed at snapshot date.", - "sales", - ), - FeatureSpec( - "opportunity_estimated_acv", - "Float64", - "Estimated ACV of the most recent open opportunity (NaN if none).", - "sales", - non_negative=True, - ), - FeatureSpec( - "expected_acv", - "Float64", - "Expected ACV: opportunity ACV if available by snapshot, else " - "revenue band midpoint heuristic (NaN if neither available).", - "sales", - non_negative=True, - ), - # -- Pedagogical leakage trap (deliberately retained in all modes) -- - FeatureSpec( - "total_touches_all", - "Int64", - "Total touches over full 90-day window. LEAKAGE TRAP: uses " - "post-snapshot data. Included for pedagogical purposes only.", - "engagement", - leakage_risk=True, - ), - # -- Target -- - FeatureSpec( - "converted_within_90_days", - "boolean", - "Label: True if a closed_won event occurred within 90 days of " - "the snapshot anchor date. Derived from simulated events.", - "target", - is_target=True, - ), -) - - -def redacted_columns_for( - mode: ExposureMode, - features: tuple[FeatureSpec, ...] = LEAD_SNAPSHOT_FEATURES, -) -> frozenset[str]: - """Return the set of column names that must be stripped from *mode* exports. - - The redaction policy is encoded per-feature in - :attr:`FeatureSpec.redact_in_modes`. Callers (the bundle writer, the - validation check) all derive their answer from this single function, so - a single source of truth governs both producing and verifying bundles. - - Args: - mode: The exposure mode being published. - features: Feature spec tuple to consult. Defaults to the canonical - lead snapshot list; callable with a custom tuple for tests or - future per-recipe feature sets. 
- """ - return frozenset(f.name for f in features if mode in f.redact_in_modes) diff --git a/leadforge/schema/relationships.py b/leadforge/schema/relationships.py index 4ec501c..46c69ec 100644 --- a/leadforge/schema/relationships.py +++ b/leadforge/schema/relationships.py @@ -1,8 +1,15 @@ -"""Foreign-key relationship definitions and validation helpers. +"""Shared FK-constraint primitives. -Describes the canonical FK graph for the v1 relational model and provides -:func:`validate_fk` to assert referential integrity on a collection of rows -before they are written to Parquet. +:class:`FKConstraint`, :class:`FKViolationError`, and :func:`validate_fk` +are scheme-agnostic utilities; both the lead-scoring and lifecycle schemes +build their FK catalogs (``ALL_CONSTRAINTS``, ``LIFECYCLE_CONSTRAINTS``) with +them. + +Lead-scoring ``ALL_CONSTRAINTS`` lives in +:mod:`leadforge.schemes.lead_scoring.relationships`. + +Lifecycle ``LIFECYCLE_CONSTRAINTS`` lives in +:mod:`leadforge.schemes.lifecycle.relationships`. """ from __future__ import annotations @@ -26,26 +33,6 @@ class FKConstraint: parent_column: str -# All v1 FK constraints, derived from §9.2 of the architecture spec. -ALL_CONSTRAINTS: tuple[FKConstraint, ...] 
= ( - FKConstraint("contacts", "account_id", "accounts", "account_id"), - FKConstraint("leads", "account_id", "accounts", "account_id"), - FKConstraint("leads", "contact_id", "contacts", "contact_id"), - FKConstraint("touches", "lead_id", "leads", "lead_id"), - FKConstraint("sessions", "lead_id", "leads", "lead_id"), - FKConstraint("sales_activities", "lead_id", "leads", "lead_id"), - FKConstraint("opportunities", "lead_id", "leads", "lead_id"), - FKConstraint("customers", "opportunity_id", "opportunities", "opportunity_id"), - FKConstraint("customers", "account_id", "accounts", "account_id"), - FKConstraint("subscriptions", "customer_id", "customers", "customer_id"), -) - - -# Lifecycle (b2b_saas_ltv_v1) FK constraints (LIFECYCLE_CONSTRAINTS) now live in -# ``leadforge.schemes.lifecycle.relationships`` (moved in LTV-Pg). They reuse -# the shared FKConstraint primitive above. - - def validate_fk( child_values: list[str], parent_values: set[str], diff --git a/leadforge/schema/tasks.py b/leadforge/schema/tasks.py index b86bb34..0a2d6ef 100644 --- a/leadforge/schema/tasks.py +++ b/leadforge/schema/tasks.py @@ -1,13 +1,15 @@ -"""Task manifest definition for the primary v1 classification task. +"""Shared task-manifest primitives. -A :class:`TaskManifest` describes everything needed to reconstruct the task -from the output bundle: the label column, the time window, the split ratios, -and the table it lives in. +:class:`SplitSpec` and :class:`TaskManifest` are scheme-agnostic types used by +every scheme's task definition. The lead-scoring task definition +(:data:`~leadforge.schemes.lead_scoring.tasks.CONVERTED_WITHIN_90_DAYS`, +:func:`~leadforge.schemes.lead_scoring.tasks.task_manifest_for_config`) lives +in :mod:`leadforge.schemes.lead_scoring.tasks`. 
""" from __future__ import annotations -from dataclasses import dataclass, replace +from dataclasses import dataclass @dataclass(frozen=True) @@ -86,60 +88,3 @@ def to_dict(self) -> dict[str, object]: }, "description": self.description, } - - -# --------------------------------------------------------------------------- -# v1 task definition -# --------------------------------------------------------------------------- - -CONVERTED_WITHIN_90_DAYS: TaskManifest = TaskManifest( - task_id="converted_within_90_days", - label_column="converted_within_90_days", - label_window_days=90, - primary_table="leads", - split=SplitSpec(train=0.7, valid=0.15, test=0.15), - task_type="binary_classification", - description=( - "A lead is considered converted if a `closed_won` event is recorded " - "within 90 days of the lead's snapshot anchor date. The label is " - "event-derived — never sampled directly. All features are pre-anchor " - "(leakage-free by construction)." - ), -) - - -def task_manifest_for_config( - primary_task: str = CONVERTED_WITHIN_90_DAYS.task_id, - label_window_days: int = CONVERTED_WITHIN_90_DAYS.label_window_days, -) -> TaskManifest: - """Build a :class:`TaskManifest` from generation config fields. - - Derives from :data:`CONVERTED_WITHIN_90_DAYS` via ``dataclasses.replace``, - overriding only the fields that vary. When *primary_task* and - *label_window_days* match the defaults, this returns an equivalent manifest. - - Args: - primary_task: Task identifier — used as the task directory name and - manifest key. - label_window_days: Label observation window in days. - """ - if primary_task == CONVERTED_WITHIN_90_DAYS.task_id: - description = ( - f"A lead is considered converted if a `closed_won` event is recorded " - f"within {label_window_days} days of the lead's snapshot anchor date. " - f"The label is event-derived — never sampled directly. All features " - f"are pre-anchor (leakage-free by construction)." 
- ) - else: - description = ( - f"Binary label `{primary_task}` evaluated over a " - f"{label_window_days}-day window from the snapshot anchor date. " - f"The label is event-derived — never sampled directly. All features " - f"are pre-anchor (leakage-free by construction)." - ) - return replace( - CONVERTED_WITHIN_90_DAYS, - task_id=primary_task, - label_window_days=label_window_days, - description=description, - ) diff --git a/leadforge/schemes/lead_scoring/__init__.py b/leadforge/schemes/lead_scoring/__init__.py index e8a7dec..b6249c2 100644 --- a/leadforge/schemes/lead_scoring/__init__.py +++ b/leadforge/schemes/lead_scoring/__init__.py @@ -164,14 +164,17 @@ def write_bundle( from leadforge.render.manifests import build_manifest, write_manifest from leadforge.render.relational_io import write_relational_tables from leadforge.schema.dictionaries import write_feature_dictionary - from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES, redacted_columns_for - from leadforge.schema.tasks import task_manifest_for_config + from leadforge.schemes.lead_scoring.features import ( + LEAD_SNAPSHOT_FEATURES, + redacted_columns_for, + ) from leadforge.schemes.lead_scoring.render.relational import to_dataframes from leadforge.schemes.lead_scoring.render.relational_snapshot_safe import ( to_dataframes_snapshot_safe, ) from leadforge.schemes.lead_scoring.render.snapshots import build_snapshot from leadforge.schemes.lead_scoring.render.tasks import write_task_splits + from leadforge.schemes.lead_scoring.tasks import task_manifest_for_config if ( bundle.simulation_result is None diff --git a/leadforge/schemes/lead_scoring/entities.py b/leadforge/schemes/lead_scoring/entities.py new file mode 100644 index 0000000..8ad6b60 --- /dev/null +++ b/leadforge/schemes/lead_scoring/entities.py @@ -0,0 +1,345 @@ +"""Lead-scoring (``b2b_saas_procurement_v1``) entity row contracts. 
+ +The 8 lead-scoring-specific entity row classes and the ``ALL_ROW_TYPES`` / +``TABLE_NAMES`` catalog live here. They are distinct from the lifecycle rows +in :mod:`leadforge.schemes.lifecycle.entities`. + +``AccountRow`` and the shared primitives (``EntityRowProtocol``, +``make_empty_dataframe``) remain in :mod:`leadforge.schema.entities`; both +schemes import them from there. +""" + +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Any, ClassVar + +import pandas as pd + +from leadforge.schema.entities import ( + AccountRow, + EntityRowProtocol, + make_empty_dataframe, +) + +# --------------------------------------------------------------------------- +# contacts +# --------------------------------------------------------------------------- + + +@dataclass +class ContactRow: + """One row in the ``contacts`` table.""" + + TABLE_NAME: ClassVar[str] = "contacts" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "contact_id": "string", + "account_id": "string", + "job_title": "string", + "role_function": "string", + "seniority": "string", + "buyer_role": "string", + "email_domain_type": "string", + "created_at": "string", + } + + contact_id: str + account_id: str + job_title: str + role_function: str + seniority: str + buyer_role: str + email_domain_type: str + created_at: str + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# leads +# --------------------------------------------------------------------------- + + +@dataclass +class LeadRow: + """One row in the ``leads`` table. + + .. note:: The ``converted_within_90_days`` field name is retained for + schema stability, but its value is derived using + ``GenerationConfig.label_window_days`` (which defaults to 90). 
A + lead is marked ``True`` only if its conversion event occurred before + ``label_window_days`` from lead creation — **not** necessarily within + the full ``horizon_days`` simulation window. + + Consequently, ``conversion_timestamp`` may be set (non-``None``) + while ``converted_within_90_days`` is ``False``, indicating the lead + converted after the label observation window closed. + """ + + TABLE_NAME: ClassVar[str] = "leads" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "lead_id": "string", + "contact_id": "string", + "account_id": "string", + "lead_created_at": "string", + "lead_source": "string", + "first_touch_channel": "string", + "current_stage": "string", + "owner_rep_id": "string", + "is_sql": "boolean", + "converted_within_90_days": "boolean", + "conversion_timestamp": "string", + } + + # ``is_mql`` was removed in bundle schema v3 (issue #57). Every lead + # is initialised at MQL stage in ``simulation/population.py``, so the + # field was constant ``True`` and zero-variance across all bundles. 
+ + lead_id: str + contact_id: str + account_id: str + lead_created_at: str + lead_source: str + first_touch_channel: str + current_stage: str + owner_rep_id: str + is_sql: bool + converted_within_90_days: bool + conversion_timestamp: str | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# touches +# --------------------------------------------------------------------------- + + +@dataclass +class TouchRow: + """One row in the ``touches`` table.""" + + TABLE_NAME: ClassVar[str] = "touches" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "touch_id": "string", + "lead_id": "string", + "touch_timestamp": "string", + "touch_type": "string", + "touch_channel": "string", + "touch_direction": "string", + "campaign_id": "string", + } + + touch_id: str + lead_id: str + touch_timestamp: str + touch_type: str + touch_channel: str + touch_direction: str + campaign_id: str | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# sessions +# --------------------------------------------------------------------------- + + +@dataclass +class SessionRow: + """One row in the ``sessions`` table.""" + + TABLE_NAME: ClassVar[str] = "sessions" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "session_id": "string", + "lead_id": "string", + "session_timestamp": "string", + "session_type": "string", + "page_views": "Int64", + "pricing_page_views": "Int64", + "demo_page_views": "Int64", + "session_duration_seconds": "Int64", + } + + session_id: str + lead_id: str + session_timestamp: str + 
session_type: str + page_views: int + pricing_page_views: int + demo_page_views: int + session_duration_seconds: int + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# sales_activities +# --------------------------------------------------------------------------- + + +@dataclass +class SalesActivityRow: + """One row in the ``sales_activities`` table.""" + + TABLE_NAME: ClassVar[str] = "sales_activities" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "activity_id": "string", + "lead_id": "string", + "rep_id": "string", + "activity_timestamp": "string", + "activity_type": "string", + "activity_outcome": "string", + } + + activity_id: str + lead_id: str + rep_id: str + activity_timestamp: str + activity_type: str + activity_outcome: str + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# opportunities +# --------------------------------------------------------------------------- + + +@dataclass +class OpportunityRow: + """One row in the ``opportunities`` table.""" + + TABLE_NAME: ClassVar[str] = "opportunities" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "opportunity_id": "string", + "lead_id": "string", + "created_at": "string", + "stage": "string", + "estimated_acv": "Int64", + "close_outcome": "string", + "closed_at": "string", + } + + opportunity_id: str + lead_id: str + created_at: str + stage: str + estimated_acv: int + close_outcome: str | None = None + closed_at: str | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} 
+ + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# customers +# --------------------------------------------------------------------------- + + +@dataclass +class CustomerRow: + """One row in the ``customers`` table.""" + + TABLE_NAME: ClassVar[str] = "customers" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "customer_id": "string", + "opportunity_id": "string", + "account_id": "string", + "customer_start_at": "string", + } + + customer_id: str + opportunity_id: str + account_id: str + customer_start_at: str + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# subscriptions +# --------------------------------------------------------------------------- + + +@dataclass +class SubscriptionRow: + """One row in the ``subscriptions`` table.""" + + TABLE_NAME: ClassVar[str] = "subscriptions" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "subscription_id": "string", + "customer_id": "string", + "plan_name": "string", + "subscription_start_at": "string", + "subscription_status": "string", + } + + subscription_id: str + customer_id: str + plan_name: str + subscription_start_at: str + subscription_status: str + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# Lead-scoring catalog +# --------------------------------------------------------------------------- + +ALL_ROW_TYPES: tuple[type[EntityRowProtocol], ...] 
= ( + AccountRow, + ContactRow, + LeadRow, + TouchRow, + SessionRow, + SalesActivityRow, + OpportunityRow, + CustomerRow, + SubscriptionRow, +) + +TABLE_NAMES: tuple[str, ...] = tuple(cls.TABLE_NAME for cls in ALL_ROW_TYPES) diff --git a/leadforge/schemes/lead_scoring/features.py b/leadforge/schemes/lead_scoring/features.py new file mode 100644 index 0000000..b6949a1 --- /dev/null +++ b/leadforge/schemes/lead_scoring/features.py @@ -0,0 +1,271 @@ +"""Lead-scoring (``b2b_saas_procurement_v1``) feature spec. + +:data:`LEAD_SNAPSHOT_FEATURES` is the canonical ordered list of features +present in the primary lead-scoring task export. The shared +:class:`~leadforge.schema.features.FeatureSpec` primitive stays in +:mod:`leadforge.schema.features`. +""" + +from __future__ import annotations + +from leadforge.core.enums import ExposureMode +from leadforge.schema.features import FeatureSpec + + +def redacted_columns_for( + mode: ExposureMode, + features: tuple[FeatureSpec, ...] | None = None, +) -> frozenset[str]: + """Return the set of column names that must be stripped from *mode* exports. + + Args: + mode: The exposure mode being published. + features: Feature spec tuple to consult. Defaults to the canonical + :data:`LEAD_SNAPSHOT_FEATURES` list. + """ + if features is None: + features = LEAD_SNAPSHOT_FEATURES + return frozenset(f.name for f in features if mode in f.redact_in_modes) + + +# --------------------------------------------------------------------------- +# Canonical feature list — lead snapshot +# --------------------------------------------------------------------------- + +LEAD_SNAPSHOT_FEATURES: tuple[FeatureSpec, ...] 
= ( + # -- Account features -- + FeatureSpec("account_id", "string", "Opaque account identifier.", "account"), + FeatureSpec( + "industry", + "string", + "Industry vertical of the buying organization.", + "account", + ), + FeatureSpec( + "region", + "string", + "Geographic region of the account's headquarters.", + "account", + ), + FeatureSpec( + "employee_band", + "string", + "Banded employee headcount of the account.", + "account", + ), + FeatureSpec( + "estimated_revenue_band", + "string", + "Banded estimated annual revenue of the account.", + "account", + ), + FeatureSpec( + "process_maturity_band", + "string", + "Banded internal process maturity score (latent).", + "account", + leakage_risk=False, + ), + # -- Contact features -- + FeatureSpec("contact_id", "string", "Opaque contact identifier.", "contact"), + FeatureSpec( + "role_function", + "string", + "Functional area of the primary contact (e.g. finance, ops).", + "contact", + ), + FeatureSpec( + "seniority", + "string", + "Seniority band of the primary contact.", + "contact", + ), + FeatureSpec( + "buyer_role", + "string", + "Buyer role classification (economic_buyer, champion, etc.).", + "contact", + ), + # -- Lead metadata features -- + FeatureSpec("lead_id", "string", "Opaque lead identifier.", "lead_meta"), + FeatureSpec( + "lead_created_at", + "string", + "ISO-8601 timestamp when the lead was created.", + "lead_meta", + ), + FeatureSpec( + "lead_source", + "string", + "Origination source of the lead (e.g. inbound_form, sdr_outbound).", + "lead_meta", + ), + # Note: ``first_touch_channel`` is absent from this list. In v1 the + # simulation sets it to the same value as ``lead_source`` (both derive + # from the channel drawn during lead creation), making it byte-identical + # and zero-information. It is retained in the relational ``leads`` + # table for completeness; it is excluded from the flat snapshot because + # a duplicate column would be actively misleading in a teaching dataset. 
+ FeatureSpec( + "current_stage", + "string", + "Funnel stage at snapshot anchor date. WARNING: at full-horizon " + "(90-day) snapshots this contains terminal stages (closed_won / " + "closed_lost) that encode the label. Exclude from modeling or use " + "a windowed snapshot.", + "lead_meta", + leakage_risk=True, + redact_in_modes=frozenset({ExposureMode.student_public}), + ), + # Note: ``is_mql`` was removed from the canonical feature list (issue #57) + # because every lead is initialised at MQL stage in + # ``leadforge/schemes/lead_scoring/simulation/population.py``, making the + # column constant ``True`` and zero-variance. The same change (bundle + # schema v3) dropped the field from ``LeadRow``, so it is no longer on + # the relational ``leads`` table either. + FeatureSpec( + "is_sql", + "boolean", + "Whether the lead had achieved SQL status at snapshot date. " + "Strongly correlated with the label: the simulator only converts " + "non-SQL leads via a rare direct-conversion path, so " + "is_sql=False predicts non-conversion with very high probability " + "(P(conv | is_sql=False) ≈ 0.04 / 0.015 / 0.006 across difficulty " + "tiers). 
Redacted from student_public bundles.", + "lead_meta", + leakage_risk=True, + redact_in_modes=frozenset({ExposureMode.student_public}), + ), + # -- Engagement features -- + FeatureSpec( + "touch_count", + "Int64", + "Total number of marketing/sales touches recorded before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "inbound_touch_count", + "Int64", + "Number of inbound touches before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "outbound_touch_count", + "Int64", + "Number of outbound touches before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "session_count", + "Int64", + "Number of web/trial sessions recorded before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "pricing_page_views", + "Int64", + "Cumulative pricing page views across all sessions before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "demo_page_views", + "Int64", + "Cumulative demo page views across all sessions before snapshot.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "total_session_duration_seconds", + "Int64", + "Sum of session durations (seconds) before snapshot.", + "engagement", + non_negative=True, + ), + # -- Momentum features -- + FeatureSpec( + "touches_days_0_7", + "Int64", + "Number of touches in days 0–7 (inclusive) after lead creation.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "touches_last_7_days", + "Int64", + "Number of touches in the last 7 days before snapshot cutoff.", + "engagement", + non_negative=True, + ), + FeatureSpec( + "days_since_first_touch", + "Float64", + "Days between first touch and snapshot cutoff (NaN if no touches).", + "engagement", + non_negative=True, + ), + # -- Sales activity features -- + FeatureSpec( + "activity_count", + "Int64", + "Number of sales activities logged before snapshot.", + "sales", + non_negative=True, + ), + FeatureSpec( + "days_since_last_touch", + "Float64", + "Days elapsed 
between most recent touch and snapshot cutoff.", + "sales", + non_negative=True, + ), + FeatureSpec( + "opportunity_created", + "boolean", + "Whether any opportunity was created by snapshot date (open or closed).", + "sales", + ), + FeatureSpec( + "has_open_opportunity", + "boolean", + "Whether an open opportunity existed at snapshot date.", + "sales", + ), + FeatureSpec( + "opportunity_estimated_acv", + "Float64", + "Estimated ACV of the most recent open opportunity (NaN if none).", + "sales", + non_negative=True, + ), + FeatureSpec( + "expected_acv", + "Float64", + "Expected ACV: opportunity ACV if available by snapshot, else " + "revenue band midpoint heuristic (NaN if neither available).", + "sales", + non_negative=True, + ), + # -- Pedagogical leakage trap (deliberately retained in all modes) -- + FeatureSpec( + "total_touches_all", + "Int64", + "Total touches over full 90-day window. LEAKAGE TRAP: uses " + "post-snapshot data. Included for pedagogical purposes only.", + "engagement", + leakage_risk=True, + ), + # -- Target -- + FeatureSpec( + "converted_within_90_days", + "boolean", + "Label: True if a closed_won event occurred within 90 days of " + "the snapshot anchor date. Derived from simulated events.", + "target", + is_target=True, + ), +) diff --git a/leadforge/schemes/lead_scoring/relationships.py b/leadforge/schemes/lead_scoring/relationships.py new file mode 100644 index 0000000..505e21f --- /dev/null +++ b/leadforge/schemes/lead_scoring/relationships.py @@ -0,0 +1,23 @@ +"""Lead-scoring (``b2b_saas_procurement_v1``) foreign-key constraints. + +``ALL_CONSTRAINTS`` lives here; the shared primitives (:class:`FKConstraint`, +:func:`validate_fk`, :class:`FKViolationError`) remain in +:mod:`leadforge.schema.relationships`. +""" + +from __future__ import annotations + +from leadforge.schema.relationships import FKConstraint + +ALL_CONSTRAINTS: tuple[FKConstraint, ...] 
= ( + FKConstraint("contacts", "account_id", "accounts", "account_id"), + FKConstraint("leads", "account_id", "accounts", "account_id"), + FKConstraint("leads", "contact_id", "contacts", "contact_id"), + FKConstraint("touches", "lead_id", "leads", "lead_id"), + FKConstraint("sessions", "lead_id", "leads", "lead_id"), + FKConstraint("sales_activities", "lead_id", "leads", "lead_id"), + FKConstraint("opportunities", "lead_id", "leads", "lead_id"), + FKConstraint("customers", "opportunity_id", "opportunities", "opportunity_id"), + FKConstraint("customers", "account_id", "accounts", "account_id"), + FKConstraint("subscriptions", "customer_id", "customers", "customer_id"), +) diff --git a/leadforge/schemes/lead_scoring/render/relational.py b/leadforge/schemes/lead_scoring/render/relational.py index 62387e2..8e1b0e1 100644 --- a/leadforge/schemes/lead_scoring/render/relational.py +++ b/leadforge/schemes/lead_scoring/render/relational.py @@ -13,7 +13,7 @@ import pandas as pd -from leadforge.schema.entities import ( +from leadforge.schemes.lead_scoring.entities import ( AccountRow, ContactRow, CustomerRow, diff --git a/leadforge/schemes/lead_scoring/render/snapshots.py b/leadforge/schemes/lead_scoring/render/snapshots.py index b770eee..e905957 100644 --- a/leadforge/schemes/lead_scoring/render/snapshots.py +++ b/leadforge/schemes/lead_scoring/render/snapshots.py @@ -17,13 +17,13 @@ import pandas as pd from leadforge.core.rng import RNGRoot -from leadforge.schema.entities import ( +from leadforge.schemes.lead_scoring.entities import ( OpportunityRow, SalesActivityRow, SessionRow, TouchRow, ) -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES from leadforge.schemes.lead_scoring.simulation.population import REVENUE_BAND_MIDPOINTS if TYPE_CHECKING: diff --git a/leadforge/schemes/lead_scoring/render/tasks.py b/leadforge/schemes/lead_scoring/render/tasks.py index d8e23a6..8a69882 100644 --- 
a/leadforge/schemes/lead_scoring/render/tasks.py +++ b/leadforge/schemes/lead_scoring/render/tasks.py @@ -14,7 +14,7 @@ import pandas as pd from leadforge.core.rng import RNGRoot -from leadforge.schema.tasks import CONVERTED_WITHIN_90_DAYS, TaskManifest +from leadforge.schemes.lead_scoring.tasks import CONVERTED_WITHIN_90_DAYS, TaskManifest def write_task_splits( diff --git a/leadforge/schemes/lead_scoring/simulation/engine.py b/leadforge/schemes/lead_scoring/simulation/engine.py index f57f346..1683aa7 100644 --- a/leadforge/schemes/lead_scoring/simulation/engine.py +++ b/leadforge/schemes/lead_scoring/simulation/engine.py @@ -38,10 +38,10 @@ Post-simulation entity creation -------------------------------- -- An :class:`~leadforge.schema.entities.OpportunityRow` is created for every +- An :class:`~leadforge.schemes.lead_scoring.entities.OpportunityRow` is created for every lead that reached ``sql`` or any deeper stage. -- :class:`~leadforge.schema.entities.CustomerRow` and - :class:`~leadforge.schema.entities.SubscriptionRow` are created only for +- :class:`~leadforge.schemes.lead_scoring.entities.CustomerRow` and + :class:`~leadforge.schemes.lead_scoring.entities.SubscriptionRow` are created only for converted leads (``closed_won``). """ @@ -54,7 +54,7 @@ from leadforge.core.ids import ID_PREFIXES, make_id from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot -from leadforge.schema.entities import ( +from leadforge.schemes.lead_scoring.entities import ( CustomerRow, LeadRow, OpportunityRow, @@ -147,7 +147,7 @@ class SimulationResult: ascending lead-index across leads). Args: - leads: Updated :class:`~leadforge.schema.entities.LeadRow` list + leads: Updated :class:`~leadforge.schemes.lead_scoring.entities.LeadRow` list with simulation outcomes filled in. 
""" diff --git a/leadforge/schemes/lead_scoring/simulation/population.py b/leadforge/schemes/lead_scoring/simulation/population.py index ffc5fcf..bc9e133 100644 --- a/leadforge/schemes/lead_scoring/simulation/population.py +++ b/leadforge/schemes/lead_scoring/simulation/population.py @@ -32,7 +32,7 @@ from leadforge.core.ids import ID_PREFIXES, make_id from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot -from leadforge.schema.entities import AccountRow, ContactRow, LeadRow +from leadforge.schemes.lead_scoring.entities import AccountRow, ContactRow, LeadRow if TYPE_CHECKING: from leadforge.narrative.spec import NarrativeSpec diff --git a/leadforge/schemes/lead_scoring/simulation/state.py b/leadforge/schemes/lead_scoring/simulation/state.py index 2ad421b..b1d1c94 100644 --- a/leadforge/schemes/lead_scoring/simulation/state.py +++ b/leadforge/schemes/lead_scoring/simulation/state.py @@ -3,7 +3,7 @@ :class:`LeadSimState` is the only mutable object touched by :func:`~leadforge.schemes.lead_scoring.simulation.engine.simulate_world`. After the simulation loop completes, the final state of each instance is used to populate the -:class:`~leadforge.schema.entities.LeadRow` and any post-conversion entity +:class:`~leadforge.schemes.lead_scoring.entities.LeadRow` and any post-conversion entity rows (opportunity, customer, subscription). """ diff --git a/leadforge/schemes/lead_scoring/tasks.py b/leadforge/schemes/lead_scoring/tasks.py new file mode 100644 index 0000000..1f2b8de --- /dev/null +++ b/leadforge/schemes/lead_scoring/tasks.py @@ -0,0 +1,63 @@ +"""Lead-scoring (``b2b_saas_procurement_v1``) task definitions. + +:data:`CONVERTED_WITHIN_90_DAYS` and :func:`task_manifest_for_config` live here. +The shared primitives (:class:`~leadforge.schema.tasks.SplitSpec`, +:class:`~leadforge.schema.tasks.TaskManifest`) stay in +:mod:`leadforge.schema.tasks`. 
+""" + +from __future__ import annotations + +from dataclasses import replace + +from leadforge.schema.tasks import SplitSpec, TaskManifest + +CONVERTED_WITHIN_90_DAYS: TaskManifest = TaskManifest( + task_id="converted_within_90_days", + label_column="converted_within_90_days", + label_window_days=90, + primary_table="leads", + split=SplitSpec(train=0.7, valid=0.15, test=0.15), + task_type="binary_classification", + description=( + "A lead is considered converted if a `closed_won` event is recorded " + "within 90 days of the lead's snapshot anchor date. The label is " + "event-derived — never sampled directly. All features are pre-anchor " + "(leakage-free by construction)." + ), +) + + +def task_manifest_for_config( + primary_task: str = CONVERTED_WITHIN_90_DAYS.task_id, + label_window_days: int = CONVERTED_WITHIN_90_DAYS.label_window_days, +) -> TaskManifest: + """Build a :class:`~leadforge.schema.tasks.TaskManifest` from config fields. + + Derives from :data:`CONVERTED_WITHIN_90_DAYS` via ``dataclasses.replace``, + overriding only the fields that vary. + + Args: + primary_task: Task identifier — used as the task directory name. + label_window_days: Label observation window in days. + """ + if primary_task == CONVERTED_WITHIN_90_DAYS.task_id: + description = ( + f"A lead is considered converted if a `closed_won` event is recorded " + f"within {label_window_days} days of the lead's snapshot anchor date. " + f"The label is event-derived — never sampled directly. All features " + f"are pre-anchor (leakage-free by construction)." + ) + else: + description = ( + f"Binary label `{primary_task}` evaluated over a " + f"{label_window_days}-day window from the snapshot anchor date. " + f"The label is event-derived — never sampled directly. All features " + f"are pre-anchor (leakage-free by construction)." 
+ ) + return replace( + CONVERTED_WITHIN_90_DAYS, + task_id=primary_task, + label_window_days=label_window_days, + description=description, + ) diff --git a/leadforge/schemes/lifecycle/entities.py b/leadforge/schemes/lifecycle/entities.py index 9dfc973..e85b3a3 100644 --- a/leadforge/schemes/lifecycle/entities.py +++ b/leadforge/schemes/lifecycle/entities.py @@ -6,8 +6,8 @@ column schemas are unaffected. The lifecycle bundle's ``customers`` and ``subscriptions`` tables are richer -than the thin lead-scoring :class:`~leadforge.schema.entities.CustomerRow` / -:class:`~leadforge.schema.entities.SubscriptionRow` (which exist only to record +than the thin lead-scoring :class:`~leadforge.schemes.lead_scoring.entities.CustomerRow` / +:class:`~leadforge.schemes.lead_scoring.entities.SubscriptionRow` (which exist only to record conversion in the procurement world). Rather than extend those classes in place — which would change the lead-scoring instructor bundle's parquet schema, since ``to_dict()`` emits every field — the lifecycle bundle uses the dedicated diff --git a/leadforge/validation/bundle_checks.py b/leadforge/validation/bundle_checks.py index ad4215c..98e06d1 100644 --- a/leadforge/validation/bundle_checks.py +++ b/leadforge/validation/bundle_checks.py @@ -18,8 +18,8 @@ from leadforge.core.exceptions import LeadforgeError from leadforge.core.hashing import file_sha256 from leadforge.core.serialization import load_json -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES, redacted_columns_for -from leadforge.schema.relationships import ALL_CONSTRAINTS +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, redacted_columns_for +from leadforge.schemes.lead_scoring.relationships import ALL_CONSTRAINTS from leadforge.validation.difficulty import check_difficulty from leadforge.validation.leakage_probes import ( BANNED_TABLES, diff --git a/leadforge/validation/invariants.py b/leadforge/validation/invariants.py index d1423a3..84bf3a7 100644 
--- a/leadforge/validation/invariants.py +++ b/leadforge/validation/invariants.py @@ -17,7 +17,7 @@ from leadforge.core.enums import ExposureMode from leadforge.core.hashing import file_sha256 from leadforge.render.manifests import NON_DETERMINISTIC_MANIFEST_FIELDS -from leadforge.schema.features import redacted_columns_for +from leadforge.schemes.lead_scoring.features import redacted_columns_for from leadforge.validation.leakage_probes import ( BANNED_LEAD_COLUMNS, BANNED_OPP_COLUMNS, diff --git a/leadforge/validation/realism.py b/leadforge/validation/realism.py index afa0187..7672a9a 100644 --- a/leadforge/validation/realism.py +++ b/leadforge/validation/realism.py @@ -13,7 +13,7 @@ import pandas as pd import pyarrow.parquet as pq -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES # Derive check lists from the canonical schema to avoid silent drift. _COUNT_FEATURES = [f.name for f in LEAD_SNAPSHOT_FEATURES if f.dtype == "Int64"] @@ -49,7 +49,7 @@ def _first_task_train_path(root: Path, manifest: dict[str, Any]) -> Path | None: # The label column in the snapshot is always ``converted_within_90_days`` -# (mirroring :class:`~leadforge.schema.entities.LeadRow`). The task *directory* +# (mirroring :class:`~leadforge.schemes.lead_scoring.entities.LeadRow`). The task *directory* # may vary via ``config.primary_task``, but the column inside does not. 
_LABEL_COLUMN = "converted_within_90_days" diff --git a/leadforge/validation/release_quality.py b/leadforge/validation/release_quality.py index 87b6883..76e3e5f 100644 --- a/leadforge/validation/release_quality.py +++ b/leadforge/validation/release_quality.py @@ -42,7 +42,7 @@ import pandas as pd from leadforge.core.serialization import load_json -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES # --------------------------------------------------------------------------- # Constants diff --git a/tests/exposure/test_redaction.py b/tests/exposure/test_redaction.py index 18852ff..c3cef0f 100644 --- a/tests/exposure/test_redaction.py +++ b/tests/exposure/test_redaction.py @@ -19,10 +19,7 @@ from leadforge.api.generator import Generator from leadforge.core.enums import ExposureMode -from leadforge.schema.features import ( - LEAD_SNAPSHOT_FEATURES, - redacted_columns_for, -) +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, redacted_columns_for from leadforge.validation.bundle_checks import validate_bundle _SMALL = {"n_leads": 30, "n_accounts": 15, "n_contacts": 45} diff --git a/tests/narrative/test_dataset_card.py b/tests/narrative/test_dataset_card.py index bd7ab0d..752832d 100644 --- a/tests/narrative/test_dataset_card.py +++ b/tests/narrative/test_dataset_card.py @@ -3,7 +3,8 @@ from leadforge.api.generator import Generator from leadforge.core.models import GenerationConfig, WorldSpec from leadforge.narrative.dataset_card import render_dataset_card -from leadforge.schema.tasks import SplitSpec, TaskManifest, task_manifest_for_config +from leadforge.schema.tasks import SplitSpec, TaskManifest +from leadforge.schemes.lead_scoring.tasks import task_manifest_for_config def _make_world_spec(**kwargs: object) -> WorldSpec: diff --git a/tests/render/test_render.py b/tests/render/test_render.py index 0bb7bc9..e96d8d2 100644 --- a/tests/render/test_render.py +++ 
b/tests/render/test_render.py @@ -9,7 +9,7 @@ from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES from leadforge.schemes.lead_scoring.simulation.engine import simulate_world from leadforge.schemes.lead_scoring.simulation.population import build_population from leadforge.schemes.lead_scoring.structure.sampler import sample_hidden_graph @@ -100,7 +100,7 @@ def test_dataframes_are_dataframes(self, sim_outputs): def test_empty_tables_have_schema(self, sim_outputs): """Tables with zero rows must still expose the correct column names.""" _, population, result, _ = sim_outputs - from leadforge.schema.entities import CustomerRow + from leadforge.schemes.lead_scoring.entities import CustomerRow from leadforge.schemes.lead_scoring.render.relational import to_dataframes dfs = to_dataframes(result, population) @@ -109,7 +109,8 @@ def test_empty_tables_have_schema(self, sim_outputs): def test_fk_integrity(self, sim_outputs): """All FK constraints must hold on the produced DataFrames.""" _, population, result, _ = sim_outputs - from leadforge.schema.relationships import ALL_CONSTRAINTS, validate_fk + from leadforge.schema.relationships import validate_fk + from leadforge.schemes.lead_scoring.relationships import ALL_CONSTRAINTS from leadforge.schemes.lead_scoring.render.relational import to_dataframes dfs = to_dataframes(result, population) diff --git a/tests/schema/test_entities.py b/tests/schema/test_entities.py index 61e0c4e..c7a5d78 100644 --- a/tests/schema/test_entities.py +++ b/tests/schema/test_entities.py @@ -5,7 +5,8 @@ import pandas as pd import pytest -from leadforge.schema.entities import ( +from leadforge.schema.tables import read_parquet, write_parquet +from leadforge.schemes.lead_scoring.entities import ( ALL_ROW_TYPES, TABLE_NAMES, AccountRow, @@ -14,7 +15,6 @@ SessionRow, TouchRow, ) 
-from leadforge.schema.tables import read_parquet, write_parquet # --------------------------------------------------------------------------- # Helpers diff --git a/tests/schema/test_features.py b/tests/schema/test_features.py index ca8c473..6ff1e68 100644 --- a/tests/schema/test_features.py +++ b/tests/schema/test_features.py @@ -7,7 +7,7 @@ import pytest from leadforge.schema.dictionaries import feature_dictionary_df, write_feature_dictionary -from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec # --------------------------------------------------------------------------- # FeatureSpec @@ -95,7 +95,7 @@ def test_total_touches_all_kept_as_pedagogical_trap() -> None: def test_redacted_columns_for_student_public() -> None: from leadforge.core.enums import ExposureMode - from leadforge.schema.features import redacted_columns_for + from leadforge.schemes.lead_scoring.features import redacted_columns_for redacted = redacted_columns_for(ExposureMode.student_public) assert "current_stage" in redacted @@ -105,7 +105,7 @@ def test_redacted_columns_for_student_public() -> None: def test_redacted_columns_for_research_instructor_is_empty() -> None: from leadforge.core.enums import ExposureMode - from leadforge.schema.features import redacted_columns_for + from leadforge.schemes.lead_scoring.features import redacted_columns_for assert redacted_columns_for(ExposureMode.research_instructor) == frozenset() @@ -113,7 +113,8 @@ def test_redacted_columns_for_research_instructor_is_empty() -> None: def test_redacted_columns_for_accepts_custom_features() -> None: """The function is parameterizable — future per-recipe feature sets work.""" from leadforge.core.enums import ExposureMode - from leadforge.schema.features import FeatureSpec, redacted_columns_for + from leadforge.schema.features import FeatureSpec + from leadforge.schemes.lead_scoring.features import 
redacted_columns_for custom = ( FeatureSpec( diff --git a/tests/schema/test_relationships.py b/tests/schema/test_relationships.py index 8e8118f..d8bd9d6 100644 --- a/tests/schema/test_relationships.py +++ b/tests/schema/test_relationships.py @@ -5,11 +5,11 @@ import pytest from leadforge.schema.relationships import ( - ALL_CONSTRAINTS, FKConstraint, FKViolationError, validate_fk, ) +from leadforge.schemes.lead_scoring.relationships import ALL_CONSTRAINTS def test_all_constraints_count() -> None: diff --git a/tests/schema/test_tasks.py b/tests/schema/test_tasks.py index 4039a6b..0e5b313 100644 --- a/tests/schema/test_tasks.py +++ b/tests/schema/test_tasks.py @@ -4,7 +4,7 @@ import pytest -from leadforge.schema.tasks import CONVERTED_WITHIN_90_DAYS, SplitSpec +from leadforge.schemes.lead_scoring.tasks import CONVERTED_WITHIN_90_DAYS, SplitSpec # --------------------------------------------------------------------------- # SplitSpec diff --git a/tests/schemes/lifecycle/test_entities.py b/tests/schemes/lifecycle/test_entities.py index 41f0df7..4bfcd87 100644 --- a/tests/schemes/lifecycle/test_entities.py +++ b/tests/schemes/lifecycle/test_entities.py @@ -11,9 +11,9 @@ import pytest from leadforge.core.ids import ID_PREFIXES, make_id -from leadforge.schema.entities import ALL_ROW_TYPES, TABLE_NAMES, AccountRow -from leadforge.schema.relationships import ALL_CONSTRAINTS, FKConstraint from leadforge.schema.tables import read_parquet, write_parquet +from leadforge.schemes.lead_scoring.entities import ALL_ROW_TYPES, TABLE_NAMES, AccountRow +from leadforge.schemes.lead_scoring.relationships import ALL_CONSTRAINTS, FKConstraint from leadforge.schemes.lifecycle.entities import ( LIFECYCLE_ROW_TYPES, LIFECYCLE_TABLE_NAMES, diff --git a/tests/schemes/test_module_layout.py b/tests/schemes/test_module_layout.py index 3ba22d4..aee1c44 100644 --- a/tests/schemes/test_module_layout.py +++ b/tests/schemes/test_module_layout.py @@ -27,6 +27,7 @@ 
"leadforge.schemes.lead_scoring.render.relational_snapshot_safe", ), ("leadforge.render.tasks", "leadforge.schemes.lead_scoring.render.tasks"), + ] @@ -70,6 +71,44 @@ def test_relational_split_to_dataframes_moved_to_scheme() -> None: importlib.import_module("leadforge.render.relational") +def test_schema_split_primitives_stay_in_schema() -> None: + # LTV-Pg.2: shared primitives kept in schema/, not moved with the rows. + from leadforge.schema.entities import ( # noqa: F401 + AccountRow, + EntityRowProtocol, + make_empty_dataframe, + ) + from leadforge.schema.features import FeatureSpec # noqa: F401 + from leadforge.schema.relationships import FKConstraint, validate_fk # noqa: F401 + from leadforge.schema.tasks import SplitSpec, TaskManifest # noqa: F401 + + +def test_schema_split_lead_scoring_specifics_in_scheme() -> None: + # LTV-Pg.2: lead-scoring-specific symbols live in the scheme package. + from leadforge.schemes.lead_scoring.entities import ( # noqa: F401 + ALL_ROW_TYPES, + ContactRow, + LeadRow, + ) + from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES # noqa: F401 + from leadforge.schemes.lead_scoring.relationships import ALL_CONSTRAINTS # noqa: F401 + from leadforge.schemes.lead_scoring.tasks import CONVERTED_WITHIN_90_DAYS # noqa: F401 + + +def test_schema_split_lead_scoring_removed_from_shared_schema() -> None: + # LTV-Pg.2: moved symbols are gone from the shared schema namespace. 
+ import leadforge.schema.entities as shared_entities + import leadforge.schema.features as shared_features + import leadforge.schema.relationships as shared_relationships + import leadforge.schema.tasks as shared_tasks + + assert not hasattr(shared_entities, "LeadRow") + assert not hasattr(shared_entities, "ALL_ROW_TYPES") + assert not hasattr(shared_features, "LEAD_SNAPSHOT_FEATURES") + assert not hasattr(shared_relationships, "ALL_CONSTRAINTS") + assert not hasattr(shared_tasks, "CONVERTED_WITHIN_90_DAYS") + + def test_public_api_unchanged_by_the_move() -> None: # The documented public surface must keep importing from its stable home. from leadforge.api import Generator, list_recipes # noqa: F401 diff --git a/tests/scripts/test_build_v7_snapshot.py b/tests/scripts/test_build_v7_snapshot.py index 0993c32..0822d25 100644 --- a/tests/scripts/test_build_v7_snapshot.py +++ b/tests/scripts/test_build_v7_snapshot.py @@ -357,7 +357,7 @@ def test_empty_touches(self): def test_counts_post_snapshot_touches_correctly(self): """Touches after snapshot_day should be counted; on or before should not.""" - from leadforge.schema.entities import TouchRow + from leadforge.schemes.lead_scoring.entities import TouchRow snapshot = pd.DataFrame({"lead_id": ["lead_000001", "lead_000002"]}) lead_dates = {"lead_000001": "2024-01-01", "lead_000002": "2024-01-01"} @@ -378,7 +378,7 @@ def test_counts_post_snapshot_touches_correctly(self): def test_boundary_day_excluded(self): """Touch on exactly snapshot_day must be excluded.""" - from leadforge.schema.entities import TouchRow + from leadforge.schemes.lead_scoring.entities import TouchRow snapshot = pd.DataFrame({"lead_id": ["lead_000001"]}) lead_dates = {"lead_000001": "2024-01-01"} @@ -390,7 +390,7 @@ def test_boundary_day_excluded(self): def test_horizon_boundary_included(self): """Touch on exactly horizon_day should be included.""" - from leadforge.schema.entities import TouchRow + from leadforge.schemes.lead_scoring.entities import 
TouchRow snapshot = pd.DataFrame({"lead_id": ["lead_000001"]}) lead_dates = {"lead_000001": "2024-01-01"} @@ -407,7 +407,7 @@ def test_horizon_boundary_included(self): def test_lead_with_no_touches_gets_zero(self): """Lead absent from touch list should get 0.""" - from leadforge.schema.entities import TouchRow + from leadforge.schemes.lead_scoring.entities import TouchRow snapshot = pd.DataFrame({"lead_id": ["lead_000001", "lead_000002"]}) lead_dates = {"lead_000001": "2024-01-01", "lead_000002": "2024-01-01"} @@ -420,7 +420,7 @@ def test_lead_with_no_touches_gets_zero(self): def test_no_label_injection_behavioral(self): """Two datasets with different labels must produce identical trap values.""" - from leadforge.schema.entities import TouchRow + from leadforge.schemes.lead_scoring.entities import TouchRow snapshot_a = pd.DataFrame( { diff --git a/tests/simulation/test_engine.py b/tests/simulation/test_engine.py index 44ee64b..07452c9 100644 --- a/tests/simulation/test_engine.py +++ b/tests/simulation/test_engine.py @@ -6,7 +6,7 @@ from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot -from leadforge.schema.entities import ( +from leadforge.schemes.lead_scoring.entities import ( CustomerRow, LeadRow, OpportunityRow, diff --git a/tests/test_primary_task_threading.py b/tests/test_primary_task_threading.py index 602a0e0..829e5e2 100644 --- a/tests/test_primary_task_threading.py +++ b/tests/test_primary_task_threading.py @@ -17,7 +17,7 @@ from leadforge.core.serialization import load_json from leadforge.pipelines.build_v5 import rename_and_select as v5_rename from leadforge.pipelines.build_v6 import rename_and_select as v6_rename -from leadforge.schema.tasks import CONVERTED_WITHIN_90_DAYS, task_manifest_for_config +from leadforge.schemes.lead_scoring.tasks import CONVERTED_WITHIN_90_DAYS, task_manifest_for_config from leadforge.validation.drift import check_cross_seed_stability from leadforge.validation.realism import check_realism 
diff --git a/tests/validation/test_realism.py b/tests/validation/test_realism.py index 4b591ce..f0f4fcd 100644 --- a/tests/validation/test_realism.py +++ b/tests/validation/test_realism.py @@ -91,7 +91,7 @@ def test_detects_non_boolean_feature(self, tmp_path: Path, bundle_dir: Path) -> # Pick the first non-target boolean column at test time so this # test self-heals when feature names change. Falls back gracefully # if the spec ever has zero non-target booleans (currently impossible). - from leadforge.schema.features import LEAD_SNAPSHOT_FEATURES + from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES bool_col = next( f.name From 4caa1a3041df582f30e9d0b4921abbff48bc4672 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 08:19:38 +0300 Subject: [PATCH 2/3] docs(ltv): record LTV-Pg.2 (#112) in roadmap + agent-plan [LTV-Pg.2] Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 2 +- docs/ltv/roadmap.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index 1850a8d..fc4b7fb 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -50,7 +50,7 @@ shared write_relational_tables stays in the envelope) merged (#110). `LTV-Pg` registries + LIFECYCLE_CONSTRAINTS moved there, stub LifecycleScheme registered) merged (#111); Pg.2 (split lead-scoring schema: entity rows/ALL_ROW_TYPES/ ALL_CONSTRAINTS/LEAD_SNAPSHOT_FEATURES/CONVERTED_WITHIN_90_DAYS moved to -`schemes/lead_scoring/`; shared primitives stay in `schema/`) opened as **#NNN**. All M2 moves byte-identical. Sibling `leadforge-datasets-private` +`schemes/lead_scoring/`; shared primitives stay in `schema/`) opened as **#112**. All M2 moves byte-identical. Sibling `leadforge-datasets-private` consumes bundle files, not internals — no lockstep update needed (heads-up issue #8). Next: `LTV-Pg.2`, then `LTV-Pc` (pLTV feature/task specs, authored in `schemes/lifecycle/`), then `LTV-M3` (lifecycle population). 
diff --git a/docs/ltv/roadmap.md b/docs/ltv/roadmap.md index 77b89e6..231ec9c 100644 --- a/docs/ltv/roadmap.md +++ b/docs/ltv/roadmap.md @@ -42,7 +42,7 @@ protocol + registry, with the package physically reorganized into |-----------|------------|-----|------------| | `LTV-M0` | Planning + design lock | `LTV-Pa` | #102, #103 (+ scheme reframe) | | `LTV-M1` | Lifecycle schema foundation | `LTV-Pb`, `LTV-Pc` | #104 (Pb) | -| `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2), #111 (Pg.1) | +| `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2), #111 (Pg.1), #112 (Pg.2) | | `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | | | `LTV-M4` | Lifecycle simulation engine | `LTV-Pj`, `LTV-Pk` | | | `LTV-M5` | Customer snapshots + pLTV targets (both regimes) | `LTV-Pl`, `LTV-Pm` | | @@ -142,7 +142,7 @@ Total: ~19 PRs across 9 milestones. Shared primitives (`EntityRowProtocol`, `_empty_df`, `AccountRow`, `FKConstraint`) stay in `schema/` and are imported. Byte-identical; full suite green. (**PR #111**) - - [x] **`LTV-Pg.2`** — split the **lead-scoring** schema (**PR #NNN**): move the + - [x] **`LTV-Pg.2`** — split the **lead-scoring** schema (**PR #112**): move the lead-scoring entity rows / `ALL_ROW_TYPES` / `ALL_CONSTRAINTS` / `LEAD_SNAPSHOT_FEATURES` / task specs into `schemes/lead_scoring/`, leaving only genuinely shared primitives in `schema/`. (The lifecycle `LTV-Pc` From 42f4524be8290c2ba22774467ddae11cf9797aa1 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 11 Jun 2026 14:10:24 +0300 Subject: [PATCH 3/3] refactor(schema): fix Pg.2 review findings (self-review) [LTV-Pg.2] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four concrete fixes after hostile self-review of #112: 1.
dictionaries.py imported FeatureSpec from the lead-scoring scheme module — a shared-envelope module pulling a shared primitive via a scheme-specific path (same smell as the _empty_df→make_empty_dataframe catch in Pg.1). Fixed: FeatureSpec now imported from schema.features (its actual home); only LEAD_SNAPSHOT_FEATURES comes from schemes.lead_scoring.features. 2. redacted_columns_for signature silently changed from `features: tuple[FeatureSpec, ...] = LEAD_SNAPSHOT_FEATURES` to `features: tuple[FeatureSpec, ...] | None = None` (behavioral change in a refactor — None was previously a TypeError, now silently substitutes the default). Restored the original honest signature; moved the function to after LEAD_SNAPSHOT_FEATURES in the file to avoid the forward-reference problem that caused the change in the first place. 3. Lead-scoring schema tests moved from tests/schema/ to tests/schemes/lead_scoring/ for consistency with the lifecycle precedent (test_lifecycle_entities.py → tests/schemes/lifecycle/test_entities.py in Pg.1). Having tests/schema/ test_entities.py import from schemes/lead_scoring/entities was jarring. tests/schema/ now only contains test_ids.py (tests the shared IDs module). 4. Stale docstring in exposure/filters.py updated: referenced leadforge.schema.features.redacted_columns_for which moved. Verified byte-identical (14/14); full suite 1537 passed / 51 skipped; ruff + mypy clean. 
Co-Authored-By: Claude Sonnet 4.6 --- leadforge/exposure/filters.py | 2 +- leadforge/schema/dictionaries.py | 3 +- leadforge/schemes/lead_scoring/features.py | 31 +++++++++---------- tests/schemes/lead_scoring/__init__.py | 0 .../lead_scoring}/test_entities.py | 0 .../lead_scoring}/test_features.py | 0 .../lead_scoring}/test_relationships.py | 0 .../lead_scoring}/test_tasks.py | 0 8 files changed, 17 insertions(+), 19 deletions(-) create mode 100644 tests/schemes/lead_scoring/__init__.py rename tests/{schema => schemes/lead_scoring}/test_entities.py (100%) rename tests/{schema => schemes/lead_scoring}/test_features.py (100%) rename tests/{schema => schemes/lead_scoring}/test_relationships.py (100%) rename tests/{schema => schemes/lead_scoring}/test_tasks.py (100%) diff --git a/leadforge/exposure/filters.py b/leadforge/exposure/filters.py index f78eda9..537dd12 100644 --- a/leadforge/exposure/filters.py +++ b/leadforge/exposure/filters.py @@ -6,7 +6,7 @@ The per-feature redaction policy lives separately on :attr:`leadforge.schema.features.FeatureSpec.redact_in_modes` and is queried -via :func:`leadforge.schema.features.redacted_columns_for`. ``BundleFilter`` +via :func:`leadforge.schemes.lead_scoring.features.redacted_columns_for`. ``BundleFilter`` deliberately does *not* duplicate that information so that the writer and the validator both consult the same source of truth. 
diff --git a/leadforge/schema/dictionaries.py b/leadforge/schema/dictionaries.py index fc46b30..2453115 100644 --- a/leadforge/schema/dictionaries.py +++ b/leadforge/schema/dictionaries.py @@ -11,7 +11,8 @@ import pandas as pd -from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES, FeatureSpec +from leadforge.schema.features import FeatureSpec +from leadforge.schemes.lead_scoring.features import LEAD_SNAPSHOT_FEATURES _COLUMNS = ("name", "dtype", "description", "category", "is_target", "leakage_risk") diff --git a/leadforge/schemes/lead_scoring/features.py b/leadforge/schemes/lead_scoring/features.py index b6949a1..583c30d 100644 --- a/leadforge/schemes/lead_scoring/features.py +++ b/leadforge/schemes/lead_scoring/features.py @@ -11,23 +11,6 @@ from leadforge.core.enums import ExposureMode from leadforge.schema.features import FeatureSpec - -def redacted_columns_for( - mode: ExposureMode, - features: tuple[FeatureSpec, ...] | None = None, -) -> frozenset[str]: - """Return the set of column names that must be stripped from *mode* exports. - - Args: - mode: The exposure mode being published. - features: Feature spec tuple to consult. Defaults to the canonical - :data:`LEAD_SNAPSHOT_FEATURES` list. - """ - if features is None: - features = LEAD_SNAPSHOT_FEATURES - return frozenset(f.name for f in features if mode in f.redact_in_modes) - - # --------------------------------------------------------------------------- # Canonical feature list — lead snapshot # --------------------------------------------------------------------------- @@ -269,3 +252,17 @@ def redacted_columns_for( is_target=True, ), ) + + +def redacted_columns_for( + mode: ExposureMode, + features: tuple[FeatureSpec, ...] = LEAD_SNAPSHOT_FEATURES, +) -> frozenset[str]: + """Return the set of column names that must be stripped from *mode* exports. + + Args: + mode: The exposure mode being published. + features: Feature spec tuple to consult. 
Defaults to the canonical + :data:`LEAD_SNAPSHOT_FEATURES` list. + """ + return frozenset(f.name for f in features if mode in f.redact_in_modes) diff --git a/tests/schemes/lead_scoring/__init__.py b/tests/schemes/lead_scoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/schema/test_entities.py b/tests/schemes/lead_scoring/test_entities.py similarity index 100% rename from tests/schema/test_entities.py rename to tests/schemes/lead_scoring/test_entities.py diff --git a/tests/schema/test_features.py b/tests/schemes/lead_scoring/test_features.py similarity index 100% rename from tests/schema/test_features.py rename to tests/schemes/lead_scoring/test_features.py diff --git a/tests/schema/test_relationships.py b/tests/schemes/lead_scoring/test_relationships.py similarity index 100% rename from tests/schema/test_relationships.py rename to tests/schemes/lead_scoring/test_relationships.py diff --git a/tests/schema/test_tasks.py b/tests/schemes/lead_scoring/test_tasks.py similarity index 100% rename from tests/schema/test_tasks.py rename to tests/schemes/lead_scoring/test_tasks.py