diff --git a/.agent-plan.md b/.agent-plan.md index 87b81c2..4d20020 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -45,10 +45,14 @@ and `LTV-Pe` (#108) merged (scheme protocol + render seam). `LTV-Pf` (physical move, **hard break / no shims** per D12): Pf.1 (compute core — simulation/mechanisms/structure) merged (#109); Pf.2 (lead-scoring render — snapshots/relational_snapshot_safe/tasks moved + relational.py split so the -shared write_relational_tables stays in the envelope) opened as **#110**. Both -byte-identical. Sibling `leadforge-datasets-private` consumes bundle files, not -internals — no lockstep update needed (heads-up issue #8 filed). Next: -`LTV-Pg` (scaffold `schemes/lifecycle/` + relocate the lead-scoring schema specs). +shared write_relational_tables stays in the envelope) merged (#110). `LTV-Pg` +(schema reorg) split: Pg.1 (scaffold `schemes/lifecycle/` — lifecycle rows + +registries + LIFECYCLE_CONSTRAINTS moved there, stub LifecycleScheme registered) +opened as **#111**; Pg.2 (split the lead-scoring schema out of shared `schema/`) +pending. All M2 moves byte-identical. Sibling `leadforge-datasets-private` +consumes bundle files, not internals — no lockstep update needed (heads-up +issue #8). Next: `LTV-Pg.2`, then `LTV-Pc` (pLTV feature/task specs, authored in +`schemes/lifecycle/`), then `LTV-M3` (lifecycle population). --- diff --git a/CHANGELOG.md b/CHANGELOG.md index a8b6ce1..a9d0c81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,8 +32,15 @@ assembler (`to_dataframes`) moved to the scheme, and the scheme-agnostic writer (`write_relational_tables`) moved to the new `leadforge.render.relational_io` (renamed to avoid a basename clash with the scheme's `relational.py`). `leadforge.render` remains the shared bundle-output envelope -(`relational_io` + `manifests`). The lead-scoring `schema` -specs relocate in a follow-up PR (LTV-Pg). Consumers importing internals (e.g. +(`relational_io` + `manifests`). 
+ +The lifecycle (`b2b_saas_ltv_v1`) entity rows + registries moved from +`leadforge.schema.entities` / `leadforge.schema.relationships` to the new +`leadforge.schemes.lifecycle.entities` / `leadforge.schemes.lifecycle.relationships`, +and a stub `lifecycle` scheme is registered alongside `lead_scoring` +(`available_schemes()` → `("lead_scoring", "lifecycle")`). The lead-scoring +`schema` specs relocate under `leadforge.schemes.lead_scoring` in a follow-up +PR (LTV-Pg.2). Consumers importing internals (e.g. the `leadforge-datasets-private` build scripts) must update to the new paths; the package stays on the `1.x` line (the public contract did not change). diff --git a/CLAUDE.md b/CLAUDE.md index 973d116..83f0ac1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -160,8 +160,10 @@ leadforge/ schemes/ base.py (GenerationScheme protocol + SCHEME_REGISTRY); lead_scoring/ — the lead-scoring scheme: __init__.py (build_world/ write_bundle) + simulation/, mechanisms/, structure/, render/ - (moved in LTV-Pf.1/Pf.2). Lead-scoring schema specs migrate - here in LTV-Pg. See docs/ltv/design.md §2.5. + (moved in LTV-Pf.1/Pf.2); + lifecycle/ — the pLTV scheme (stub): entities.py, relationships.py + (scaffolded in LTV-Pg.1). Lead-scoring schema specs migrate + under lead_scoring/ in LTV-Pg.2. See docs/ltv/design.md §2.5. 
render/ relational_io.py (write_relational_tables — shared writer), manifests.py # shared bundle-output envelope exposure/ modes.py, filters.py, redaction.py @@ -244,14 +246,18 @@ leadforge/ # Python package root │ └── dictionaries.py # Feature dictionary CSV writer ├── schemes/ # Generation schemes (peer pipelines) + registry │ ├── base.py # GenerationScheme protocol + SCHEME_REGISTRY -│ └── lead_scoring/ # The lead-scoring scheme (LeadScoringScheme) -│ ├── __init__.py # build_world() + write_bundle() -│ ├── structure/ # Hidden world graph (WorldGraph, motifs, sampler) -│ ├── mechanisms/ # Node/edge behavior (policies, hazards, scores, …) -│ ├── simulation/ # World evolution (engine, population, state) -│ └── render/ # Lead-scoring render: snapshots, relational -│ # (to_dataframes), relational_snapshot_safe, tasks -│ # NOTE (LTV-M2 reorg in progress): lead-scoring schema specs split in LTV-Pg. +│ ├── lead_scoring/ # The lead-scoring scheme (LeadScoringScheme) +│ │ ├── __init__.py # build_world() + write_bundle() +│ │ ├── structure/ # Hidden world graph (WorldGraph, motifs, sampler) +│ │ ├── mechanisms/ # Node/edge behavior (policies, hazards, scores, …) +│ │ ├── simulation/ # World evolution (engine, population, state) +│ │ └── render/ # Lead-scoring render: snapshots, relational +│ │ # (to_dataframes), relational_snapshot_safe, tasks +│ └── lifecycle/ # The pLTV scheme (LifecycleScheme — stub until M3–M6) +│ ├── __init__.py # registers the stub scheme +│ ├── entities.py # lifecycle rows + LIFECYCLE_ROW_TYPES +│ └── relationships.py # LIFECYCLE_CONSTRAINTS +│ # NOTE (LTV-M2 reorg in progress): lead-scoring schema specs split in LTV-Pg.2. │ # See docs/ltv/design.md §2.5 for the target layout. 
├── render/ # Shared bundle-output envelope │ ├── relational_io.py # write_relational_tables() — shared table writer diff --git a/docs/ltv/roadmap.md b/docs/ltv/roadmap.md index d6e0193..1476ba4 100644 --- a/docs/ltv/roadmap.md +++ b/docs/ltv/roadmap.md @@ -42,7 +42,7 @@ protocol + registry, with the package physically reorganized into |-----------|------------|-----|------------| | `LTV-M0` | Planning + design lock | `LTV-Pa` | #102, #103 (+ scheme reframe) | | `LTV-M1` | Lifecycle schema foundation | `LTV-Pb`, `LTV-Pc` | #104 (Pb) | -| `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2) | +| `LTV-M2` | Generation-scheme architecture + physical reorg | `LTV-Pd`, `LTV-Pe`, `LTV-Pf`, `LTV-Pg` | #107 (Pd), #108 (Pe), #109 (Pf.1), #110 (Pf.2), #111 (Pg.1) | | `LTV-M3` | Customer population + lifecycle world | `LTV-Ph`, `LTV-Pi` | | | `LTV-M4` | Lifecycle simulation engine | `LTV-Pj`, `LTV-Pk` | | | `LTV-M5` | Customer snapshots + pLTV targets (both regimes) | `LTV-Pl`, `LTV-Pm` | | @@ -132,11 +132,22 @@ Total: ~19 PRs across 9 milestones. (The lead-scoring `schema` specs split lands with `LTV-Pg`.) - Tests: full suite + hash-determinism green; public API imports unchanged. - Labels: `type: refactor`, `layer: schema`, `layer: simulation`, `layer: render` -- [ ] **`LTV-Pg`** — `refactor: scaffold schemes/lifecycle/ + relocate LTV-Pb/Pc specs`. - Create `schemes/lifecycle/`; move the lifecycle entity rows (from #104) and - the `LTV-Pc` feature/task specs into it; register a stub `LifecycleScheme` - (pipeline methods raise `NotImplementedError` until M3–M6). Split any - remaining shared schema primitives out cleanly. +- [ ] **`LTV-Pg`** — `refactor: scaffold schemes/lifecycle/ + split lead-scoring schema`. 
+ Split into two PRs to keep each tractable: + - [x] **`LTV-Pg.1`** — scaffold `schemes/lifecycle/`: moved the lifecycle + entity rows + `LIFECYCLE_ROW_TYPES`/`LIFECYCLE_TABLE_NAMES` (from #104) into + `schemes/lifecycle/entities.py` and `LIFECYCLE_CONSTRAINTS` into + `schemes/lifecycle/relationships.py`; registered a stub `LifecycleScheme` + (`build_world`/`write_bundle` raise `NotImplementedError` until M3–M6). + Shared primitives (`EntityRowProtocol`, `make_empty_dataframe`, `AccountRow`, + `FKConstraint`) stay in `schema/` and are imported. Byte-identical; + full suite green. (**PR #111**) + - [ ] **`LTV-Pg.2`** — split the **lead-scoring** schema: move the + lead-scoring entity rows / `ALL_ROW_TYPES` / `ALL_CONSTRAINTS` / + `LEAD_SNAPSHOT_FEATURES` / task specs into `schemes/lead_scoring/`, leaving + only genuinely shared primitives in `schema/`. (The lifecycle `LTV-Pc` + feature/task specs are authored directly in `schemes/lifecycle/` when M1's + `LTV-Pc` lands.) - Tests: lifecycle registry imports from new home; lead-scoring unaffected. - Labels: `type: refactor`, `layer: schema` diff --git a/leadforge/schema/entities.py b/leadforge/schema/entities.py index 06926c5..21ebe42 100644 --- a/leadforge/schema/entities.py +++ b/leadforge/schema/entities.py @@ -38,7 +38,7 @@ def to_dict(self) -> dict[str, Any]: ... def empty_dataframe(cls) -> pd.DataFrame: ... 
-def _empty_df(dtype_map: dict[str, str]) -> pd.DataFrame: +def make_empty_dataframe(dtype_map: dict[str, str]) -> pd.DataFrame: """Return a zero-row DataFrame with columns ordered as *dtype_map*.""" return pd.DataFrame({col: pd.array([], dtype=dtype) for col, dtype in dtype_map.items()}) @@ -78,7 +78,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -116,7 +116,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -176,7 +176,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -212,7 +212,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -250,7 +250,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -284,7 +284,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -320,7 +320,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> 
pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -350,7 +350,7 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) # --------------------------------------------------------------------------- @@ -382,211 +382,15 @@ def to_dict(self) -> dict[str, Any]: @classmethod def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) + return make_empty_dataframe(cls.DTYPE_MAP) -# =========================================================================== -# Lifecycle entity rows (b2b_saas_ltv_v1 — see docs/ltv/design.md) # --------------------------------------------------------------------------- -# These rows belong to the *lifecycle* bundle shape only. They are kept in a -# separate registry (:data:`LIFECYCLE_ROW_TYPES`) and are NOT added to -# :data:`ALL_ROW_TYPES`, so the lead-scoring bundle's table inventory and -# column schemas are completely unchanged. -# -# The lifecycle bundle's ``customers`` and ``subscriptions`` tables are richer -# than the thin lead-scoring :class:`CustomerRow` / :class:`SubscriptionRow` -# (which exist only to record conversion in the procurement world). Rather -# than extend those classes in place — which would change the lead-scoring -# instructor bundle's parquet schema, since ``to_dict()`` emits every field — -# the lifecycle bundle uses the dedicated :class:`CustomerLifecycleRow` / -# :class:`SubscriptionLifecycleRow` classes below. Both deliberately reuse the -# logical table names ``customers`` / ``subscriptions``; the two shapes never -# co-occur in one bundle, and the registries that hold them are disjoint. 
-# =========================================================================== - - -@dataclass -class CustomerLifecycleRow: - """One row in the lifecycle ``customers`` table. - - Static, set-at-acquisition attributes of a customer. ``opportunity_id`` is - nullable because the lifecycle recipe generates customers **independently** - (no upstream opportunities table); it is reserved for future chained - generation from a lead-scoring bundle's converted leads. - """ - - TABLE_NAME: ClassVar[str] = "customers" - # Column order matches the dataclass field order below; ``opportunity_id`` - # carries a default (nullable) so it must come last in both. - DTYPE_MAP: ClassVar[dict[str, str]] = { - "customer_id": "string", - "account_id": "string", - "customer_start_at": "string", - "initial_plan": "string", - "initial_mrr": "Int64", - "contract_term_months": "Int64", - "csm_rep_id": "string", - "opportunity_id": "string", - } - - customer_id: str - account_id: str - customer_start_at: str - initial_plan: str - initial_mrr: int - contract_term_months: int - csm_rep_id: str - opportunity_id: str | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) - - -@dataclass -class SubscriptionLifecycleRow: - """One row in the lifecycle ``subscriptions`` table. - - Carries the subscription's terminal/dynamic state as of the end of the - simulation. Terminal fields (``subscription_end_at``, ``churn_at``, - ``churn_reason``) are redacted from ``student_public`` bundles per the - lifecycle snapshot-safety contract (see ``docs/ltv/design.md`` §5). 
- """ - - TABLE_NAME: ClassVar[str] = "subscriptions" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "subscription_id": "string", - "customer_id": "string", - "plan_name": "string", - "subscription_status": "string", - "subscription_start_at": "string", - "current_mrr": "Int64", - "contract_term_months": "Int64", - "renewal_count": "Int64", - "expansion_count": "Int64", - "subscription_end_at": "string", - "churn_at": "string", - "churn_reason": "string", - } - - subscription_id: str - customer_id: str - plan_name: str - subscription_status: str - subscription_start_at: str - current_mrr: int - contract_term_months: int - renewal_count: int - expansion_count: int - subscription_end_at: str | None = None - churn_at: str | None = None - churn_reason: str | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) - - -@dataclass -class SubscriptionEventRow: - """One row in the ``subscription_events`` table — a lifecycle state change.""" - - TABLE_NAME: ClassVar[str] = "subscription_events" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "event_id": "string", - "subscription_id": "string", - "customer_id": "string", - "event_timestamp": "string", - "event_type": "string", - "mrr_before": "Int64", - "mrr_after": "Int64", - "contract_term_months_new": "Int64", - } - - event_id: str - subscription_id: str - customer_id: str - event_timestamp: str - event_type: str - mrr_before: int - mrr_after: int - contract_term_months_new: int | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) - - -@dataclass -class HealthSignalRow: - """One row in the ``health_signals`` table — weekly product-usage telemetry.""" - - TABLE_NAME: ClassVar[str] = "health_signals" - DTYPE_MAP: 
ClassVar[dict[str, str]] = { - "signal_id": "string", - "customer_id": "string", - "period_start": "string", - "active_users": "Int64", - "feature_depth_score": "Float64", - "support_tickets": "Int64", - "nps_score": "Int64", - } - - signal_id: str - customer_id: str - period_start: str - active_users: int - feature_depth_score: float - support_tickets: int - nps_score: int | None = None - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) - - -@dataclass -class InvoiceRow: - """One row in the ``invoices`` table — monthly billing; the unit of pLTV value.""" - - TABLE_NAME: ClassVar[str] = "invoices" - DTYPE_MAP: ClassVar[dict[str, str]] = { - "invoice_id": "string", - "customer_id": "string", - "invoice_date": "string", - "amount_usd": "Int64", - "payment_status": "string", - } - - invoice_id: str - customer_id: str - invoice_date: str - amount_usd: int - payment_status: str - - def to_dict(self) -> dict[str, Any]: - return {f.name: getattr(self, f.name) for f in fields(self)} - - @classmethod - def empty_dataframe(cls) -> pd.DataFrame: - return _empty_df(cls.DTYPE_MAP) - - -# --------------------------------------------------------------------------- -# Registry +# Registry (lead-scoring catalog) # --------------------------------------------------------------------------- +# Lifecycle (b2b_saas_ltv_v1) entity rows + their LIFECYCLE_ROW_TYPES registry +# now live in ``leadforge.schemes.lifecycle.entities`` (moved in LTV-Pg). +# ``AccountRow`` above is shared and reused by the lifecycle scheme. ALL_ROW_TYPES: tuple[type[EntityRowProtocol], ...] = ( AccountRow, @@ -601,17 +405,3 @@ def empty_dataframe(cls) -> pd.DataFrame: ) TABLE_NAMES: tuple[str, ...] = tuple(cls.TABLE_NAME for cls in ALL_ROW_TYPES) - -# Lifecycle (b2b_saas_ltv_v1) bundle table inventory. 
Kept separate from -# ALL_ROW_TYPES so the lead-scoring bundle is unaffected. AccountRow is shared -# (reused unchanged); customers/subscriptions use the richer lifecycle classes. -LIFECYCLE_ROW_TYPES: tuple[type[EntityRowProtocol], ...] = ( - AccountRow, - CustomerLifecycleRow, - SubscriptionLifecycleRow, - SubscriptionEventRow, - HealthSignalRow, - InvoiceRow, -) - -LIFECYCLE_TABLE_NAMES: tuple[str, ...] = tuple(cls.TABLE_NAME for cls in LIFECYCLE_ROW_TYPES) diff --git a/leadforge/schema/relationships.py b/leadforge/schema/relationships.py index de5fca1..4ec501c 100644 --- a/leadforge/schema/relationships.py +++ b/leadforge/schema/relationships.py @@ -41,20 +41,9 @@ class FKConstraint: ) -# Lifecycle (b2b_saas_ltv_v1) FK constraints — see docs/ltv/design.md. -# Kept separate from ALL_CONSTRAINTS so the lead-scoring model is unchanged. -# The lifecycle ``customers`` table links only to ``accounts`` (independent -# generation, no ``opportunities`` table), so there is no customer→opportunity -# FK here despite the nullable ``opportunity_id`` column being reserved for -# future chained generation. -LIFECYCLE_CONSTRAINTS: tuple[FKConstraint, ...] = ( - FKConstraint("customers", "account_id", "accounts", "account_id"), - FKConstraint("subscriptions", "customer_id", "customers", "customer_id"), - FKConstraint("subscription_events", "subscription_id", "subscriptions", "subscription_id"), - FKConstraint("subscription_events", "customer_id", "customers", "customer_id"), - FKConstraint("health_signals", "customer_id", "customers", "customer_id"), - FKConstraint("invoices", "customer_id", "customers", "customer_id"), -) +# Lifecycle (b2b_saas_ltv_v1) FK constraints (LIFECYCLE_CONSTRAINTS) now live in +# ``leadforge.schemes.lifecycle.relationships`` (moved in LTV-Pg). They reuse +# the shared FKConstraint primitive above. 
def validate_fk( diff --git a/leadforge/schemes/__init__.py b/leadforge/schemes/__init__.py index 2d25f2d..1c91391 100644 --- a/leadforge/schemes/__init__.py +++ b/leadforge/schemes/__init__.py @@ -9,6 +9,7 @@ # Import built-in scheme modules for their registration side effects. from leadforge.schemes import lead_scoring as _lead_scoring # noqa: F401 +from leadforge.schemes import lifecycle as _lifecycle # noqa: F401 from leadforge.schemes.base import ( SCHEME_REGISTRY, GenerationScheme, diff --git a/leadforge/schemes/lifecycle/__init__.py b/leadforge/schemes/lifecycle/__init__.py new file mode 100644 index 0000000..f9a8380 --- /dev/null +++ b/leadforge/schemes/lifecycle/__init__.py @@ -0,0 +1,50 @@ +"""The ``lifecycle`` generation scheme (``b2b_saas_ltv_v1``) — scaffold. + +Registers the second peer scheme alongside ``lead_scoring``. Its entity rows +and FK constraints live here (``entities`` / ``relationships``); the pipeline +itself (``build_world`` / ``write_bundle``) is built out across LTV-M3…M6 and +currently raises :class:`NotImplementedError`. Registering the stub now lets +the registry, recipe ``scheme:`` resolution, and tests treat lifecycle as a +first-class peer before its internals exist. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from leadforge.schemes.base import register_scheme + +if TYPE_CHECKING: + from leadforge.core.models import GenerationConfig, WorldBundle + from leadforge.narrative.spec import NarrativeSpec + +_NOT_IMPLEMENTED = ( + "the lifecycle (b2b_saas_ltv_v1) scheme is not implemented yet; " + "its pipeline is built across LTV-M3…M6" +) + + +class LifecycleScheme: + """Stub for the customer-lifetime-value (pLTV) generation pipeline.""" + + name = "lifecycle" + + def build_world( + self, + config: GenerationConfig, + narrative: NarrativeSpec, + **options: Any, + ) -> WorldBundle: + raise NotImplementedError(_NOT_IMPLEMENTED) + + def write_bundle( + self, + bundle: WorldBundle, + path: str, + generation_timestamp: str | None = None, + ) -> None: + raise NotImplementedError(_NOT_IMPLEMENTED) + + +LIFECYCLE_SCHEME = LifecycleScheme() +register_scheme(LIFECYCLE_SCHEME) diff --git a/leadforge/schemes/lifecycle/entities.py b/leadforge/schemes/lifecycle/entities.py new file mode 100644 index 0000000..9dfc973 --- /dev/null +++ b/leadforge/schemes/lifecycle/entities.py @@ -0,0 +1,225 @@ +"""Lifecycle (``b2b_saas_ltv_v1``) entity rows. + +These rows belong to the *lifecycle* bundle shape only. They are kept in this +scheme package (registry :data:`LIFECYCLE_ROW_TYPES`) and are entirely separate +from the lead-scoring catalog, so the lead-scoring bundle's table inventory and +column schemas are unaffected. + +The lifecycle bundle's ``customers`` and ``subscriptions`` tables are richer +than the thin lead-scoring :class:`~leadforge.schema.entities.CustomerRow` / +:class:`~leadforge.schema.entities.SubscriptionRow` (which exist only to record +conversion in the procurement world). 
Rather than extend those classes in +place — which would change the lead-scoring instructor bundle's parquet schema, +since ``to_dict()`` emits every field — the lifecycle bundle uses the dedicated +:class:`CustomerLifecycleRow` / :class:`SubscriptionLifecycleRow` classes below. +Both deliberately reuse the logical table names ``customers`` / ``subscriptions``; +the two shapes never co-occur in one bundle. + +``AccountRow`` is shared across schemes and is imported from the shared schema +package (``accounts`` is the same entity in both the lead-scoring and lifecycle +worlds). +""" + +from __future__ import annotations + +from dataclasses import dataclass, fields +from typing import Any, ClassVar + +import pandas as pd + +from leadforge.schema.entities import AccountRow, EntityRowProtocol, make_empty_dataframe + + +@dataclass +class CustomerLifecycleRow: + """One row in the lifecycle ``customers`` table. + + Static, set-at-acquisition attributes of a customer. ``opportunity_id`` is + nullable because the lifecycle recipe generates customers **independently** + (no upstream opportunities table); it is reserved for future chained + generation from a lead-scoring bundle's converted leads. + """ + + TABLE_NAME: ClassVar[str] = "customers" + # Column order matches the dataclass field order below; ``opportunity_id`` + # carries a default (nullable) so it must come last in both. 
+ DTYPE_MAP: ClassVar[dict[str, str]] = { + "customer_id": "string", + "account_id": "string", + "customer_start_at": "string", + "initial_plan": "string", + "initial_mrr": "Int64", + "contract_term_months": "Int64", + "csm_rep_id": "string", + "opportunity_id": "string", + } + + customer_id: str + account_id: str + customer_start_at: str + initial_plan: str + initial_mrr: int + contract_term_months: int + csm_rep_id: str + opportunity_id: str | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +@dataclass +class SubscriptionLifecycleRow: + """One row in the lifecycle ``subscriptions`` table. + + Carries the subscription's terminal/dynamic state as of the end of the + simulation. Terminal fields (``subscription_end_at``, ``churn_at``, + ``churn_reason``) are redacted from ``student_public`` bundles per the + lifecycle snapshot-safety contract (see ``docs/ltv/design.md`` §5). 
+ """ + + TABLE_NAME: ClassVar[str] = "subscriptions" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "subscription_id": "string", + "customer_id": "string", + "plan_name": "string", + "subscription_status": "string", + "subscription_start_at": "string", + "current_mrr": "Int64", + "contract_term_months": "Int64", + "renewal_count": "Int64", + "expansion_count": "Int64", + "subscription_end_at": "string", + "churn_at": "string", + "churn_reason": "string", + } + + subscription_id: str + customer_id: str + plan_name: str + subscription_status: str + subscription_start_at: str + current_mrr: int + contract_term_months: int + renewal_count: int + expansion_count: int + subscription_end_at: str | None = None + churn_at: str | None = None + churn_reason: str | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +@dataclass +class SubscriptionEventRow: + """One row in the ``subscription_events`` table — a lifecycle state change.""" + + TABLE_NAME: ClassVar[str] = "subscription_events" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "event_id": "string", + "subscription_id": "string", + "customer_id": "string", + "event_timestamp": "string", + "event_type": "string", + "mrr_before": "Int64", + "mrr_after": "Int64", + "contract_term_months_new": "Int64", + } + + event_id: str + subscription_id: str + customer_id: str + event_timestamp: str + event_type: str + mrr_before: int + mrr_after: int + contract_term_months_new: int | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +@dataclass +class HealthSignalRow: + """One row in the ``health_signals`` table — weekly product-usage telemetry.""" + + TABLE_NAME: ClassVar[str] = "health_signals" 
+ DTYPE_MAP: ClassVar[dict[str, str]] = { + "signal_id": "string", + "customer_id": "string", + "period_start": "string", + "active_users": "Int64", + "feature_depth_score": "Float64", + "support_tickets": "Int64", + "nps_score": "Int64", + } + + signal_id: str + customer_id: str + period_start: str + active_users: int + feature_depth_score: float + support_tickets: int + nps_score: int | None = None + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +@dataclass +class InvoiceRow: + """One row in the ``invoices`` table — monthly billing; the unit of pLTV value.""" + + TABLE_NAME: ClassVar[str] = "invoices" + DTYPE_MAP: ClassVar[dict[str, str]] = { + "invoice_id": "string", + "customer_id": "string", + "invoice_date": "string", + "amount_usd": "Int64", + "payment_status": "string", + } + + invoice_id: str + customer_id: str + invoice_date: str + amount_usd: int + payment_status: str + + def to_dict(self) -> dict[str, Any]: + return {f.name: getattr(self, f.name) for f in fields(self)} + + @classmethod + def empty_dataframe(cls) -> pd.DataFrame: + return make_empty_dataframe(cls.DTYPE_MAP) + + +# --------------------------------------------------------------------------- +# Lifecycle bundle table inventory. AccountRow is shared (reused unchanged); +# customers/subscriptions use the richer lifecycle classes above. +# --------------------------------------------------------------------------- +LIFECYCLE_ROW_TYPES: tuple[type[EntityRowProtocol], ...] = ( + AccountRow, + CustomerLifecycleRow, + SubscriptionLifecycleRow, + SubscriptionEventRow, + HealthSignalRow, + InvoiceRow, +) + +LIFECYCLE_TABLE_NAMES: tuple[str, ...] 
= tuple(cls.TABLE_NAME for cls in LIFECYCLE_ROW_TYPES) diff --git a/leadforge/schemes/lifecycle/relationships.py b/leadforge/schemes/lifecycle/relationships.py new file mode 100644 index 0000000..12f9a96 --- /dev/null +++ b/leadforge/schemes/lifecycle/relationships.py @@ -0,0 +1,22 @@ +"""Lifecycle (``b2b_saas_ltv_v1``) foreign-key constraints. + +Kept in the lifecycle scheme package, separate from the lead-scoring +``ALL_CONSTRAINTS``. Reuses the shared :class:`~leadforge.schema.relationships.FKConstraint` +primitive. The lifecycle ``customers`` table links only to ``accounts`` +(independent generation, no ``opportunities`` table), so there is no +customer→opportunity FK despite the nullable ``opportunity_id`` column being +reserved for future chained generation. +""" + +from __future__ import annotations + +from leadforge.schema.relationships import FKConstraint + +LIFECYCLE_CONSTRAINTS: tuple[FKConstraint, ...] = ( + FKConstraint("customers", "account_id", "accounts", "account_id"), + FKConstraint("subscriptions", "customer_id", "customers", "customer_id"), + FKConstraint("subscription_events", "subscription_id", "subscriptions", "subscription_id"), + FKConstraint("subscription_events", "customer_id", "customers", "customer_id"), + FKConstraint("health_signals", "customer_id", "customers", "customer_id"), + FKConstraint("invoices", "customer_id", "customers", "customer_id"), +) diff --git a/tests/schemes/lifecycle/__init__.py b/tests/schemes/lifecycle/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/schema/test_lifecycle_entities.py b/tests/schemes/lifecycle/test_entities.py similarity index 96% rename from tests/schema/test_lifecycle_entities.py rename to tests/schemes/lifecycle/test_entities.py index 5a2de89..41f0df7 100644 --- a/tests/schema/test_lifecycle_entities.py +++ b/tests/schemes/lifecycle/test_entities.py @@ -11,24 +11,19 @@ import pytest from leadforge.core.ids import ID_PREFIXES, make_id -from leadforge.schema.entities 
import ( - ALL_ROW_TYPES, +from leadforge.schema.entities import ALL_ROW_TYPES, TABLE_NAMES, AccountRow +from leadforge.schema.relationships import ALL_CONSTRAINTS, FKConstraint +from leadforge.schema.tables import read_parquet, write_parquet +from leadforge.schemes.lifecycle.entities import ( LIFECYCLE_ROW_TYPES, LIFECYCLE_TABLE_NAMES, - TABLE_NAMES, - AccountRow, CustomerLifecycleRow, HealthSignalRow, InvoiceRow, SubscriptionEventRow, SubscriptionLifecycleRow, ) -from leadforge.schema.relationships import ( - ALL_CONSTRAINTS, - LIFECYCLE_CONSTRAINTS, - FKConstraint, -) -from leadforge.schema.tables import read_parquet, write_parquet +from leadforge.schemes.lifecycle.relationships import LIFECYCLE_CONSTRAINTS # --------------------------------------------------------------------------- # Row factories diff --git a/tests/schemes/test_registry.py b/tests/schemes/test_registry.py index ad129e7..633f239 100644 --- a/tests/schemes/test_registry.py +++ b/tests/schemes/test_registry.py @@ -27,6 +27,24 @@ def test_lead_scoring_registered() -> None: assert get_scheme("lead_scoring") is LEAD_SCORING_SCHEME +def test_lifecycle_scheme_registered() -> None: + from leadforge.schemes.lifecycle import LIFECYCLE_SCHEME + + assert "lifecycle" in available_schemes() + assert get_scheme("lifecycle") is LIFECYCLE_SCHEME + assert LIFECYCLE_SCHEME.name == "lifecycle" + + +def test_lifecycle_scheme_is_a_stub() -> None: + # Pipeline not implemented yet (built across LTV-M3…M6); calling it must + # fail loudly rather than silently doing nothing. + sch = get_scheme("lifecycle") + with pytest.raises(NotImplementedError): + sch.build_world(None, None) # type: ignore[arg-type] + with pytest.raises(NotImplementedError): + sch.write_bundle(None, "out") # type: ignore[arg-type] + + def test_lead_scoring_scheme_name() -> None: assert LEAD_SCORING_SCHEME.name == "lead_scoring"