From 90a6875d4135e535cb8e53b2878910c4aed3b4da Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Mon, 27 Apr 2026 16:03:57 +0300 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20Milestone=205=20=E2=80=94=20populat?= =?UTF-8?q?ion=20generation=20and=20latent=20state=20initialisation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements build_population() in leadforge/simulation/population.py: - AccountRow generation: industry, region, employee/revenue/maturity bands, account created_at spread 30-730 days before world base date - ContactRow generation: persona-driven title/role/buyer_role, conditional account FK, contact created_at anchored to parent account - LeadRow generation: GTM-weighted lead_source, rep assignment from internal pool, lead_created_at within 30-day base window; initial stage = mql - LatentState: 8 hidden traits across 3 entity types, all in [0,1], sampled from clipped Gaussians with motif-family-aware mean biases - FK invariant: lead.account_id always equals contact.account_id - All randomness via RNGRoot named substreams — fully deterministic 26 tests: counts, determinism, FK integrity, latent range/completeness, motif bias properties (fit_dominant vs buying_committee_friction), and observable field validity. Co-Authored-By: Claude Sonnet 4.6 --- .agent-plan.md | 33 ++- leadforge/simulation/population.py | 388 ++++++++++++++++++++++++++++ tests/simulation/__init__.py | 0 tests/simulation/test_population.py | 351 +++++++++++++++++++++++++ 4 files changed, 761 insertions(+), 11 deletions(-) create mode 100644 leadforge/simulation/population.py create mode 100644 tests/simulation/__init__.py create mode 100644 tests/simulation/test_population.py diff --git a/.agent-plan.md b/.agent-plan.md index 0cc6073..317a687 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -6,33 +6,44 @@ ## Current System State -**v0.3.0 in progress — Milestone 4 complete (PR open).** Hidden world graph fully implemented: -typed node system, DAG-validated WorldGraph, all 5 v1 motif families, stochastic rewiring, and -graph sampler. 327 tests passing. +**v0.3.0 in progress — Milestone 5 complete (PR open).** Population generation fully +implemented: accounts, contacts, leads with all observable fields, full latent state +(8 traits across 3 entity types), motif-family-aware bias, and FK integrity guaranteed. +358 tests passing. --- -## Active Task Breakdown — Milestone 5: Population Generation (v0.3.0) +## Active Task Breakdown — Milestone 6: Mechanism Layer v1 (v0.3.0) -Goal: Generate the base commercial world population before dynamic events begin. +Goal: Implement the static and dynamic mechanisms that drive simulation behavior. -- [ ] **1. Account generation** (`simulation/population.py`) — accounts with latent traits -- [ ] **2. Contact generation** — contacts conditional on account properties -- [ ] **3. Lead creation** — leads anchored to contacts/accounts -- [ ] **4. Latent-state initialisation** — sample core latent traits tied to graph/motif +- [ ] **1. Base mechanism interface** (`mechanisms/base.py`) +- [ ] **2. Static mechanisms** — categorical, ordinal, bounded-numeric draws (`mechanisms/static.py`) +- [ ] **3. Transition mechanisms** — lead-stage advancement logic (`mechanisms/transitions.py`) +- [ ] **4. Score/hazard mechanisms** — latent-to-observable scoring, conversion hazard (`mechanisms/scores.py`, `mechanisms/hazards.py`) +- [ ] **5. Measurement mechanisms** — noisy proxy observation of latent traits (`mechanisms/measurement.py`) --- ## Context Pointers -- Milestone 5 scope: `docs/leadforge_implementation_plan.md` §8 "Milestone 5" -- Structure spec: `docs/leadforge_architecture_spec.md` §11 +- Milestone 6 scope: `docs/leadforge_implementation_plan.md` §9 "Milestone 6" +- Mechanism types: `docs/leadforge_architecture_spec.md` §10 "Mechanism layer" - Latent variables: `docs/leadforge_architecture_spec.md` §9 --- ## Completed Phases +### Milestone 5 — Population Generation ✓ (v0.3.0 in PR) +- `leadforge/simulation/population.py`: `build_population()` — accounts (3 latent traits), + contacts (4 latent traits, conditional on account), leads (1 latent trait, FK-consistent), + `LatentState` and `PopulationResult` output types +- Motif-family latent biases (`_MOTIF_LATENT_BIAS`) linking world structure to population +- `tests/simulation/test_population.py`: 26 tests covering counts, determinism, FK integrity, + latent value ranges, trait completeness, motif bias properties, and observable field validity +- Total: 358 tests passing + ### Milestone 4 — World Structure Layer ✓ (v0.3.0 in PR) - `leadforge/structure/node_types.py`: `NodeType` enum (9 categories); `ROOT_ELIGIBLE`, `REQUIRES_PARENT`, `LEAF_ONLY` constraint sets diff --git a/leadforge/simulation/population.py b/leadforge/simulation/population.py new file mode 100644 index 0000000..3b19d18 --- /dev/null +++ b/leadforge/simulation/population.py @@ -0,0 +1,388 @@ +"""Population generation — accounts, contacts, leads, and latent states. + +:func:`build_population` is the single entry point consumed by the +simulation layer. All randomness derives from named :class:`~leadforge.core.rng.RNGRoot` +substreams so the full population is deterministic given ``config.seed``. + +Latent state +------------ +Each entity carries hidden ground-truth traits that drive simulation +mechanics but are **never** directly exposed in ``student_public`` mode: + +- **account** — ``latent_account_fit``, ``latent_budget_readiness``, + ``latent_process_maturity`` +- **contact** — ``latent_problem_awareness``, ``latent_contact_authority``, + ``latent_responsiveness``, ``latent_engagement_propensity`` +- **lead** — ``latent_sales_friction`` + +All values are floats in [0, 1] sampled from a clipped Gaussian. The +active motif family shifts the mean of selected traits to create a +structurally coherent world (e.g. ``fit_dominant`` raises the mean of +``latent_account_fit``). +""" + +from __future__ import annotations + +import random +from dataclasses import dataclass, field +from datetime import date, timedelta +from typing import TYPE_CHECKING + +from leadforge.core.ids import ID_PREFIXES, make_id +from leadforge.core.models import GenerationConfig +from leadforge.core.rng import RNGRoot +from leadforge.schema.entities import AccountRow, ContactRow, LeadRow + +if TYPE_CHECKING: + from leadforge.narrative.spec import NarrativeSpec + from leadforge.structure.graph import WorldGraph + + +# --------------------------------------------------------------------------- +# Output types +# --------------------------------------------------------------------------- + + +@dataclass +class LatentState: + """Hidden ground-truth latent variables for all entities in one world. + + Each mapping is ``entity_id → {trait_name: float_in_[0,1]}``. + """ + + account_latents: dict[str, dict[str, float]] = field(default_factory=dict) + contact_latents: dict[str, dict[str, float]] = field(default_factory=dict) + lead_latents: dict[str, dict[str, float]] = field(default_factory=dict) + + +@dataclass +class PopulationResult: + """Output of one :func:`build_population` call.""" + + accounts: list[AccountRow] + contacts: list[ContactRow] + leads: list[LeadRow] + latent_state: LatentState + + +# --------------------------------------------------------------------------- +# Internal constants +# --------------------------------------------------------------------------- + +_EMPLOYEE_BANDS = ("200-499", "500-999", "1000-1999", "2000+") +_EMPLOYEE_BAND_WEIGHTS = (0.35, 0.35, 0.20, 0.10) + +_REVENUE_BANDS = ("$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+") +_REVENUE_BAND_WEIGHTS = (0.25, 0.40, 0.25, 0.10) + +_PROCESS_MATURITY_BANDS = ("low", "medium", "high") +_PROCESS_MATURITY_BAND_WEIGHTS = (0.30, 0.45, 0.25) +_PROCESS_MATURITY_MEANS = {"low": 0.25, "medium": 0.50, "high": 0.75} + +_SENIORITY_LEVELS = ("individual_contributor", "manager", "director", "vp", "c_suite") +_SENIORITY_WEIGHTS = (0.25, 0.30, 0.25, 0.15, 0.05) + +_EMAIL_DOMAIN_TYPES = ("corporate", "personal", "unknown") +_EMAIL_DOMAIN_WEIGHTS = (0.80, 0.12, 0.08) + +# Base reference date: all leads are created within a 30-day window starting here. +_WORLD_BASE_DATE = date(2024, 1, 1) + +# Number of internal sales-rep entities used for lead assignment. +_N_REPS = 10 + +# Motif-family-specific additive bias on the default 0.50 latent mean. +# Only traits explicitly listed are shifted; all others stay at 0.50. +_MOTIF_LATENT_BIAS: dict[str, dict[str, float]] = { + "fit_dominant": { + "latent_account_fit": 0.10, + "latent_budget_readiness": 0.05, + }, + "intent_dominant": { + "latent_engagement_propensity": 0.12, + "latent_problem_awareness": 0.10, + }, + "sales_execution_sensitive": { + "latent_sales_friction": 0.12, + "latent_responsiveness": -0.08, + }, + "demo_trial_mediated": { + "latent_engagement_propensity": 0.08, + "latent_problem_awareness": 0.06, + }, + "buying_committee_friction": { + "latent_contact_authority": -0.10, + "latent_sales_friction": 0.15, + }, +} + +# Map GTM channel names → GtmMotionSpec attribute names. +_CHANNEL_TO_SHARE_ATTR: dict[str, str] = { + "inbound_marketing": "inbound_share", + "sdr_outbound": "outbound_share", + "partner_referral": "partner_share", +} + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + + +def build_population( + config: GenerationConfig, + narrative: NarrativeSpec, + world_graph: WorldGraph, +) -> PopulationResult: + """Generate accounts, contacts, leads, and their latent states. + + All randomness is derived from named substreams of ``RNGRoot(config.seed)`` + so the result is fully deterministic for a given ``(config, world_graph)``. + + Args: + config: Fully resolved generation configuration (counts, seed, etc.). + narrative: Parsed narrative spec providing ICP industries, geographies, + personas, and GTM channel mix. + world_graph: The sampled hidden world graph; its ``motif_family`` is used + to apply latent-trait mean biases that make the world structurally + coherent. + + Returns: + A :class:`PopulationResult` containing the three entity lists and the + full :class:`LatentState`. + """ + root = RNGRoot(config.seed) + bias = _MOTIF_LATENT_BIAS.get(world_graph.motif_family, {}) + + accounts, acct_latents = _generate_accounts( + n=config.n_accounts, + narrative=narrative, + bias=bias, + rng=root.child("population_accounts"), + ) + + contacts, cont_latents = _generate_contacts( + n=config.n_contacts, + accounts=accounts, + narrative=narrative, + bias=bias, + rng=root.child("population_contacts"), + ) + + leads, lead_latents = _generate_leads( + n=config.n_leads, + contacts=contacts, + narrative=narrative, + bias=bias, + rng=root.child("population_leads"), + ) + + return PopulationResult( + accounts=accounts, + contacts=contacts, + leads=leads, + latent_state=LatentState( + account_latents=acct_latents, + contact_latents=cont_latents, + lead_latents=lead_latents, + ), + ) + + +# --------------------------------------------------------------------------- +# Account generation +# --------------------------------------------------------------------------- + + +def _generate_accounts( + n: int, + narrative: NarrativeSpec, + bias: dict[str, float], + rng: random.Random, +) -> tuple[list[AccountRow], dict[str, dict[str, float]]]: + industries = list(narrative.market.icp_industries) + geographies = list(narrative.market.geographies) + + rows: list[AccountRow] = [] + latents: dict[str, dict[str, float]] = {} + + for i in range(1, n + 1): + acct_id = make_id(ID_PREFIXES["account"], i) + + industry = rng.choice(industries) + region = rng.choice(geographies) + employee_band = rng.choices(_EMPLOYEE_BANDS, weights=_EMPLOYEE_BAND_WEIGHTS, k=1)[0] + revenue_band = rng.choices(_REVENUE_BANDS, weights=_REVENUE_BAND_WEIGHTS, k=1)[0] + maturity_band = rng.choices( + _PROCESS_MATURITY_BANDS, weights=_PROCESS_MATURITY_BAND_WEIGHTS, k=1 + )[0] + + days_before = rng.randint(30, 730) + created_at = (_WORLD_BASE_DATE - timedelta(days=days_before)).isoformat() + + rows.append( + AccountRow( + account_id=acct_id, + company_name=f"Company {acct_id}", + industry=industry, + region=region, + employee_band=employee_band, + estimated_revenue_band=revenue_band, + process_maturity_band=maturity_band, + created_at=created_at, + ) + ) + latents[acct_id] = { + "latent_account_fit": _sample_latent(rng, 0.50 + bias.get("latent_account_fit", 0.0)), + "latent_budget_readiness": _sample_latent( + rng, 0.50 + bias.get("latent_budget_readiness", 0.0) + ), + # Correlated with observable band; not directly biased by motif. + "latent_process_maturity": _sample_latent( + rng, _PROCESS_MATURITY_MEANS[maturity_band], std=0.15 + ), + } + + return rows, latents + + +# --------------------------------------------------------------------------- +# Contact generation +# --------------------------------------------------------------------------- + + +def _generate_contacts( + n: int, + accounts: list[AccountRow], + narrative: NarrativeSpec, + bias: dict[str, float], + rng: random.Random, +) -> tuple[list[ContactRow], dict[str, dict[str, float]]]: + personas = list(narrative.personas) + + rows: list[ContactRow] = [] + latents: dict[str, dict[str, float]] = {} + + for i in range(1, n + 1): + cnt_id = make_id(ID_PREFIXES["contact"], i) + account = rng.choice(accounts) + + persona = rng.choice(personas) + job_title = rng.choice(list(persona.title_variants)) + role_function = persona.role + buyer_role = persona.decision_authority + seniority = rng.choices(_SENIORITY_LEVELS, weights=_SENIORITY_WEIGHTS, k=1)[0] + email_domain = rng.choices(_EMAIL_DOMAIN_TYPES, weights=_EMAIL_DOMAIN_WEIGHTS, k=1)[0] + + # Contacts are created at or shortly after their account. + acct_date = date.fromisoformat(account.created_at) + days_after = rng.randint(0, 30) + created_at = (acct_date + timedelta(days=days_after)).isoformat() + + rows.append( + ContactRow( + contact_id=cnt_id, + account_id=account.account_id, + job_title=job_title, + role_function=role_function, + seniority=seniority, + buyer_role=buyer_role, + email_domain_type=email_domain, + created_at=created_at, + ) + ) + latents[cnt_id] = { + "latent_problem_awareness": _sample_latent( + rng, 0.50 + bias.get("latent_problem_awareness", 0.0) + ), + "latent_contact_authority": _sample_latent( + rng, 0.50 + bias.get("latent_contact_authority", 0.0) + ), + "latent_responsiveness": _sample_latent( + rng, 0.50 + bias.get("latent_responsiveness", 0.0) + ), + "latent_engagement_propensity": _sample_latent( + rng, 0.50 + bias.get("latent_engagement_propensity", 0.0) + ), + } + + return rows, latents + + +# --------------------------------------------------------------------------- +# Lead generation +# --------------------------------------------------------------------------- + + +def _generate_leads( + n: int, + contacts: list[ContactRow], + narrative: NarrativeSpec, + bias: dict[str, float], + rng: random.Random, +) -> tuple[list[LeadRow], dict[str, dict[str, float]]]: + channels, channel_weights = _channel_weights(narrative) + rep_ids = [make_id(ID_PREFIXES["rep"], i) for i in range(1, _N_REPS + 1)] + + rows: list[LeadRow] = [] + latents: dict[str, dict[str, float]] = {} + + for i in range(1, n + 1): + lead_id = make_id(ID_PREFIXES["lead"], i) + contact = rng.choice(contacts) + + lead_source = rng.choices(channels, weights=channel_weights, k=1)[0] + days_offset = rng.randint(0, 29) + lead_created_at = (_WORLD_BASE_DATE + timedelta(days=days_offset)).isoformat() + owner_rep_id = rng.choice(rep_ids) + + rows.append( + LeadRow( + lead_id=lead_id, + contact_id=contact.contact_id, + account_id=contact.account_id, + lead_created_at=lead_created_at, + lead_source=lead_source, + first_touch_channel=lead_source, + current_stage="mql", + owner_rep_id=owner_rep_id, + is_mql=True, + is_sql=False, + converted_within_90_days=False, + conversion_timestamp=None, + ) + ) + latents[lead_id] = { + "latent_sales_friction": _sample_latent( + rng, 0.50 + bias.get("latent_sales_friction", 0.0) + ), + } + + return rows, latents + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _sample_latent(rng: random.Random, mean: float = 0.50, std: float = 0.20) -> float: + """Draw a latent trait value in [0, 1] from a clipped Gaussian.""" + mean = max(0.10, min(0.90, mean)) + return max(0.0, min(1.0, rng.gauss(mean, std))) + + +def _channel_weights(narrative: NarrativeSpec) -> tuple[list[str], list[float]]: + """Return (channels, weights) lists ordered as in the GTM spec.""" + gtm = narrative.gtm_motion + channels: list[str] = [] + weights: list[float] = [] + for ch in gtm.channels: + attr = _CHANNEL_TO_SHARE_ATTR.get(ch) + channels.append(ch) + weights.append(float(getattr(gtm, attr)) if attr else 1.0 / len(gtm.channels)) + # Normalise in case shares don't sum to exactly 1.0 + total = sum(weights) + if total > 0: + weights = [w / total for w in weights] + return channels, weights diff --git a/tests/simulation/__init__.py b/tests/simulation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/simulation/test_population.py b/tests/simulation/test_population.py new file mode 100644 index 0000000..61308d5 --- /dev/null +++ b/tests/simulation/test_population.py @@ -0,0 +1,351 @@ +"""Tests for leadforge.simulation.population — build_population.""" + +from __future__ import annotations + +from leadforge.api.generator import Generator +from leadforge.core.ids import ID_PREFIXES, make_id +from leadforge.core.models import GenerationConfig +from leadforge.simulation.population import ( + _N_REPS, + PopulationResult, + build_population, +) +from leadforge.structure.sampler import sample_hidden_graph + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +_SEED = 42 +_N_ACCOUNTS = 50 +_N_CONTACTS = 120 +_N_LEADS = 200 + + +def _make_result(seed: int = _SEED, motif: str | None = None) -> PopulationResult: + config = GenerationConfig( + seed=seed, + n_accounts=_N_ACCOUNTS, + n_contacts=_N_CONTACTS, + n_leads=_N_LEADS, + ) + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=seed) + narrative = gen.world_spec.narrative + assert narrative is not None + graph = sample_hidden_graph(seed=seed, motif_family_name=motif) + return build_population(config, narrative, graph) + + +# --------------------------------------------------------------------------- +# Counts +# --------------------------------------------------------------------------- + + +def test_population_counts() -> None: + result = _make_result() + assert len(result.accounts) == _N_ACCOUNTS + assert len(result.contacts) == _N_CONTACTS + assert len(result.leads) == _N_LEADS + + +def test_latent_state_counts() -> None: + result = _make_result() + assert len(result.latent_state.account_latents) == _N_ACCOUNTS + assert len(result.latent_state.contact_latents) == _N_CONTACTS + assert len(result.latent_state.lead_latents) == _N_LEADS + + +# --------------------------------------------------------------------------- +# Determinism +# --------------------------------------------------------------------------- + + +def test_build_population_is_deterministic() -> None: + r1 = _make_result(seed=7) + r2 = _make_result(seed=7) + assert [a.account_id for a in r1.accounts] == [a.account_id for a in r2.accounts] + assert [c.contact_id for c in r1.contacts] == [c.contact_id for c in r2.contacts] + assert [lead.lead_id for lead in r1.leads] == [lead.lead_id for lead in r2.leads] + assert r1.latent_state.account_latents == r2.latent_state.account_latents + assert r1.latent_state.contact_latents == r2.latent_state.contact_latents + assert r1.latent_state.lead_latents == r2.latent_state.lead_latents + + +def test_different_seeds_give_different_results() -> None: + r1 = _make_result(seed=1) + r2 = _make_result(seed=2) + assert r1.latent_state.account_latents != r2.latent_state.account_latents + + +# --------------------------------------------------------------------------- +# Entity IDs +# --------------------------------------------------------------------------- + + +def test_account_ids_are_sequential_and_unique() -> None: + result = _make_result() + ids = [a.account_id for a in result.accounts] + expected = [make_id(ID_PREFIXES["account"], i) for i in range(1, _N_ACCOUNTS + 1)] + assert ids == expected + + +def test_contact_ids_are_sequential_and_unique() -> None: + result = _make_result() + ids = [c.contact_id for c in result.contacts] + expected = [make_id(ID_PREFIXES["contact"], i) for i in range(1, _N_CONTACTS + 1)] + assert ids == expected + + +def test_lead_ids_are_sequential_and_unique() -> None: + result = _make_result() + ids = [lead.lead_id for lead in result.leads] + expected = [make_id(ID_PREFIXES["lead"], i) for i in range(1, _N_LEADS + 1)] + assert ids == expected + + +# --------------------------------------------------------------------------- +# FK integrity +# --------------------------------------------------------------------------- + + +def test_contact_account_ids_are_valid() -> None: + result = _make_result() + valid_acct_ids = {a.account_id for a in result.accounts} + for c in result.contacts: + assert c.account_id in valid_acct_ids, f"contact {c.contact_id} → unknown account" + + +def test_lead_contact_ids_are_valid() -> None: + result = _make_result() + valid_cnt_ids = {c.contact_id for c in result.contacts} + for lead in result.leads: + assert lead.contact_id in valid_cnt_ids, f"lead {lead.lead_id} → unknown contact" + + +def test_lead_account_ids_are_valid() -> None: + result = _make_result() + valid_acct_ids = {a.account_id for a in result.accounts} + for lead in result.leads: + assert lead.account_id in valid_acct_ids, f"lead {lead.lead_id} → unknown account" + + +def test_lead_contact_account_consistency() -> None: + """lead.account_id must match the account_id of lead.contact_id.""" + result = _make_result() + contact_to_account = {c.contact_id: c.account_id for c in result.contacts} + for lead in result.leads: + assert lead.account_id == contact_to_account[lead.contact_id] + + +# --------------------------------------------------------------------------- +# Latent value ranges and completeness +# --------------------------------------------------------------------------- + +_EXPECTED_ACCOUNT_TRAITS = { + "latent_account_fit", + "latent_budget_readiness", + "latent_process_maturity", +} +_EXPECTED_CONTACT_TRAITS = { + "latent_problem_awareness", + "latent_contact_authority", + "latent_responsiveness", + "latent_engagement_propensity", +} +_EXPECTED_LEAD_TRAITS = {"latent_sales_friction"} + + +def test_account_latent_traits_present() -> None: + result = _make_result() + for acct_id, traits in result.latent_state.account_latents.items(): + assert traits.keys() == _EXPECTED_ACCOUNT_TRAITS, f"account {acct_id}" + + +def test_contact_latent_traits_present() -> None: + result = _make_result() + for cnt_id, traits in result.latent_state.contact_latents.items(): + assert traits.keys() == _EXPECTED_CONTACT_TRAITS, f"contact {cnt_id}" + + +def test_lead_latent_traits_present() -> None: + result = _make_result() + for lead_id, traits in result.latent_state.lead_latents.items(): + assert traits.keys() == _EXPECTED_LEAD_TRAITS, f"lead {lead_id}" + + +def test_all_latent_values_in_unit_interval() -> None: + result = _make_result() + for store in ( + result.latent_state.account_latents, + result.latent_state.contact_latents, + result.latent_state.lead_latents, + ): + for entity_id, traits in store.items(): + for trait, val in traits.items(): + assert 0.0 <= val <= 1.0, f"{entity_id}.{trait} = {val}" + + +# --------------------------------------------------------------------------- +# Lead observable fields +# --------------------------------------------------------------------------- + + +def test_lead_initial_stage_is_mql() -> None: + result = _make_result() + for lead in result.leads: + assert lead.current_stage == "mql" + assert lead.is_mql is True + assert lead.is_sql is False + assert lead.converted_within_90_days is False + assert lead.conversion_timestamp is None + + +def test_lead_source_is_valid_channel() -> None: + from leadforge.api.generator import Generator + + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=_SEED) + narrative = gen.world_spec.narrative + assert narrative is not None + valid_channels = set(narrative.gtm_motion.channels) + result = _make_result() + for lead in result.leads: + assert lead.lead_source in valid_channels + assert lead.first_touch_channel == lead.lead_source + + +def test_lead_owner_rep_id_is_valid() -> None: + result = _make_result() + valid_rep_ids = {make_id(ID_PREFIXES["rep"], i) for i in range(1, _N_REPS + 1)} + for lead in result.leads: + assert lead.owner_rep_id in valid_rep_ids + + +def test_lead_created_at_within_base_window() -> None: + from datetime import date + + result = _make_result() + base = date(2024, 1, 1) + end = date(2024, 1, 30) + for lead in result.leads: + d = date.fromisoformat(lead.lead_created_at) + assert base <= d <= end, f"lead {lead.lead_id} created_at {d} out of window" + + +# --------------------------------------------------------------------------- +# Account observable fields +# --------------------------------------------------------------------------- + + +def test_account_industry_is_valid() -> None: + from leadforge.api.generator import Generator + + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=_SEED) + narrative = gen.world_spec.narrative + assert narrative is not None + valid = set(narrative.market.icp_industries) + result = _make_result() + for a in result.accounts: + assert a.industry in valid + + +def test_account_region_is_valid() -> None: + from leadforge.api.generator import Generator + + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=_SEED) + narrative = gen.world_spec.narrative + assert narrative is not None + valid = set(narrative.market.geographies) + result = _make_result() + for a in result.accounts: + assert a.region in valid + + +# --------------------------------------------------------------------------- +# Motif latent bias (property test across seeds) +# --------------------------------------------------------------------------- + + +def test_fit_dominant_raises_account_fit_mean() -> None: + """fit_dominant worlds should have higher mean latent_account_fit than + buying_committee_friction worlds across a range of seeds.""" + fit_means = [] + friction_means = [] + for seed in range(15): + config = GenerationConfig(seed=seed, n_accounts=200, n_contacts=400, n_leads=600) + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=seed) + narrative = gen.world_spec.narrative + assert narrative is not None + + g_fit = sample_hidden_graph(seed=seed, motif_family_name="fit_dominant") + r_fit = build_population(config, narrative, g_fit) + fit_means.append( + sum(t["latent_account_fit"] for t in r_fit.latent_state.account_latents.values()) + / config.n_accounts + ) + + g_fric = sample_hidden_graph(seed=seed, motif_family_name="buying_committee_friction") + r_fric = build_population(config, narrative, g_fric) + friction_means.append( + sum(t["latent_account_fit"] for t in r_fric.latent_state.account_latents.values()) + / config.n_accounts + ) + + avg_fit = sum(fit_means) / len(fit_means) + avg_fric = sum(friction_means) / len(friction_means) + assert avg_fit > avg_fric, ( + f"Expected fit_dominant mean ({avg_fit:.3f}) > buying_committee_friction ({avg_fric:.3f})" + ) + + +def test_buying_committee_friction_lowers_contact_authority() -> None: + """buying_committee_friction worlds should have lower mean latent_contact_authority.""" + bc_means = [] + fd_means = [] + for seed in range(15): + config = GenerationConfig(seed=seed, n_accounts=100, n_contacts=300, n_leads=400) + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=seed) + narrative = gen.world_spec.narrative + assert narrative is not None + + g_bc = sample_hidden_graph(seed=seed, motif_family_name="buying_committee_friction") + r_bc = build_population(config, narrative, g_bc) + bc_means.append( + sum(t["latent_contact_authority"] for t in r_bc.latent_state.contact_latents.values()) + / config.n_contacts + ) + + g_fd = sample_hidden_graph(seed=seed, motif_family_name="fit_dominant") + r_fd = build_population(config, narrative, g_fd) + fd_means.append( + sum(t["latent_contact_authority"] for t in r_fd.latent_state.contact_latents.values()) + / config.n_contacts + ) + + avg_bc = sum(bc_means) / len(bc_means) + avg_fd = sum(fd_means) / len(fd_means) + assert avg_bc < avg_fd, ( + f"Expected buying_committee_friction mean ({avg_bc:.3f}) < fit_dominant ({avg_fd:.3f})" + ) + + +# --------------------------------------------------------------------------- +# Latent state entity-ID alignment +# --------------------------------------------------------------------------- + + +def test_latent_state_account_ids_match_rows() -> None: + result = _make_result() + row_ids = {a.account_id for a in result.accounts} + assert set(result.latent_state.account_latents.keys()) == row_ids + + +def test_latent_state_contact_ids_match_rows() -> None: + result = _make_result() + row_ids = {c.contact_id for c in result.contacts} + assert set(result.latent_state.contact_latents.keys()) == row_ids + + +def test_latent_state_lead_ids_match_rows() -> None: + result = _make_result() + row_ids = {lead.lead_id for lead in result.leads} + assert set(result.latent_state.lead_latents.keys()) == row_ids From 0559a83d15f375cdfa53f9d4abb3b38ed15e91e4 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 28 Apr 2026 06:20:08 +0300 Subject: [PATCH 2/2] fix: address Copilot round-1 review on PR #10 - Docstring: correct determinism contract to include narrative and world_graph.motif_family (COPILOT-1) - build_population: add _validate_narrative() up-front guard that raises InvalidConfigError for empty industries, geographies, personas, or channels (COPILOT-2) - _channel_weights: fall back to uniform distribution when all GTM shares sum to zero, preventing random.choices ValueError (COPILOT-3) - 5 new tests covering all three fixes (363 total passing) Co-Authored-By: Claude Sonnet 4.6 --- leadforge/simulation/population.py | 40 ++++++++++++-- tests/simulation/test_population.py | 83 +++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+), 5 deletions(-) diff --git a/leadforge/simulation/population.py b/leadforge/simulation/population.py index 3b19d18..9f84e0b 100644 --- a/leadforge/simulation/population.py +++ b/leadforge/simulation/population.py @@ -28,6 +28,7 @@ from datetime import date, timedelta from typing import TYPE_CHECKING +from leadforge.core.exceptions import InvalidConfigError from leadforge.core.ids import ID_PREFIXES, make_id from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot @@ -137,12 +138,15 @@ def build_population( """Generate accounts, contacts, leads, and their latent states. All randomness is derived from named substreams of ``RNGRoot(config.seed)`` - so the result is fully deterministic for a given ``(config, world_graph)``. + so the result is fully deterministic for a given + ``(config, narrative, world_graph.motif_family)``. Args: config: Fully resolved generation configuration (counts, seed, etc.). narrative: Parsed narrative spec providing ICP industries, geographies, - personas, and GTM channel mix. + personas, and GTM channel mix. Must have non-empty + ``market.icp_industries``, ``market.geographies``, ``personas``, + and ``gtm_motion.channels``. world_graph: The sampled hidden world graph; its ``motif_family`` is used to apply latent-trait mean biases that make the world structurally coherent. @@ -150,7 +154,11 @@ def build_population( Returns: A :class:`PopulationResult` containing the three entity lists and the full :class:`LatentState`. + + Raises: + InvalidConfigError: If any required narrative collection is empty. """ + _validate_narrative(narrative) root = RNGRoot(config.seed) bias = _MOTIF_LATENT_BIAS.get(world_graph.motif_family, {}) @@ -366,6 +374,20 @@ def _generate_leads( # --------------------------------------------------------------------------- +def _validate_narrative(narrative: NarrativeSpec) -> None: + """Raise :exc:`InvalidConfigError` if any collection required by population + generation is empty.""" + checks: list[tuple[object, str]] = [ + (narrative.market.icp_industries, "narrative.market.icp_industries"), + (narrative.market.geographies, "narrative.market.geographies"), + (narrative.personas, "narrative.personas"), + (narrative.gtm_motion.channels, "narrative.gtm_motion.channels"), + ] + for collection, name in checks: + if not collection: + raise InvalidConfigError(f"{name} must not be empty") + + def _sample_latent(rng: random.Random, mean: float = 0.50, std: float = 0.20) -> float: """Draw a latent trait value in [0, 1] from a clipped Gaussian.""" mean = max(0.10, min(0.90, mean)) @@ -373,16 +395,24 @@ def _sample_latent(rng: random.Random, mean: float = 0.50, std: float = 0.20) -> def _channel_weights(narrative: NarrativeSpec) -> tuple[list[str], list[float]]: - """Return (channels, weights) lists ordered as in the GTM spec.""" + """Return (channels, weights) lists ordered as in the GTM spec. + + If the per-channel share attributes sum to zero (all shares are 0), + falls back to a uniform distribution so ``random.choices`` never + receives an all-zero weight list. + """ gtm = narrative.gtm_motion channels: list[str] = [] weights: list[float] = [] for ch in gtm.channels: attr = _CHANNEL_TO_SHARE_ATTR.get(ch) channels.append(ch) - weights.append(float(getattr(gtm, attr)) if attr else 1.0 / len(gtm.channels)) - # Normalise in case shares don't sum to exactly 1.0 + weights.append(float(getattr(gtm, attr)) if attr else 0.0) total = sum(weights) if total > 0: weights = [w / total for w in weights] + else: + # All shares are zero — fall back to uniform. + uniform = 1.0 / len(channels) + weights = [uniform] * len(channels) return channels, weights diff --git a/tests/simulation/test_population.py b/tests/simulation/test_population.py index 61308d5..c41d657 100644 --- a/tests/simulation/test_population.py +++ b/tests/simulation/test_population.py @@ -2,12 +2,17 @@ from __future__ import annotations +import pytest + from leadforge.api.generator import Generator +from leadforge.core.exceptions import InvalidConfigError from leadforge.core.ids import ID_PREFIXES, make_id from leadforge.core.models import GenerationConfig +from leadforge.narrative.spec import NarrativeSpec from leadforge.simulation.population import ( _N_REPS, PopulationResult, + _channel_weights, build_population, ) from leadforge.structure.sampler import sample_hidden_graph @@ -349,3 +354,81 @@ def test_latent_state_lead_ids_match_rows() -> None: result = _make_result() row_ids = {lead.lead_id for lead in result.leads} assert set(result.latent_state.lead_latents.keys()) == row_ids + + +# --------------------------------------------------------------------------- +# Narrative validation (COPILOT-2 / COPILOT-3) +# --------------------------------------------------------------------------- + + +def _base_narrative() -> NarrativeSpec: + """Return a minimal valid NarrativeSpec for mutation tests.""" + gen = Generator.from_recipe("b2b_saas_procurement_v1", seed=0) + narrative = gen.world_spec.narrative + assert narrative is not None + return narrative + + +def _build_with_narrative(narrative: NarrativeSpec) -> PopulationResult: + config = GenerationConfig(seed=0, n_accounts=10, n_contacts=20, n_leads=30) + graph = sample_hidden_graph(seed=0) + return build_population(config, narrative, graph) + + +def test_empty_industries_raises() -> None: + import dataclasses + + narrative = _base_narrative() + bad_market = dataclasses.replace(narrative.market, icp_industries=()) + bad_narrative = dataclasses.replace(narrative, market=bad_market) + with pytest.raises(InvalidConfigError, match="icp_industries"): + _build_with_narrative(bad_narrative) + + +def test_empty_geographies_raises() -> None: + import dataclasses + + narrative = _base_narrative() + bad_market = dataclasses.replace(narrative.market, geographies=()) + bad_narrative = dataclasses.replace(narrative, market=bad_market) + with pytest.raises(InvalidConfigError, match="geographies"): + _build_with_narrative(bad_narrative) + + +def test_empty_personas_raises() -> None: + import dataclasses + + narrative = _base_narrative() + bad_narrative = dataclasses.replace(narrative, personas=()) + with pytest.raises(InvalidConfigError, match="personas"): + _build_with_narrative(bad_narrative) + + +def test_empty_channels_raises() -> None: + import dataclasses + + narrative = _base_narrative() + bad_gtm = dataclasses.replace(narrative.gtm_motion, channels=()) + bad_narrative = dataclasses.replace(narrative, gtm_motion=bad_gtm) + with pytest.raises(InvalidConfigError, match="channels"): + _build_with_narrative(bad_narrative) + + +def test_channel_weights_zero_shares_falls_back_to_uniform() -> None: + """If all GTM shares are 0, _channel_weights should return uniform weights.""" + narrative = _base_narrative() + import dataclasses + + bad_gtm = dataclasses.replace( + narrative.gtm_motion, + inbound_share=0.0, + outbound_share=0.0, + partner_share=0.0, + ) + bad_narrative = dataclasses.replace(narrative, gtm_motion=bad_gtm) + channels, weights = _channel_weights(bad_narrative) + assert len(channels) == len(weights) + assert all(w > 0 for w in weights) + assert abs(sum(weights) - 1.0) < 1e-9 + expected = 1.0 / len(channels) + assert all(abs(w - expected) < 1e-9 for w in weights)