diff --git a/.agent-plan.md b/.agent-plan.md index b52a85e..235747f 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -148,6 +148,15 @@ Documentation + CI: - [x] `leadforge/render/snapshots.py` — added comment documenting the feature-label temporal mismatch when `label_window_days < horizon_days` (features aggregate over full horizon; label uses shorter window) - [x] 7 new tests: default-90 unchanged, shorter window fewer conversions, 1-day window zero conversions, late conversions excluded (with day-offset verification), conversion_timestamp still set outside window, event counts unchanged by window, bundle round-trip integration; total 788 passing +### Fix: direct conversion bypass for pre-SQL leads (PR #45, closes #44) + +- [x] `leadforge/simulation/engine.py` — added `_DIRECT_CONVERSION_STAGES` and `_DIRECT_CONVERSION_DISCOUNT` (0.01) constants; pre-SQL leads (`mql`, `sal`) now have a small daily probability of converting directly, bypassing the full funnel +- [x] `leadforge/simulation/engine.py` — fixed `is_sql` post-sim computation to use `state.sql_day is not None` instead of also checking `current_stage in _SQL_OR_DEEPER` (which incorrectly flagged direct-converted leads as SQL due to `closed_won` being in that set) +- [x] `leadforge/simulation/engine.py` — extended opportunity/customer/subscription creation to handle direct-converted non-SQL leads +- [x] 5 new tests: non-SQL leads convert, non-SQL rate much lower than SQL rate (>5x), determinism, opportunity for direct-converted leads, customer+subscription for direct-converted leads +- [x] 2 existing tests updated: `test_most_converted_leads_are_sql` (was `test_converted_leads_also_sql`), `test_non_sql_non_converted_leads_no_opportunity` (was `test_non_sql_leads_no_opportunity`) +- [x] All 793 tests pass; lint + format clean + ### Generalize dataset card prose (PR #42, closes #38) - [x] `leadforge/schema/tasks.py` — `TaskManifest.description` rewritten for dataset-card use; `task_manifest_for_config()` produces task-specific prose (conversion-specific for default task, generic for others) @@ -208,7 +217,7 @@ Documentation + CI: - Plugin architecture - External-API enrichment - Web UI or dashboard -- Engine fix: `is_sql=False` → never converts (deterministic invariant) +- ~~Engine fix: `is_sql=False` → never converts (deterministic invariant)~~ — **Fixed** (PR #45, closes #44): added rare direct conversion path for pre-SQL leads --- diff --git a/leadforge/simulation/engine.py b/leadforge/simulation/engine.py index 61e434c..86473d5 100644 --- a/leadforge/simulation/engine.py +++ b/leadforge/simulation/engine.py @@ -18,6 +18,10 @@ - Stage advancement is driven by :class:`~leadforge.mechanisms.transitions.HazardTransition` (mql → … → negotiation); final conversion is driven by :class:`~leadforge.mechanisms.hazards.ConversionHazard` (negotiation → closed_won). +- A rare **direct conversion** path allows pre-SQL leads (``mql``, ``sal``) to + convert with a heavily discounted daily probability + (``_DIRECT_CONVERSION_DISCOUNT`` × normal hazard rate), breaking the + deterministic ``is_sql=False → never converts`` invariant. - A small daily churn probability independently moves any non-terminal lead to ``closed_lost``. @@ -49,6 +53,7 @@ from leadforge.core.models import GenerationConfig from leadforge.core.rng import RNGRoot from leadforge.mechanisms.base import MechanismContext +from leadforge.mechanisms.hazards import ConversionHazard from leadforge.mechanisms.policies import assign_mechanisms from leadforge.mechanisms.transitions import StageSequence from leadforge.schema.entities import ( @@ -71,6 +76,16 @@ # Daily churn probability from any active stage. _DAILY_CHURN_RATE = 0.004 +# Pre-SQL stages eligible for a rare direct conversion (bypassing the full +# funnel). The daily probability is the standard ConversionHazard rate +# multiplied by ``_DIRECT_CONVERSION_DISCOUNT``. +_DIRECT_CONVERSION_STAGES = frozenset({"mql", "sal"}) + +# Discount factor applied to the ConversionHazard daily probability for +# leads at pre-SQL stages. Keeps the full funnel relevant while breaking +# the deterministic ``is_sql=False → never converts`` invariant. +_DIRECT_CONVERSION_DISCOUNT = 0.01 + # Funnel stages that imply meaningful sales engagement → opportunity creation. _SQL_OR_DEEPER = frozenset( { @@ -184,6 +199,11 @@ def simulate_world( mechanisms = assign_mechanisms( world_graph.motif_family, mech_rng, latent_touch_intensity=latent_touch_intensity ) + # Narrow type for direct conversion path (daily_probability is on + # ConversionHazard, not the Mechanism ABC). + if not isinstance(mechanisms.conversion_hazard, ConversionHazard): + raise TypeError("conversion_hazard must be a ConversionHazard instance") + conversion_hazard: ConversionHazard = mechanisms.conversion_hazard stage_seq = StageSequence() # Build lookup indexes. @@ -251,7 +271,7 @@ def simulate_world( # -- 2. Stage advance or conversion check (transition stream) - if state.current_stage == "negotiation": # Final close: ConversionHazard decides closed_won. - if mechanisms.conversion_hazard.sample(ctx, transition_rng): + if conversion_hazard.sample(ctx, transition_rng): state.mark_converted(t) # Fall through to emit events on conversion day. else: @@ -261,6 +281,17 @@ def simulate_world( if next_s is not None: state.advance_stage(next_s, t) + # -- 2b. Direct conversion for pre-SQL leads (rare bypass) ----- + # Breaks the ``is_sql=False → never converts`` deterministic + # invariant by giving pre-SQL leads a small, latent-modulated + # daily probability of converting without reaching negotiation. + if not state.is_terminal and state.current_stage in _DIRECT_CONVERSION_STAGES: + direct_p = ( + conversion_hazard.daily_probability(ctx.latents) * _DIRECT_CONVERSION_DISCOUNT + ) + if transition_rng.random() < direct_p: + state.mark_converted(t) + # -- 3. Touches (event stream) -------------------------------- event_date = (lead_dates[lead.lead_id] + timedelta(days=t)).isoformat() @@ -334,7 +365,13 @@ def simulate_world( state = states[lead.lead_id] lead_date = lead_dates[lead.lead_id] - is_sql = state.sql_day is not None or state.current_stage in _SQL_OR_DEEPER + # A lead "is SQL" if it reached the SQL qualification gate through + # normal funnel progression. Direct-converted leads (mql/sal → + # closed_won) skip SQL and should NOT be flagged as SQL. + # Note: sql_day is set by advance_stage() when the lead enters any + # stage in _SQL_OR_DEEPER, and also at initialisation for leads that + # start at sql+ stages. It is NOT set by mark_converted(). + is_sql = state.sql_day is not None conv_ts: str | None = None if state.converted and state.conversion_day is not None: conv_ts = (lead_date + timedelta(days=state.conversion_day)).isoformat() @@ -366,11 +403,14 @@ def simulate_world( ) ) - # Opportunity: created when lead first reached sql or deeper. - if is_sql: + # Opportunity: created when lead first reached sql or deeper, + # or when a lead converted directly from a pre-SQL stage. + if is_sql or state.converted: opp_ctr += 1 opp_id = make_id(ID_PREFIXES["opportunity"], opp_ctr) - opp_day = state.sql_day if state.sql_day is not None else 0 + # For direct-converted leads (no SQL stage), anchor the + # opportunity at the conversion day instead. + opp_day = state.sql_day if state.sql_day is not None else (state.conversion_day or 0) opp_created_at = (lead_date + timedelta(days=opp_day)).isoformat() close_outcome: str | None = None diff --git a/tests/simulation/test_engine.py b/tests/simulation/test_engine.py index 316dbb9..aee1b3b 100644 --- a/tests/simulation/test_engine.py +++ b/tests/simulation/test_engine.py @@ -218,11 +218,14 @@ def test_sql_leads_are_flagged(self) -> None: }: assert lead.is_sql - def test_converted_leads_also_sql(self) -> None: - result = _run_sim(n_leads=100) - for lead in result.leads: - if lead.converted_within_90_days: - assert lead.is_sql + def test_most_converted_leads_are_sql(self) -> None: + """Most converted leads should have reached SQL, but direct conversion + allows some non-SQL leads to convert too.""" + result = _run_sim(n_leads=500, seed=42) + converted = [lead for lead in result.leads if lead.converted_within_90_days] + sql_converted = [lead for lead in converted if lead.is_sql] + # The vast majority of conversions should still go through SQL. + assert len(sql_converted) / len(converted) > 0.8 # --------------------------------------------------------------------------- @@ -238,11 +241,13 @@ def test_sql_leads_have_opportunity(self) -> None: if lead.is_sql: assert lead.lead_id in opp_lead_ids - def test_non_sql_leads_no_opportunity(self) -> None: + def test_non_sql_non_converted_leads_no_opportunity(self) -> None: + """Non-SQL leads that did NOT convert should have no opportunity. + Direct-converted non-SQL leads get an opportunity at conversion time.""" result = _run_sim(n_leads=100) opp_lead_ids = {o.lead_id for o in result.opportunities} for lead in result.leads: - if not lead.is_sql: + if not lead.is_sql and not lead.converted_within_90_days: assert lead.lead_id not in opp_lead_ids def test_opportunity_acv_positive(self) -> None: @@ -393,6 +398,77 @@ def test_enterprise(self) -> None: assert _plan_from_acv(200_000) == "enterprise" +# --------------------------------------------------------------------------- +# Direct conversion (pre-SQL bypass) +# --------------------------------------------------------------------------- + + +class TestDirectConversion: + """Verify the rare direct-conversion path for pre-SQL leads.""" + + def test_some_non_sql_leads_convert(self) -> None: + """With enough leads, at least one non-SQL lead should convert.""" + result = _run_sim(seed=42, n_leads=2000) + non_sql_converted = [ + lead for lead in result.leads if lead.converted_within_90_days and not lead.is_sql + ] + assert len(non_sql_converted) > 0, ( + "Expected at least one non-SQL conversion in 2000-lead sim" + ) + + def test_non_sql_conversion_rate_much_lower(self) -> None: + """Non-SQL conversion rate should be significantly lower than SQL.""" + result = _run_sim(seed=42, n_leads=2000) + sql_leads = [lead for lead in result.leads if lead.is_sql] + non_sql_leads = [lead for lead in result.leads if not lead.is_sql] + assert len(sql_leads) > 0 + assert len(non_sql_leads) > 0 + + sql_rate = sum(lead.converted_within_90_days for lead in sql_leads) / len(sql_leads) + non_sql_rate = sum(lead.converted_within_90_days for lead in non_sql_leads) / len( + non_sql_leads + ) + # Non-SQL rate should be at least 5x lower than SQL rate. + assert non_sql_rate < sql_rate / 5, ( + f"Non-SQL rate {non_sql_rate:.4f} not much lower than SQL rate {sql_rate:.4f}" + ) + + def test_direct_conversion_deterministic(self) -> None: + """Direct conversion path preserves full determinism.""" + r1 = _run_sim(seed=77, n_leads=500) + r2 = _run_sim(seed=77, n_leads=500) + labels1 = [row.converted_within_90_days for row in r1.leads] + labels2 = [row.converted_within_90_days for row in r2.leads] + assert labels1 == labels2 + + def test_direct_converted_lead_has_opportunity(self) -> None: + """A direct-converted non-SQL lead should still get an opportunity row.""" + result = _run_sim(seed=42, n_leads=2000) + opp_lead_ids = {o.lead_id for o in result.opportunities} + non_sql_converted = [ + lead for lead in result.leads if lead.converted_within_90_days and not lead.is_sql + ] + for lead in non_sql_converted: + assert lead.lead_id in opp_lead_ids + + def test_direct_converted_lead_has_customer_and_subscription(self) -> None: + """A direct-converted non-SQL lead should get customer + subscription rows.""" + result = _run_sim(seed=42, n_leads=2000) + cust_opp_ids = {c.opportunity_id for c in result.customers} + sub_cust_ids = {s.customer_id for s in result.subscriptions} + opp_by_lead = {o.lead_id: o for o in result.opportunities} + cust_by_opp = {c.opportunity_id: c for c in result.customers} + non_sql_converted = [ + lead for lead in result.leads if lead.converted_within_90_days and not lead.is_sql + ] + for lead in non_sql_converted: + opp = opp_by_lead.get(lead.lead_id) + assert opp is not None, f"No opportunity for direct-converted lead {lead.lead_id}" + assert opp.opportunity_id in cust_opp_ids + cust = cust_by_opp[opp.opportunity_id] + assert cust.customer_id in sub_cust_ids + + # --------------------------------------------------------------------------- # label_window_days affects label derivation # ---------------------------------------------------------------------------