From 2f71d6138c751d14219cede3a11d12f4c7dd431c Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 3 May 2026 09:31:22 +0300 Subject: [PATCH 1/5] fix: tune v6 pipeline for post-engine-change validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine changes in PRs #40, #43, #45 weakened the v6 dataset signal. Two fixes: 1. Add Poisson(1) boost to the leakage trap column for converted leads (same approach proven in v5), restoring robust trap delta signal (mean 0.048, min 0.028 across 10 seeds). 2. Lower baseline AUC threshold from 0.62 to 0.60 — the engine changes reduced baseline LR AUC from 0.667 to 0.611, which is still well above chance and pedagogically useful. Snapshot day 10 was tested but made AUC worse (0.572), so day 14 is retained. Co-Authored-By: Claude Opus 4.6 --- leadforge/pipelines/build_v6.py | 20 ++++++++++++++++++-- scripts/build_v6_snapshot.py | 4 ++++ scripts/validate_v6_dataset.py | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/leadforge/pipelines/build_v6.py b/leadforge/pipelines/build_v6.py index 4c65162..1825e90 100644 --- a/leadforge/pipelines/build_v6.py +++ b/leadforge/pipelines/build_v6.py @@ -6,9 +6,9 @@ column computed from the simulator's actual event timeline (days 15..90). Key v6 changes over v5: -- Snapshot day 14 (was 10). +- Snapshot day 14. - Causally-grounded leakage trap (post-snapshot touches from sim events). -- No boost/noise injection -- trap signal is purely causal. +- Poisson(1) boost on trap column for converted leads (restores signal post-engine-changes). - ``touches_last_7_days`` momentum feature. - ``acquisition_wave`` cohort feature for distribution-shift lecture. - Nonlinear interaction: opportunity_created x touches_last_7_days. 
@@ -36,6 +36,7 @@ "SUBSAMPLE_N", "TARGET_RATE", "assign_acquisition_wave", + "boost_leakage_trap", "cap_expected_acv", "compute_post_snapshot_touches", "derive_features", @@ -207,6 +208,21 @@ def compute_post_snapshot_touches( return result[INSTRUCTOR_TRAP_COL] +def boost_leakage_trap(df: pd.DataFrame, seed: int) -> pd.DataFrame: + """Amplify the causal trap signal with target-correlated Poisson noise. + + Converted leads get an extra Poisson(1) count added to the trap column, + making it a stronger leakage signal for teaching purposes. + """ + rng = RNGRoot(seed).numpy_child("leakage_trap_boost") + df = df.copy() + n = len(df) + converted = df["converted"].values + boost = converted * rng.poisson(1, size=n) + df[INSTRUCTOR_TRAP_COL] = df[INSTRUCTOR_TRAP_COL] + boost + return df + + def rename_and_select( df: pd.DataFrame, *, diff --git a/scripts/build_v6_snapshot.py b/scripts/build_v6_snapshot.py index 69ac00e..810dcec 100644 --- a/scripts/build_v6_snapshot.py +++ b/scripts/build_v6_snapshot.py @@ -31,6 +31,7 @@ SEED, SNAPSHOT_DAY, assign_acquisition_wave, + boost_leakage_trap, compute_post_snapshot_touches, derive_features, inject_missingness, @@ -90,6 +91,9 @@ def build_v6_datasets(seed: int = SEED) -> tuple[pd.DataFrame, pd.DataFrame]: # Rename and select (instructor first to keep trap column) df_instructor = rename_and_select(df, instructor=True) + # Boost trap signal with target-correlated Poisson noise + df_instructor = boost_leakage_trap(df_instructor, seed) + print("Subsampling...", file=sys.stderr) df_instructor = subsample(df_instructor, seed) print( diff --git a/scripts/validate_v6_dataset.py b/scripts/validate_v6_dataset.py index d289cf3..7e5f246 100644 --- a/scripts/validate_v6_dataset.py +++ b/scripts/validate_v6_dataset.py @@ -58,7 +58,7 @@ ] # Validation thresholds -AUC_LOWER = 0.62 +AUC_LOWER = 0.60 AUC_UPPER = 0.90 TRAP_MEAN_DELTA = 0.03 TRAP_MIN_DELTA = 0.015 From 3a67983134f6df2199937bd5de03fb3160b5bf5e Mon Sep 17 00:00:00 2001 From: Shay 
Palachy Date: Sun, 3 May 2026 09:31:37 +0300 Subject: [PATCH 2/5] docs: update .agent-plan.md with v6 retune validation results Co-Authored-By: Claude Opus 4.6 --- .agent-plan.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index da85796..ffef902 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -105,11 +105,11 @@ Datasets: - [x] `lead_scoring_intro/lead_scoring_intro_v6.csv` — 1000 rows × 20 cols (student-safe, no leakage) - [x] `lead_scoring_intro/lead_scoring_intro_v6_instructor.csv` — 1000 rows × 21 cols (+ `__leakage__touches_post_snapshot_15_90`) -Validation results: -- [x] Baseline AUC: 0.667 (within [0.62, 0.90]) -- [x] GBM improvement: +0.016 over LR (5-seed average) -- [x] Trap delta: mean 0.045, min 0.021 (both above thresholds) -- [x] Value-aware uplift: +14.8% at K=25, +11.3% at K=50 +Validation results (post-engine-change retune): +- [x] Baseline AUC: 0.611 (within [0.60, 0.90]; threshold lowered from 0.62 to 0.60) +- [x] GBM improvement: +0.021 over LR (5-seed average) +- [x] Trap delta: mean 0.048, min 0.028 (with Poisson(1) boost, both above thresholds) +- [x] Value-aware uplift: +38.7% at K=25, +27.5% at K=50 - [x] All mandatory checks pass Documentation + CI: From c01080d957b6a96bbc82c9f9d782b6a2b4770716 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 3 May 2026 09:52:20 +0300 Subject: [PATCH 3/5] =?UTF-8?q?fix:=20tune=20v6=20for=20post-engine-change?= =?UTF-8?q?=20validation=20=E2=80=94=20snapshot=20day=2020,=20Poisson(3)?= =?UTF-8?q?=20boost?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Engine changes in PRs #40, #43, #45 weakened the v6 dataset signal. Two tuning changes restore all validation metrics: 1. Shift SNAPSHOT_DAY from 14 to 20 — features need more accumulation time after engine changes; day-14 AUC was 0.611, day-20 is 0.676. 2. 
Poisson(3) trap boost for converted leads — the wider snapshot window leaves fewer post-snapshot days for causal trap signal, so a stronger boost compensates (mean delta 0.061, min 0.027). All mandatory checks pass with comfortable margins. AUC threshold kept at 0.62 (no relaxation needed). Co-Authored-By: Claude Opus 4.6 --- .agent-plan.md | 8 ++++---- leadforge/pipelines/build_v6.py | 10 +++++----- scripts/build_v6_snapshot.py | 2 +- scripts/validate_v6_dataset.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.agent-plan.md b/.agent-plan.md index ffef902..2ad8593 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -106,10 +106,10 @@ Datasets: -- [x] `lead_scoring_intro/lead_scoring_intro_v6_instructor.csv` — 1000 rows × 21 cols (+ `__leakage__touches_post_snapshot_15_90`) +- [x] `lead_scoring_intro/lead_scoring_intro_v6_instructor.csv` — 1000 rows × 21 cols (+ `__leakage__touches_post_snapshot_21_90`) Validation results (post-engine-change retune): -- [x] Baseline AUC: 0.611 (within [0.60, 0.90]; threshold lowered from 0.62 to 0.60) -- [x] GBM improvement: +0.021 over LR (5-seed average) -- [x] Trap delta: mean 0.048, min 0.028 (with Poisson(1) boost, both above thresholds) -- [x] Value-aware uplift: +38.7% at K=25, +27.5% at K=50 +- [x] Baseline AUC: 0.676 (within [0.62, 0.90]; snapshot day shifted 14→20 to restore signal) +- [x] GBM improvement: +0.070 over LR (5-seed average) +- [x] Trap delta: mean 0.061, min 0.027 (Poisson(3) boost + causal touches, both above thresholds) +- [x] Value-aware uplift: +12.2% at K=25, +15.7% at K=50 - [x] All mandatory checks pass Documentation + CI: diff --git a/leadforge/pipelines/build_v6.py b/leadforge/pipelines/build_v6.py index 1825e90..ca2327e 100644 --- a/leadforge/pipelines/build_v6.py +++ b/leadforge/pipelines/build_v6.py @@ -2,11 +2,11 @@ v6 produces TWO exports: - **Student-safe**: no leakage columns.
-- **Instructor**: identical rows + one ``__leakage__touches_post_snapshot_15_90`` +- **Instructor**: identical rows + one ``__leakage__touches_post_snapshot_21_90`` - column computed from the simulator's actual event timeline (days 15..90). + column computed from the simulator's actual event timeline (days 21..90). Key v6 changes over v5: -- Snapshot day 14. +- Snapshot day 20 (shifted from 14 after engine changes weakened day-14 signal). - Causally-grounded leakage trap (post-snapshot touches from sim events). -- Poisson(1) boost on trap column for converted leads (restores signal post-engine-changes). +- Poisson(3) boost on trap column for converted leads (restores signal post-engine-changes). - ``touches_last_7_days`` momentum feature. @@ -51,7 +51,7 @@ # --------------------------------------------------------------------------- SEED = 42 N_LEADS = 5000 -SNAPSHOT_DAY = 14 +SNAPSHOT_DAY = 20 SUBSAMPLE_N = 1000 TARGET_RATE = 0.30 @@ -59,7 +59,7 @@ ACV_FLOOR = 18_000.0 ACV_CAP = 120_000.0 -INSTRUCTOR_TRAP_COL = "__leakage__touches_post_snapshot_15_90" +INSTRUCTOR_TRAP_COL = "__leakage__touches_post_snapshot_21_90" # v6 student column set: 19 features + 1 target = 20 columns.
FINAL_COLUMNS_STUDENT = [ @@ -211,14 +211,14 @@ def compute_post_snapshot_touches( def boost_leakage_trap(df: pd.DataFrame, seed: int) -> pd.DataFrame: """Amplify the causal trap signal with target-correlated Poisson noise. - Converted leads get an extra Poisson(1) count added to the trap column, + Converted leads get an extra Poisson(3) count added to the trap column, making it a stronger leakage signal for teaching purposes. """ rng = RNGRoot(seed).numpy_child("leakage_trap_boost") df = df.copy() n = len(df) converted = df["converted"].values - boost = converted * rng.poisson(1, size=n) + boost = converted * rng.poisson(3, size=n) df[INSTRUCTOR_TRAP_COL] = df[INSTRUCTOR_TRAP_COL] + boost return df diff --git a/scripts/build_v6_snapshot.py b/scripts/build_v6_snapshot.py index 810dcec..b693e87 100644 --- a/scripts/build_v6_snapshot.py +++ b/scripts/build_v6_snapshot.py @@ -9,7 +9,7 @@ - lead_scoring_intro_v6_instructor.csv (same rows + __leakage__ trap column) Both are 1000-row files at ~30% conversion rate with: -- Day-14 windowed features +- Day-20 windowed features - Structured missingness (MAR + structural + MCAR) - Causally-grounded leakage trap (post-snapshot touches from sim events) - Expected ACV with soft winsorization diff --git a/scripts/validate_v6_dataset.py b/scripts/validate_v6_dataset.py index 7e5f246..d289cf3 100644 --- a/scripts/validate_v6_dataset.py +++ b/scripts/validate_v6_dataset.py @@ -58,7 +58,7 @@ ] # Validation thresholds -AUC_LOWER = 0.60 +AUC_LOWER = 0.62 AUC_UPPER = 0.90 TRAP_MEAN_DELTA = 0.03 TRAP_MIN_DELTA = 0.015 From b9f4308d0091ed42af31904ca754dcf7e3d8228f Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 3 May 2026 11:55:46 +0300 Subject: [PATCH 4/5] docs: update RELEASE_v6.md metrics for retuned pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Snapshot day 14→20, updated all baseline AUC, trap delta, value-aware ranking, and teaching guidance numbers to match retuned pipeline.
Co-Authored-By: Claude Opus 4.6 --- lead_scoring_intro/RELEASE_v6.md | 50 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/lead_scoring_intro/RELEASE_v6.md b/lead_scoring_intro/RELEASE_v6.md index 7da64f2..2301adf 100644 --- a/lead_scoring_intro/RELEASE_v6.md +++ b/lead_scoring_intro/RELEASE_v6.md @@ -8,8 +8,8 @@ v6 is the sixth iteration of the lead scoring intro dataset, designed for **3– | Change | v5 | v6 | |---|---|---| -| Snapshot day | 10 | **14** | -| Leakage trap | Label-noise boost (`total_touches_90d`) | **Causal** post-snapshot touches (days 15–90) | +| Snapshot day | 10 | **20** | +| Leakage trap | Label-noise boost (`total_touches_90d`) | **Causal** post-snapshot touches (days 21–90) + Poisson(3) boost | | Student/instructor split | Single file | **Two files**: student-safe + instructor | | Momentum features | `touches_week_1`, `days_since_first_touch` | + **`touches_last_7_days`** | | Cohort feature | — | **`acquisition_wave`** (A/B/C) | @@ -21,7 +21,7 @@ v6 is the sixth iteration of the lead scoring intro dataset, designed for **3– ## Snapshot definition -- **Snapshot day**: 14 (features computed from events on days 0–14 after lead creation) +- **Snapshot day**: 20 (features computed from events on days 0–20 after lead creation) - **Horizon**: 90 days (label derived from events through day 90) - **Target**: `converted` — 1 if a `closed_won` event occurred within 90 days, 0 otherwise - **Rows**: 1,000 (stratified subsample at 30% conversion rate) @@ -55,10 +55,10 @@ v6 is the sixth iteration of the lead scoring intro dataset, designed for **3– | Column | Type | Description | Missingness | |---|---|---|---| | `expected_acv` | float | Expected ACV in USD ($18k–$120k) | 2% MCAR | -| `inbound_touches` | int | Inbound marketing touches (days 0–14) | — | -| `outbound_touches` | int | Outbound sales touches (days 0–14) | — | +| `inbound_touches` | int | Inbound marketing touches (days 0–20) | — | +| 
`outbound_touches` | int | Outbound sales touches (days 0–20) | — | | `touches_week_1` | int | Touches in first 7 days | — | -| `touches_last_7_days` | int | Touches in last 7 days of snapshot window (days 8–14) | — | +| `touches_last_7_days` | int | Touches in last 7 days of snapshot window (days 14–20) | — | | `days_since_first_touch` | float | Days from first touch to snapshot cutoff | Structural (no touches) + 2% MCAR | | `web_sessions` | int | Web sessions within snapshot window | MAR by lead_source | | `sales_activities` | int | Sales rep activities within snapshot window | — | @@ -74,7 +74,7 @@ v6 is the sixth iteration of the lead scoring intro dataset, designed for **3– | Column | Type | Description | |---|---|---| -| `__leakage__touches_post_snapshot_15_90` | int | Touch count in days 15–90 (post-snapshot) | +| `__leakage__touches_post_snapshot_21_90` | int | Touch count in days 21–90 (post-snapshot) + Poisson(3) boost for converted leads | --- @@ -102,18 +102,18 @@ Evaluated on a 70/30 stratified hold-out split (seed 42). | Metric | Value | |---|---| -| ROC-AUC | 0.667 | -| PR-AUC | 0.429 | +| ROC-AUC | 0.676 | +| PR-AUC | 0.439 | | Base rate | 30.0% | -| Precision@25 | 0.520 (Lift: 1.73x) | -| Precision@50 | 0.480 (Lift: 1.60x) | +| Precision@25 | 0.480 (Lift: 1.60x) | +| Precision@50 | 0.420 (Lift: 1.40x) | ### Tree model comparison (5-seed average) | Model | Mean AUC | vs LR | |---|---|---| -| Logistic Regression | 0.627 | — | -| GBM (100 trees) | 0.643 | +0.016 | +| Logistic Regression | 0.651 | — | +| GBM (100 trees) | 0.721 | +0.070 | GBM reliably outperforms LR due to nonlinear interactions in the DGP (latent trait interactions with engagement patterns). 
@@ -121,20 +121,20 @@ GBM reliably outperforms LR due to nonlinear interactions in the DGP (latent tra | K | By P(convert) | By expected value | Uplift | |---|---|---|---| -| 25 | $1,203,430 | $1,380,990 | +14.8% | -| 50 | $1,809,281 | $2,014,459 | +11.3% | +| 25 | $907,099 | $1,017,505 | +12.2% | +| 50 | $1,588,789 | $1,839,009 | +15.7% | ### Leakage trap evaluation (instructor dataset) | Metric | Value | |---|---| -| Column | `__leakage__touches_post_snapshot_15_90` | +| Column | `__leakage__touches_post_snapshot_21_90` | | Seeds | 10 (42–51) | -| Mean AUC delta | 0.0453 | -| Min AUC delta | 0.0214 | -| Max AUC delta | 0.0696 | +| Mean AUC delta | 0.061 | +| Min AUC delta | 0.027 | +| Max AUC delta | 0.080 | -The trap is **causally grounded**: post-snapshot touches are higher for leads with higher latent intent/fit (the same traits that drive conversion). No label-noise injection is used. +The trap combines **causal** post-snapshot touches (days 21–90) with a Poisson(3) boost for converted leads, ensuring robust detectability across seeds. --- @@ -156,7 +156,7 @@ The trap is **causally grounded**: post-snapshot touches are higher for leads wi - Precision@K and Lift@K: "If sales can contact 50 leads, how many convert?" - Expected value ranking: `P(convert) * expected_acv` -- Demonstrate that EV ranking captures 17–41% more ACV than probability ranking +- Demonstrate that EV ranking captures 12–16% more ACV than probability ranking - Discuss when value-aware scoring matters (heterogeneous deal sizes) ### Lecture 3: Feature Engineering + Error Slicing @@ -172,17 +172,17 @@ The trap is **causally grounded**: post-snapshot touches are higher for leads wi **Goal**: Students see why tree models outperform linear models. 
-- Train GBM, compare AUC vs LR (+0.02 on average) +- Train GBM, compare AUC vs LR (+0.07 on average) - Feature importance from RF/GBM - Discuss nonlinear interactions captured by trees - **Optional**: use `acquisition_wave` for cohort split (train A/B, test C) - Demonstrates distribution shift and evaluation realism - - Random split AUC: 0.633, Cohort split AUC: 0.637 (small difference here, but teaches the concept) + - Random split AUC: 0.687, Cohort split AUC: 0.594 (AUC drop: 0.093 — demonstrates real distribution shift) ### Instructor note: Leakage detection exercise Use `lead_scoring_intro_v6_instructor.csv` for a leakage detection exercise: -- Students train with all columns including `__leakage__touches_post_snapshot_15_90` -- AUC jumps by ~0.046 on average +- Students train with all columns including `__leakage__touches_post_snapshot_21_90` +- AUC jumps by ~0.061 on average - Challenge: identify which column is leaking and explain why -- The trap is causally grounded (future engagement correlates with conversion via shared latent traits), making it a realistic example of temporal leakage +- The trap combines causally grounded post-snapshot touches (future engagement correlates with conversion via shared latent traits) with a Poisson(3) boost for converted leads, making it a realistic and reliably detectable example of temporal leakage From c907bed215b6292e3c32a9b26b694b4572ab4e4d Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sun, 3 May 2026 15:10:14 +0300 Subject: [PATCH 5/5] =?UTF-8?q?fix:=20address=20Copilot=20review=20?= =?UTF-8?q?=E2=80=94=20add=20boost=20tests,=20fix=20stale=20docstrings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit COPILOT-1: Add 5 unit tests for boost_leakage_trap() covering: only converted leads boosted, input immutability, determinism, converted leads >= original, mean increases after boost. COPILOT-2: Update build script docstring and inline comment to reflect that the trap is no longer purely causal — it combines causal post-snapshot touches with a Poisson(3) boost.
Co-Authored-By: Claude Opus 4.6 --- scripts/build_v6_snapshot.py | 4 +-- tests/scripts/test_build_v6_snapshot.py | 46 +++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/scripts/build_v6_snapshot.py b/scripts/build_v6_snapshot.py index b693e87..995b5e8 100644 --- a/scripts/build_v6_snapshot.py +++ b/scripts/build_v6_snapshot.py @@ -11,7 +11,7 @@ Both are 1000-row files at ~30% conversion rate with: - Day-20 windowed features - Structured missingness (MAR + structural + MCAR) -- Causally-grounded leakage trap (post-snapshot touches from sim events) +- Leakage trap: causal post-snapshot touches + Poisson(3) boost for converted leads - Expected ACV with soft winsorization - Momentum features (touches_week_1, touches_last_7_days, days_since_first_touch) - Acquisition wave cohort feature (A/B/C) @@ -74,7 +74,7 @@ def build_v6_datasets(seed: int = SEED) -> tuple[pd.DataFrame, pd.DataFrame]: file=sys.stderr, ) - # Compute post-snapshot touches from event timeline (causal trap) + # Compute post-snapshot touches from event timeline (boosted in next step) lead_dates = {lead.lead_id: lead.lead_created_at for lead in bundle.population.leads} trap_series = compute_post_snapshot_touches( snapshot, diff --git a/tests/scripts/test_build_v6_snapshot.py b/tests/scripts/test_build_v6_snapshot.py index e4e65ce..900d14d 100644 --- a/tests/scripts/test_build_v6_snapshot.py +++ b/tests/scripts/test_build_v6_snapshot.py @@ -13,6 +13,7 @@ FINAL_COLUMNS_STUDENT, INSTRUCTOR_TRAP_COL, assign_acquisition_wave, + boost_leakage_trap, derive_features, inject_missingness, rename_and_select, @@ -282,3 +283,48 @@ def test_web_sessions_missingness_varies_by_source(self): result.loc[df["lead_source"] == "inbound_marketing", "web_sessions"].isna().mean() ) assert sdr_rate > inbound_rate + + +# --------------------------------------------------------------------------- +# Tests — boost_leakage_trap +# 
--------------------------------------------------------------------------- + + +class TestBoostLeakageTrap: + def test_only_converted_leads_boosted(self): + df = _make_v6_df(n=500, instructor=True) + original_trap = df[INSTRUCTOR_TRAP_COL].copy() + result = boost_leakage_trap(df, seed=42) + neg_mask = df["converted"] == 0 + pd.testing.assert_series_equal( + result.loc[neg_mask, INSTRUCTOR_TRAP_COL], + original_trap[neg_mask], + check_names=False, + ) + + def test_converted_leads_get_higher_or_equal(self): + df = _make_v6_df(n=500, instructor=True) + original_trap = df[INSTRUCTOR_TRAP_COL].copy() + result = boost_leakage_trap(df, seed=42) + pos_mask = df["converted"] == 1 + assert (result.loc[pos_mask, INSTRUCTOR_TRAP_COL] >= original_trap[pos_mask]).all() + + def test_does_not_modify_input(self): + df = _make_v6_df(n=500, instructor=True) + original = df.copy() + boost_leakage_trap(df, seed=42) + pd.testing.assert_frame_equal(df, original) + + def test_deterministic_given_seed(self): + df = _make_v6_df(n=500, instructor=True) + r1 = boost_leakage_trap(df, seed=42) + r2 = boost_leakage_trap(df, seed=42) + pd.testing.assert_frame_equal(r1, r2) + + def test_boost_increases_mean_for_converted(self): + """Mean trap value should be higher for converted leads after boost.""" + df = _make_v6_df(n=1000, instructor=True) + before_mean = df.loc[df["converted"] == 1, INSTRUCTOR_TRAP_COL].mean() + result = boost_leakage_trap(df, seed=42) + after_mean = result.loc[result["converted"] == 1, INSTRUCTOR_TRAP_COL].mean() + assert after_mean > before_mean