From a3abb1f76a17597bae02703ef81545bcbe58c796 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Thu, 30 Apr 2026 19:44:03 +0300
Subject: [PATCH 1/2] test: add 37 tests for build pipeline scripts
 (build_v5_snapshot, validate CLI)

Closes #23. Adds unit tests for all pipeline functions in
scripts/build_v5_snapshot.py and CLI integration tests for
scripts/validate_lead_scoring_dataset.py.

Coverage: subsample edge cases (insufficient positives/negatives),
inject_missingness rate bounds and source-conditional variation,
derive_binary_features, cap_expected_acv, rename_and_select,
boost_leakage_trap, and 6 CLI entrypoint tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .agent-plan.md                          |   6 +
 tests/scripts/__init__.py               |   0
 tests/scripts/test_build_v5_snapshot.py | 374 ++++++++++++++++++++++++
 tests/scripts/test_validate_cli.py      | 180 ++++++++++++
 4 files changed, 560 insertions(+)
 create mode 100644 tests/scripts/__init__.py
 create mode 100644 tests/scripts/test_build_v5_snapshot.py
 create mode 100644 tests/scripts/test_validate_cli.py

diff --git a/.agent-plan.md b/.agent-plan.md
index c03e439..fd6a800 100644
--- a/.agent-plan.md
+++ b/.agent-plan.md
@@ -72,6 +72,12 @@ No engine changes required — v5 is a build pipeline + validation improvement.
 - [x] v5 dataset regenerated (snapshot day 10, trap boost) — all checks pass, exit code 0
 - [x] `RELEASE_v5.md` updated with canonical pipeline metrics
 
+### Build pipeline script tests (PR #28)
+
+- [x] `tests/scripts/test_build_v5_snapshot.py` — 31 tests covering `derive_binary_features()`, `cap_expected_acv()`, `rename_and_select()`, `subsample()`, `inject_missingness()`, `boost_leakage_trap()`
+- [x] `tests/scripts/test_validate_cli.py` — 6 tests covering CLI entrypoint (exit codes, `--out-json`, `--emit-release-snippet`, `--enforce-1000`, missing args)
+- [x] Tests cover edge cases: insufficient positives/negatives in subsampling, missingness rate bounds, source-conditional missingness variation, immutability of inputs, determinism given seed
+
 ---
 
 ## Deferred Items
diff --git a/tests/scripts/__init__.py b/tests/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/scripts/test_build_v5_snapshot.py b/tests/scripts/test_build_v5_snapshot.py
new file mode 100644
index 0000000..25c8e71
--- /dev/null
+++ b/tests/scripts/test_build_v5_snapshot.py
@@ -0,0 +1,374 @@
+"""Tests for scripts/build_v5_snapshot.py pipeline functions."""
+
+from __future__ import annotations
+
+import importlib.util
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+# ---------------------------------------------------------------------------
+# Import the script module (not in a package, so use importlib)
+# ---------------------------------------------------------------------------
+_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "build_v5_snapshot.py"
+
+spec = importlib.util.spec_from_file_location("build_v5_snapshot", _SCRIPT_PATH)
+assert spec is not None
+assert spec.loader is not None
+build_v5 = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(build_v5)
+
+# Re-export for convenience
+subsample = build_v5.subsample
+inject_missingness = build_v5.inject_missingness
+derive_binary_features = build_v5.derive_binary_features
+cap_expected_acv = build_v5.cap_expected_acv
+rename_and_select = build_v5.rename_and_select
+boost_leakage_trap = build_v5.boost_leakage_trap
+ACV_FLOOR = build_v5.ACV_FLOOR
+ACV_CAP = build_v5.ACV_CAP
+_FINAL_COLUMNS = build_v5._FINAL_COLUMNS
+_RENAME_MAP = build_v5._RENAME_MAP
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_snapshot(
+    n: int = 500,
+    conversion_rate: float = 0.30,
+    seed: int = 42,
+) -> pd.DataFrame:
+    """Build a minimal snapshot DataFrame that looks like build_snapshot() output.
+
+    Contains the pre-rename column names expected by the pipeline steps.
+    """
+    rng = np.random.RandomState(seed)
+    n_pos = int(n * conversion_rate)
+    n_neg = n - n_pos
+    converted = np.array([1] * n_pos + [0] * n_neg)
+    rng.shuffle(converted)
+
+    return pd.DataFrame(
+        {
+            "industry": rng.choice(["manufacturing", "logistics", "services"], size=n),
+            "region": rng.choice(["US", "UK", "EU"], size=n),
+            "employee_band": rng.choice(["200-499", "500-999", "1000-1999"], size=n),
+            "estimated_revenue_band": rng.choice(["$1M-$10M", "$10M-$50M", "$50M-$200M"], size=n),
+            "role_function": rng.choice(["finance", "ap_manager", "it_director"], size=n),
+            "seniority": rng.choice(
+                ["individual_contributor", "manager", "director", "vp"], size=n
+            ),
+            "lead_source": rng.choice(
+                ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n
+            ),
+            "opportunity_created": rng.choice([True, False], size=n),
+            "demo_page_views": rng.poisson(1, size=n),
+            "expected_acv": rng.uniform(5_000, 200_000, size=n).round(0),
+            "inbound_touch_count": rng.poisson(3, size=n),
+            "outbound_touch_count": rng.poisson(2, size=n),
+            "touches_week_1": rng.poisson(2, size=n),
+            "days_since_first_touch": rng.uniform(0, 14, size=n).round(1),
+            "session_count": rng.poisson(4, size=n).astype(float),
+            "activity_count": rng.poisson(3, size=n),
+            "days_since_last_touch": rng.uniform(0, 14, size=n).round(1),
+            "total_touches_all": rng.poisson(8, size=n),
+            "converted_within_90_days": converted,
+        }
+    )
+
+
+def _make_v5_df(
+    n: int = 500,
+    conversion_rate: float = 0.30,
+    seed: int = 42,
+) -> pd.DataFrame:
+    """Build a DataFrame in v5 format (post-rename, with all final columns)."""
+    snapshot = _make_snapshot(n=n, conversion_rate=conversion_rate, seed=seed)
+    df = derive_binary_features(snapshot)
+    df = cap_expected_acv(df)
+    return rename_and_select(df)
+
+
+# ---------------------------------------------------------------------------
+# Tests — derive_binary_features
+# ---------------------------------------------------------------------------
+
+
+class TestDeriveBinaryFeatures:
+    def test_opportunity_created_is_int(self):
+        snapshot = _make_snapshot()
+        result = derive_binary_features(snapshot)
+        assert result["opportunity_created"].dtype in (np.int64, np.int32, int)
+        assert set(result["opportunity_created"].unique()).issubset({0, 1})
+
+    def test_demo_completed_derived_from_page_views(self):
+        snapshot = _make_snapshot()
+        snapshot["demo_page_views"] = [0, 3, 0, 1, 0] * (len(snapshot) // 5)
+        result = derive_binary_features(snapshot)
+        expected = (snapshot["demo_page_views"] > 0).astype(int)
+        pd.testing.assert_series_equal(result["demo_completed"], expected, check_names=False)
+
+    def test_does_not_modify_input(self):
+        snapshot = _make_snapshot()
+        original = snapshot.copy()
+        derive_binary_features(snapshot)
+        pd.testing.assert_frame_equal(snapshot, original)
+
+
+# ---------------------------------------------------------------------------
+# Tests — cap_expected_acv
+# ---------------------------------------------------------------------------
+
+
+class TestCapExpectedACV:
+    def test_values_clipped_to_range(self):
+        snapshot = _make_snapshot()
+        snapshot["expected_acv"] = [1_000, 50_000, 200_000, ACV_FLOOR, ACV_CAP] * (
+            len(snapshot) // 5
+        )
+        result = cap_expected_acv(snapshot)
+        assert result["expected_acv"].min() >= ACV_FLOOR
+        assert result["expected_acv"].max() <= ACV_CAP
+
+    def test_values_within_range_unchanged(self):
+        snapshot = _make_snapshot()
+        snapshot["expected_acv"] = 50_000.0
+        result = cap_expected_acv(snapshot)
+        assert (result["expected_acv"] == 50_000.0).all()
+
+    def test_does_not_modify_input(self):
+        snapshot = _make_snapshot()
+        original = snapshot.copy()
+        cap_expected_acv(snapshot)
+        pd.testing.assert_frame_equal(snapshot, original)
+
+
+# ---------------------------------------------------------------------------
+# Tests — rename_and_select
+# ---------------------------------------------------------------------------
+
+
+class TestRenameAndSelect:
+    def test_output_columns_match_final(self):
+        df = _make_v5_df()
+        assert list(df.columns) == _FINAL_COLUMNS
+
+    def test_converted_is_int(self):
+        df = _make_v5_df()
+        assert df["converted"].dtype in (np.int64, np.int32, int)
+        assert set(df["converted"].unique()).issubset({0, 1})
+
+    def test_missing_column_raises(self):
+        snapshot = _make_snapshot()
+        snapshot = derive_binary_features(snapshot)
+        snapshot = cap_expected_acv(snapshot)
+        # Drop a required source column
+        snapshot = snapshot.drop(columns=["industry"])
+        with pytest.raises(ValueError, match="Missing required columns"):
+            rename_and_select(snapshot)
+
+    def test_rename_mapping_applied(self):
+        snapshot = _make_snapshot()
+        df = derive_binary_features(snapshot)
+        df = cap_expected_acv(df)
+        result = rename_and_select(df)
+        # All renamed columns should exist in output
+        for new_name in _RENAME_MAP.values():
+            assert new_name in result.columns
+
+
+# ---------------------------------------------------------------------------
+# Tests — subsample
+# ---------------------------------------------------------------------------
+
+
+class TestSubsample:
+    def test_output_size(self):
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        result = subsample(df, rng, n=100, target_rate=0.30)
+        assert len(result) == 100
+
+    def test_target_rate_approximate(self):
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        result = subsample(df, rng, n=200, target_rate=0.30)
+        actual_rate = result["converted"].mean()
+        assert actual_rate == pytest.approx(0.30, abs=0.01)
+
+    def test_deterministic_given_seed(self):
+        df = _make_v5_df(n=500)
+        r1 = subsample(df, np.random.RandomState(42), n=100, target_rate=0.30)
+        r2 = subsample(df, np.random.RandomState(42), n=100, target_rate=0.30)
+        pd.testing.assert_frame_equal(r1, r2)
+
+    def test_insufficient_positives(self, capsys):
+        """When fewer positives available than needed, warns and adjusts."""
+        df = _make_v5_df(n=200, conversion_rate=0.05)  # only ~10 positives
+        rng = np.random.RandomState(42)
+        result = subsample(df, rng, n=100, target_rate=0.50)  # need 50 positives
+        captured = capsys.readouterr()
+        assert "WARNING" in captured.err
+        # All available positives should be included
+        assert result["converted"].sum() <= 10
+
+    def test_insufficient_negatives(self, capsys):
+        """When fewer negatives available than needed, warns and adjusts."""
+        df = _make_v5_df(n=200, conversion_rate=0.95)  # only ~10 negatives
+        rng = np.random.RandomState(42)
+        subsample(df, rng, n=100, target_rate=0.10)  # need 90 negatives
+        captured = capsys.readouterr()
+        assert "WARNING" in captured.err
+
+    def test_index_is_reset(self):
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        result = subsample(df, rng, n=100, target_rate=0.30)
+        assert list(result.index) == list(range(len(result)))
+
+    def test_rows_come_from_input(self):
+        """All subsampled rows should exist in the original."""
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        result = subsample(df, rng, n=100, target_rate=0.30)
+        # Check a non-index column for membership
+        for val in result["expected_acv"]:
+            assert val in df["expected_acv"].values
+
+
+# ---------------------------------------------------------------------------
+# Tests — inject_missingness
+# ---------------------------------------------------------------------------
+
+
+class TestInjectMissingness:
+    def test_web_sessions_has_missing(self):
+        df = _make_v5_df(n=1000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        assert result["web_sessions"].isna().sum() > 0
+
+    def test_seniority_has_missing(self):
+        df = _make_v5_df(n=1000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        assert result["seniority"].isna().sum() > 0
+
+    def test_days_since_last_touch_has_missing(self):
+        df = _make_v5_df(n=1000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        assert result["days_since_last_touch"].isna().sum() > 0
+
+    def test_days_since_first_touch_has_missing(self):
+        df = _make_v5_df(n=1000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        assert result["days_since_first_touch"].isna().sum() > 0
+
+    def test_missingness_rates_bounded(self):
+        """Each column's missingness rate should stay under ~20% (well above contract <10%)."""
+        df = _make_v5_df(n=2000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        for col in ["web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch"]:
+            rate = result[col].isna().mean()
+            assert rate < 0.20, f"{col} missingness rate {rate:.2%} exceeds 20%"
+
+    def test_other_columns_not_affected(self):
+        """Columns not in the missingness spec should have no new NaN."""
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        no_miss_cols = [
+            c
+            for c in _FINAL_COLUMNS
+            if c
+            not in ("web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch")
+        ]
+        for col in no_miss_cols:
+            orig_nan = df[col].isna().sum()
+            new_nan = result[col].isna().sum()
+            assert new_nan == orig_nan, f"{col} gained unexpected NaN"
+
+    def test_does_not_modify_input(self):
+        df = _make_v5_df(n=500)
+        original = df.copy()
+        rng = np.random.RandomState(42)
+        inject_missingness(df, rng)
+        pd.testing.assert_frame_equal(df, original)
+
+    def test_deterministic_given_seed(self):
+        df = _make_v5_df(n=500)
+        r1 = inject_missingness(df, np.random.RandomState(42))
+        r2 = inject_missingness(df, np.random.RandomState(42))
+        pd.testing.assert_frame_equal(r1, r2)
+
+    def test_web_sessions_missingness_varies_by_source(self):
+        """SDR outbound should have higher web_sessions missingness than inbound marketing."""
+        df = _make_v5_df(n=3000)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        sdr_rate = result.loc[df["lead_source"] == "sdr_outbound", "web_sessions"].isna().mean()
+        inbound_rate = (
+            result.loc[df["lead_source"] == "inbound_marketing", "web_sessions"].isna().mean()
+        )
+        assert sdr_rate > inbound_rate
+
+
+# ---------------------------------------------------------------------------
+# Tests — boost_leakage_trap
+# ---------------------------------------------------------------------------
+
+
+class TestBoostLeakageTrap:
+    def test_only_converted_leads_boosted(self):
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        trap_col = "__leakage__total_touches_90d"
+        original_trap = df[trap_col].copy()
+        result = boost_leakage_trap(df, rng)
+        # Non-converted leads should be unchanged
+        neg_mask = df["converted"] == 0
+        pd.testing.assert_series_equal(
+            result.loc[neg_mask, trap_col],
+            original_trap[neg_mask],
+            check_names=False,
+        )
+
+    def test_converted_leads_get_higher_or_equal(self):
+        df = _make_v5_df(n=500)
+        rng = np.random.RandomState(42)
+        trap_col = "__leakage__total_touches_90d"
+        original_trap = df[trap_col].copy()
+        result = boost_leakage_trap(df, rng)
+        pos_mask = df["converted"] == 1
+        assert (result.loc[pos_mask, trap_col] >= original_trap[pos_mask]).all()
+
+    def test_does_not_modify_input(self):
+        df = _make_v5_df(n=500)
+        original = df.copy()
+        rng = np.random.RandomState(42)
+        boost_leakage_trap(df, rng)
+        pd.testing.assert_frame_equal(df, original)
+
+    def test_deterministic_given_seed(self):
+        df = _make_v5_df(n=500)
+        r1 = boost_leakage_trap(df, np.random.RandomState(42))
+        r2 = boost_leakage_trap(df, np.random.RandomState(42))
+        pd.testing.assert_frame_equal(r1, r2)
+
+    def test_boost_increases_mean_for_converted(self):
+        """Mean trap value should be higher for converted leads after boost."""
+        df = _make_v5_df(n=1000)
+        rng = np.random.RandomState(42)
+        trap_col = "__leakage__total_touches_90d"
+        before_mean = df.loc[df["converted"] == 1, trap_col].mean()
+        result = boost_leakage_trap(df, rng)
+        after_mean = result.loc[result["converted"] == 1, trap_col].mean()
+        assert after_mean > before_mean
diff --git a/tests/scripts/test_validate_cli.py b/tests/scripts/test_validate_cli.py
new file mode 100644
index 0000000..a313eb2
--- /dev/null
+++ b/tests/scripts/test_validate_cli.py
@@ -0,0 +1,180 @@
+"""Tests for scripts/validate_lead_scoring_dataset.py CLI entrypoint."""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# ---------------------------------------------------------------------------
+# Import the script module
+# ---------------------------------------------------------------------------
+_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "validate_lead_scoring_dataset.py"
+
+
+def _make_valid_csv(path: Path, n: int = 400, seed: int = 42) -> Path:
+    """Write a small CSV that passes validation (including baseline AUC ≥ 0.62).
+
+    Injects real feature-target correlation so the LR baseline achieves a
+    reasonable AUC despite the small sample size.
+    """
+    rng = np.random.RandomState(seed)
+
+    # Generate a latent score and derive conversion from it, ensuring signal.
+    # Shift bias so base rate ≈ 30%.
+    latent = rng.normal(0, 1, size=n)
+    prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85)))  # shifted sigmoid
+    converted = (rng.random(n) < prob).astype(int)
+
+    # Correlated numeric features (positive correlation with latent).
+    inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None)
+    web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype(
+        float
+    )
+    demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int)
+    opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int)
+
+    df = pd.DataFrame(
+        {
+            "industry": rng.choice(
+                ["manufacturing", "logistics", "services", "healthcare"], size=n
+            ),
+            "region": rng.choice(["US", "UK"], size=n),
+            "company_size": rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n),
+            "company_revenue": rng.choice(
+                ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n
+            ),
+            "contact_role": rng.choice(
+                ["finance", "ap_manager", "it_director", "procurement"], size=n
+            ),
+            "seniority": rng.choice(
+                ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n
+            ),
+            "lead_source": rng.choice(
+                ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n
+            ),
+            "opportunity_created": opp_created,
+            "demo_completed": demo_completed,
+            "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0),
+            "inbound_touches": inbound,
+            "outbound_touches": rng.poisson(2, size=n),
+            "touches_week_1": rng.poisson(2, size=n),
+            "days_since_first_touch": rng.uniform(0, 14, size=n).round(1),
+            "web_sessions": web_sessions,
+            "sales_activities": rng.poisson(3, size=n),
+            "days_since_last_touch": rng.uniform(0, 14, size=n).round(1),
+            "__leakage__total_touches_90d": converted * rng.poisson(8, size=n)
+            + rng.poisson(3, size=n),
+            "converted": converted,
+        }
+    )
+    # Inject small missingness to be realistic
+    miss_idx = rng.choice(n, size=int(n * 0.05), replace=False)
+    df.loc[miss_idx, "web_sessions"] = np.nan
+
+    csv_path = path / "valid.csv"
+    df.to_csv(csv_path, index=False)
+    return csv_path
+
+
+def _make_invalid_csv(path: Path) -> Path:
+    """Write a CSV missing the target column (will fail validation)."""
+    df = pd.DataFrame({"industry": ["a", "b"], "region": ["US", "UK"]})
+    csv_path = path / "invalid.csv"
+    df.to_csv(csv_path, index=False)
+    return csv_path
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestValidateCLI:
+    def test_valid_csv_exit_code_zero(self, tmp_path):
+        csv_path = _make_valid_csv(tmp_path)
+        result = subprocess.run(  # noqa: S603
+            [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}"
+
+    def test_invalid_csv_exit_code_one(self, tmp_path):
+        csv_path = _make_invalid_csv(tmp_path)
+        result = subprocess.run(  # noqa: S603
+            [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        assert result.returncode == 1
+
+    def test_out_json_flag(self, tmp_path):
+        csv_path = _make_valid_csv(tmp_path)
+        json_path = tmp_path / "report.json"
+        subprocess.run(  # noqa: S603
+            [
+                sys.executable,
+                str(_SCRIPT_PATH),
+                "--csv",
+                str(csv_path),
+                "--out-json",
+                str(json_path),
+            ],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        # JSON report should be written regardless of pass/fail
+        assert json_path.exists()
+        report = json.loads(json_path.read_text())
+        assert "passed" in report
+        assert "checks" in report
+
+    def test_emit_release_snippet_flag(self, tmp_path):
+        csv_path = _make_valid_csv(tmp_path)
+        result = subprocess.run(  # noqa: S603
+            [
+                sys.executable,
+                str(_SCRIPT_PATH),
+                "--csv",
+                str(csv_path),
+                "--emit-release-snippet",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        # Snippet should be emitted regardless of pass/fail
+        assert "RELEASE SNIPPET" in result.stdout
+
+    def test_enforce_1000_flag_fails_on_small(self, tmp_path):
+        csv_path = _make_valid_csv(tmp_path, n=200)
+        result = subprocess.run(  # noqa: S603
+            [
+                sys.executable,
+                str(_SCRIPT_PATH),
+                "--csv",
+                str(csv_path),
+                "--enforce-1000",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        assert result.returncode == 1
+
+    def test_missing_csv_arg_fails(self):
+        result = subprocess.run(  # noqa: S603
+            [sys.executable, str(_SCRIPT_PATH)],
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        assert result.returncode != 0

From f087d764ce3e7ad8812736f062ed47c62c95f5a6 Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Thu, 30 Apr 2026 19:59:39 +0300
Subject: [PATCH 2/2] =?UTF-8?q?refactor:=20address=20PR=20review=20?=
 =?UTF-8?q?=E2=80=94=20shared=20fixtures,=20stronger=20assertions,=20bound?=
 =?UTF-8?q?ary=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extract shared `make_v5_dataset()` builder into `tests/conftest.py`,
  eliminating duplicate synthetic data generators across 3 test files
- Delete 4 tautological missingness tests (trivially true at n=1000)
  and the weak `test_rows_come_from_input` (O(n^2), tests pandas internals)
- Strengthen `test_insufficient_negatives` to assert output composition
- Add `@pytest.mark.parametrize` for subsample target rate/seed combos
  and missingness rate bounds across multiple seeds
- Add boundary tests: `subsample` with n > input, `boost_leakage_trap`
  with zero converted leads, `inject_missingness` with small n and
  unknown lead sources, `rename_and_select` with extra columns
- Use session-scoped fixtures for CLI tests (avoid regenerating CSV 4x)
- Add precondition assertion on fixture conversion rate so sigmoid
  drift produces a clear error message instead of mysterious CLI failure

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/conftest.py                       | 143 ++++++++++++++++++++++
 tests/scripts/test_build_v5_snapshot.py | 146 ++++++++++++++---------
 tests/scripts/test_validate_cli.py      | 117 ++++++------------
 tests/validation/test_lead_scoring.py   | 150 ++++++------------------
 4 files changed, 305 insertions(+), 251 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..60b1503
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,143 @@
+"""Shared test fixtures and helpers for leadforge tests."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+# ---------------------------------------------------------------------------
+# Canonical v5 column set (post-rename)
+# ---------------------------------------------------------------------------
+
+V5_COLUMNS = [
+    "industry",
+    "region",
+    "company_size",
+    "company_revenue",
+    "contact_role",
+    "seniority",
+    "lead_source",
+    "opportunity_created",
+    "demo_completed",
+    "expected_acv",
+    "inbound_touches",
+    "outbound_touches",
+    "touches_week_1",
+    "days_since_first_touch",
+    "web_sessions",
+    "sales_activities",
+    "days_since_last_touch",
+    "__leakage__total_touches_90d",
+    "converted",
+]
+
+# Category value pools shared across all synthetic data builders.
+INDUSTRIES = ["manufacturing", "logistics", "services", "healthcare"]
+REGIONS = ["US", "UK"]
+COMPANY_SIZES = ["200-499", "500-999", "1000-1999", "2000+"]
+COMPANY_REVENUES = ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"]
+CONTACT_ROLES = ["finance", "ap_manager", "it_director", "procurement"]
+SENIORITIES = ["individual_contributor", "manager", "director", "vp", "c_suite"]
+LEAD_SOURCES = ["inbound_marketing", "sdr_outbound", "partner_referral"]
+
+
+def make_v5_dataset(
+    n: int = 200,
+    conversion_rate: float = 0.30,
+    include_leakage: bool = True,
+    deterministic_col: bool = False,
+    inject_signal: bool = False,
+    seed: int = 99,
+) -> pd.DataFrame:
+    """Build a synthetic dataset in v5 column format.
+
+    Parameters
+    ----------
+    n : int
+        Number of rows.
+    conversion_rate : float
+        Target positive fraction (int(n * rate) rows); ignored when inject_signal=True.
+    include_leakage : bool
+        Whether to include __leakage__total_touches_90d column.
+    deterministic_col : bool
+        If True, add a "bad_feature" column set to "leaked" for rows 0-59 and
+        force those rows to converted=1 (useful for group determinism checks).
+    inject_signal : bool
+        If True, generate features that are correlated with the target so
+        that a logistic regression baseline can achieve AUC >= 0.62.  The
+        conversion rate is approximate (~30%) rather than exact.
+    seed : int
+        Random seed for reproducibility.
+    """
+    rng = np.random.RandomState(seed)
+
+    if inject_signal:
+        # Latent score drives both features and outcome.
+        # Bias of -0.85 targets ~30% base rate under the sigmoid.
+        latent = rng.normal(0, 1, size=n)
+        prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85)))
+        converted = (rng.random(n) < prob).astype(int)
+
+        inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None)
+        web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype(
+            float
+        )
+        demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int)
+        opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int)
+    else:
+        n_pos = int(n * conversion_rate)
+        n_neg = n - n_pos
+        converted = np.array([1] * n_pos + [0] * n_neg)
+        rng.shuffle(converted)
+
+        inbound = rng.poisson(3, size=n)
+        web_sessions = rng.poisson(4, size=n).astype(float)
+        demo_completed = rng.randint(0, 2, size=n)
+        opp_created = rng.randint(0, 2, size=n)
+
+    df = pd.DataFrame(
+        {
+            "industry": rng.choice(INDUSTRIES, size=n),
+            "region": rng.choice(REGIONS, size=n),
+            "company_size": rng.choice(COMPANY_SIZES, size=n),
+            "company_revenue": rng.choice(COMPANY_REVENUES, size=n),
+            "contact_role": rng.choice(CONTACT_ROLES, size=n),
+            "seniority": rng.choice(SENIORITIES, size=n),
+            "lead_source": rng.choice(LEAD_SOURCES, size=n),
+            "opportunity_created": opp_created,
+            "demo_completed": demo_completed,
+            "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0),
+            "inbound_touches": inbound,
+            "outbound_touches": rng.poisson(2, size=n),
+            "touches_week_1": rng.poisson(2, size=n),
+            "days_since_first_touch": rng.uniform(0, 14, size=n).round(1),
+            "web_sessions": web_sessions,
+            "sales_activities": rng.poisson(3, size=n),
+            "days_since_last_touch": rng.uniform(0, 14, size=n).round(1),
+            "converted": converted,
+        }
+    )
+
+    # Inject some missingness
+    miss_idx = rng.choice(n, size=int(n * 0.05), replace=False)
+    df.loc[miss_idx, "web_sessions"] = np.nan
+
+    if include_leakage:
+        noise = rng.poisson(3, size=n)
+        df["__leakage__total_touches_90d"] = converted * rng.poisson(8, size=n) + noise
+
+    if deterministic_col:
+        df["bad_feature"] = "normal"
+        df.loc[:59, "bad_feature"] = "leaked"
+        df.loc[:59, "converted"] = 1
+
+    return df
+
+
+def save_csv(df: pd.DataFrame, tmp_path: Path, name: str = "data.csv") -> Path:
+    """Write a DataFrame to CSV and return the path."""
+    path = tmp_path / name
+    df.to_csv(path, index=False)
+    return path
diff --git a/tests/scripts/test_build_v5_snapshot.py b/tests/scripts/test_build_v5_snapshot.py
index 25c8e71..0772fb1 100644
--- a/tests/scripts/test_build_v5_snapshot.py
+++ b/tests/scripts/test_build_v5_snapshot.py
@@ -43,9 +43,10 @@ def _make_snapshot(
     conversion_rate: float = 0.30,
     seed: int = 42,
 ) -> pd.DataFrame:
-    """Build a minimal snapshot DataFrame that looks like build_snapshot() output.
+    """Build a minimal snapshot DataFrame with pre-rename column names.
 
-    Contains the pre-rename column names expected by the pipeline steps.
+    This is distinct from the shared ``make_v5_dataset`` because it uses
+    the *pre-rename* columns that ``build_snapshot()`` actually produces.
     """
     rng = np.random.RandomState(seed)
     n_pos = int(n * conversion_rate)
@@ -167,7 +168,6 @@ def test_missing_column_raises(self):
         snapshot = _make_snapshot()
         snapshot = derive_binary_features(snapshot)
         snapshot = cap_expected_acv(snapshot)
-        # Drop a required source column
         snapshot = snapshot.drop(columns=["industry"])
         with pytest.raises(ValueError, match="Missing required columns"):
             rename_and_select(snapshot)
@@ -177,10 +177,19 @@ def test_rename_mapping_applied(self):
         df = derive_binary_features(snapshot)
         df = cap_expected_acv(df)
         result = rename_and_select(df)
-        # All renamed columns should exist in output
         for new_name in _RENAME_MAP.values():
             assert new_name in result.columns
 
+    def test_extra_columns_are_dropped(self):
+        """Columns not in _FINAL_COLUMNS should be silently dropped."""
+        snapshot = _make_snapshot()
+        snapshot["extra_col"] = 999
+        df = derive_binary_features(snapshot)
+        df = cap_expected_acv(df)
+        result = rename_and_select(df)
+        assert "extra_col" not in result.columns
+        assert list(result.columns) == _FINAL_COLUMNS
+
 
 # ---------------------------------------------------------------------------
 # Tests — subsample
@@ -194,12 +203,16 @@ def test_output_size(self):
         result = subsample(df, rng, n=100, target_rate=0.30)
         assert len(result) == 100
 
-    def test_target_rate_approximate(self):
-        df = _make_v5_df(n=500)
-        rng = np.random.RandomState(42)
-        result = subsample(df, rng, n=200, target_rate=0.30)
+    @pytest.mark.parametrize(
+        ("target_rate", "seed"),
+        [(0.30, 42), (0.30, 99), (0.20, 42), (0.40, 7)],
+    )
+    def test_target_rate_approximate(self, target_rate, seed):
+        df = _make_v5_df(n=500, seed=seed)
+        rng = np.random.RandomState(seed)
+        result = subsample(df, rng, n=200, target_rate=target_rate)
         actual_rate = result["converted"].mean()
-        assert actual_rate == pytest.approx(0.30, abs=0.01)
+        assert actual_rate == pytest.approx(target_rate, abs=0.01)
 
     def test_deterministic_given_seed(self):
         df = _make_v5_df(n=500)
@@ -220,10 +233,16 @@ def test_insufficient_positives(self, capsys):
     def test_insufficient_negatives(self, capsys):
         """When fewer negatives available than needed, warns and adjusts."""
         df = _make_v5_df(n=200, conversion_rate=0.95)  # only ~10 negatives
+        n_neg_available = (df["converted"] == 0).sum()
         rng = np.random.RandomState(42)
-        subsample(df, rng, n=100, target_rate=0.10)  # need 90 negatives
+        result = subsample(df, rng, n=100, target_rate=0.10)  # need 90 negatives
         captured = capsys.readouterr()
         assert "WARNING" in captured.err
+        # Verify actual composition: negatives capped at available count
+        n_neg_result = (result["converted"] == 0).sum()
+        assert n_neg_result <= n_neg_available
+        # Output should still contain rows (not empty)
+        assert len(result) > 0
 
     def test_index_is_reset(self):
         df = _make_v5_df(n=500)
@@ -231,14 +250,15 @@ def test_index_is_reset(self):
         result = subsample(df, rng, n=100, target_rate=0.30)
         assert list(result.index) == list(range(len(result)))
 
-    def test_rows_come_from_input(self):
-        """All subsampled rows should exist in the original."""
-        df = _make_v5_df(n=500)
+    def test_n_larger_than_input_caps_gracefully(self, capsys):
+        """Requesting more rows than available caps at available count."""
+        df = _make_v5_df(n=50)
         rng = np.random.RandomState(42)
-        result = subsample(df, rng, n=100, target_rate=0.30)
-        # Check a non-index column for membership
-        for val in result["expected_acv"]:
-            assert val in df["expected_acv"].values
+        result = subsample(df, rng, n=200, target_rate=0.30)
+        captured = capsys.readouterr()
+        assert "WARNING" in captured.err
+        # Output can never exceed the number of rows available in the input
+        assert len(result) <= len(df)
 
 
 # ---------------------------------------------------------------------------
@@ -247,36 +267,18 @@ def test_rows_come_from_input(self):
 
 
 class TestInjectMissingness:
-    def test_web_sessions_has_missing(self):
-        df = _make_v5_df(n=1000)
-        rng = np.random.RandomState(42)
+    @pytest.mark.parametrize("seed", [42, 99, 7])
+    def test_missingness_rates_bounded(self, seed):
+        """Each column's missingness rate should stay under 20% across seeds."""
+        df = _make_v5_df(n=2000, seed=seed)
+        rng = np.random.RandomState(seed)
         result = inject_missingness(df, rng)
-        assert result["web_sessions"].isna().sum() > 0
-
-    def test_seniority_has_missing(self):
-        df = _make_v5_df(n=1000)
-        rng = np.random.RandomState(42)
-        result = inject_missingness(df, rng)
-        assert result["seniority"].isna().sum() > 0
-
-    def test_days_since_last_touch_has_missing(self):
-        df = _make_v5_df(n=1000)
-        rng = np.random.RandomState(42)
-        result = inject_missingness(df, rng)
-        assert result["days_since_last_touch"].isna().sum() > 0
-
-    def test_days_since_first_touch_has_missing(self):
-        df = _make_v5_df(n=1000)
-        rng = np.random.RandomState(42)
-        result = inject_missingness(df, rng)
-        assert result["days_since_first_touch"].isna().sum() > 0
-
-    def test_missingness_rates_bounded(self):
-        """Each column's missingness rate should stay under ~20% (well above contract <10%)."""
-        df = _make_v5_df(n=2000)
-        rng = np.random.RandomState(42)
-        result = inject_missingness(df, rng)
-        for col in ["web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch"]:
+        for col in [
+            "web_sessions",
+            "seniority",
+            "days_since_last_touch",
+            "days_since_first_touch",
+        ]:
             rate = result[col].isna().mean()
             assert rate < 0.20, f"{col} missingness rate {rate:.2%} exceeds 20%"
 
@@ -285,16 +287,17 @@ def test_other_columns_not_affected(self):
         df = _make_v5_df(n=500)
         rng = np.random.RandomState(42)
         result = inject_missingness(df, rng)
-        no_miss_cols = [
-            c
-            for c in _FINAL_COLUMNS
-            if c
-            not in ("web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch")
-        ]
-        for col in no_miss_cols:
-            orig_nan = df[col].isna().sum()
-            new_nan = result[col].isna().sum()
-            assert new_nan == orig_nan, f"{col} gained unexpected NaN"
+        miss_cols = {
+            "web_sessions",
+            "seniority",
+            "days_since_last_touch",
+            "days_since_first_touch",
+        }
+        for col in _FINAL_COLUMNS:
+            if col not in miss_cols:
+                orig_nan = df[col].isna().sum()
+                new_nan = result[col].isna().sum()
+                assert new_nan == orig_nan, f"{col} gained unexpected NaN"
 
     def test_does_not_modify_input(self):
         df = _make_v5_df(n=500)
@@ -310,7 +313,7 @@ def test_deterministic_given_seed(self):
         pd.testing.assert_frame_equal(r1, r2)
 
     def test_web_sessions_missingness_varies_by_source(self):
-        """SDR outbound should have higher web_sessions missingness than inbound marketing."""
+        """SDR outbound should have higher web_sessions missingness than inbound."""
         df = _make_v5_df(n=3000)
         rng = np.random.RandomState(42)
         result = inject_missingness(df, rng)
@@ -320,6 +323,24 @@ def test_web_sessions_missingness_varies_by_source(self):
         )
         assert sdr_rate > inbound_rate
 
+    def test_small_n_no_crash(self):
+        """Should not crash on small DataFrames, even with sparse lead sources."""
+        df = _make_v5_df(n=10)
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        assert len(result) == 10
+
+    def test_no_matching_lead_source(self):
+        """If no rows match a source-conditional rate, no crash or extra NaN."""
+        df = _make_v5_df(n=100)
+        # Force all lead_source to a value not in the missingness spec
+        df["lead_source"] = "direct"
+        rng = np.random.RandomState(42)
+        result = inject_missingness(df, rng)
+        # With no matching source, web_sessions should gain no source-driven NaN,
+        # while days_since_last_touch still gets 3% MCAR; only row count is asserted.
+        assert len(result) == 100
+
 
 # ---------------------------------------------------------------------------
 # Tests — boost_leakage_trap
@@ -333,7 +354,6 @@ def test_only_converted_leads_boosted(self):
         trap_col = "__leakage__total_touches_90d"
         original_trap = df[trap_col].copy()
         result = boost_leakage_trap(df, rng)
-        # Non-converted leads should be unchanged
         neg_mask = df["converted"] == 0
         pd.testing.assert_series_equal(
             result.loc[neg_mask, trap_col],
@@ -372,3 +392,13 @@ def test_boost_increases_mean_for_converted(self):
         result = boost_leakage_trap(df, rng)
         after_mean = result.loc[result["converted"] == 1, trap_col].mean()
         assert after_mean > before_mean
+
+    def test_zero_converted_leads_no_change(self):
+        """When no leads are converted, trap values should be unchanged."""
+        df = _make_v5_df(n=200, conversion_rate=0.30)
+        df["converted"] = 0  # force all negative
+        rng = np.random.RandomState(42)
+        trap_col = "__leakage__total_touches_90d"
+        original = df[trap_col].copy()
+        result = boost_leakage_trap(df, rng)
+        pd.testing.assert_series_equal(result[trap_col], original, check_names=False)
diff --git a/tests/scripts/test_validate_cli.py b/tests/scripts/test_validate_cli.py
index a313eb2..0fd31f3 100644
--- a/tests/scripts/test_validate_cli.py
+++ b/tests/scripts/test_validate_cli.py
@@ -7,8 +7,10 @@
 import sys
 from pathlib import Path
 
-import numpy as np
 import pandas as pd
+import pytest
+
+from tests.conftest import make_v5_dataset, save_csv
 
 # ---------------------------------------------------------------------------
 # Import the script module
@@ -16,77 +18,35 @@
 _SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "validate_lead_scoring_dataset.py"
 
 
-def _make_valid_csv(path: Path, n: int = 400, seed: int = 42) -> Path:
-    """Write a small CSV that passes validation (including baseline AUC ≥ 0.62).
+# ---------------------------------------------------------------------------
+# Session-scoped fixtures (avoid regenerating data per test)
+# ---------------------------------------------------------------------------
+
 
-    Injects real feature-target correlation so the LR baseline achieves a
-    reasonable AUC despite the small sample size.
+@pytest.fixture(scope="session")
+def valid_csv(tmp_path_factory: pytest.TempPathFactory) -> Path:
+    """Write a CSV that passes all validation checks.
+
+    Uses ``inject_signal=True`` so the LR baseline achieves AUC >= 0.62
+    (the shifted sigmoid with bias -0.85 targets ~30% conversion rate).
     """
-    rng = np.random.RandomState(seed)
-
-    # Generate a latent score and derive conversion from it, ensuring signal.
-    # Shift bias so base rate ≈ 30%.
-    latent = rng.normal(0, 1, size=n)
-    prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85)))  # shifted sigmoid
-    converted = (rng.random(n) < prob).astype(int)
-
-    # Correlated numeric features (positive correlation with latent).
-    inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None)
-    web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype(
-        float
-    )
-    demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int)
-    opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int)
-
-    df = pd.DataFrame(
-        {
-            "industry": rng.choice(
-                ["manufacturing", "logistics", "services", "healthcare"], size=n
-            ),
-            "region": rng.choice(["US", "UK"], size=n),
-            "company_size": rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n),
-            "company_revenue": rng.choice(
-                ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n
-            ),
-            "contact_role": rng.choice(
-                ["finance", "ap_manager", "it_director", "procurement"], size=n
-            ),
-            "seniority": rng.choice(
-                ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n
-            ),
-            "lead_source": rng.choice(
-                ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n
-            ),
-            "opportunity_created": opp_created,
-            "demo_completed": demo_completed,
-            "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0),
-            "inbound_touches": inbound,
-            "outbound_touches": rng.poisson(2, size=n),
-            "touches_week_1": rng.poisson(2, size=n),
-            "days_since_first_touch": rng.uniform(0, 14, size=n).round(1),
-            "web_sessions": web_sessions,
-            "sales_activities": rng.poisson(3, size=n),
-            "days_since_last_touch": rng.uniform(0, 14, size=n).round(1),
-            "__leakage__total_touches_90d": converted * rng.poisson(8, size=n)
-            + rng.poisson(3, size=n),
-            "converted": converted,
-        }
+    tmp = tmp_path_factory.mktemp("validate_cli")
+    df = make_v5_dataset(n=400, inject_signal=True, seed=42)
+    # Precondition: conversion rate must be in [15%, 40%] for checks to pass
+    rate = df["converted"].mean()
+    assert 0.15 <= rate <= 0.40, (
+        f"Fixture conversion rate {rate:.1%} outside [15%, 40%]; "
+        f"adjust sigmoid bias in make_v5_dataset(inject_signal=True)"
     )
-    # Inject small missingness to be realistic
-    miss_idx = rng.choice(n, size=int(n * 0.05), replace=False)
-    df.loc[miss_idx, "web_sessions"] = np.nan
-
-    csv_path = path / "valid.csv"
-    df.to_csv(csv_path, index=False)
-    return csv_path
+    return save_csv(df, tmp, "valid.csv")
 
 
-def _make_invalid_csv(path: Path) -> Path:
+@pytest.fixture(scope="session")
+def invalid_csv(tmp_path_factory: pytest.TempPathFactory) -> Path:
     """Write a CSV missing the target column (will fail validation)."""
+    tmp = tmp_path_factory.mktemp("validate_cli_invalid")
     df = pd.DataFrame({"industry": ["a", "b"], "region": ["US", "UK"]})
-    csv_path = path / "invalid.csv"
-    df.to_csv(csv_path, index=False)
-    return csv_path
+    return save_csv(df, tmp, "invalid.csv")
 
 
 # ---------------------------------------------------------------------------
@@ -95,35 +55,32 @@ def _make_invalid_csv(path: Path) -> Path:
 
 
 class TestValidateCLI:
-    def test_valid_csv_exit_code_zero(self, tmp_path):
-        csv_path = _make_valid_csv(tmp_path)
+    def test_valid_csv_exit_code_zero(self, valid_csv):
         result = subprocess.run(  # noqa: S603
-            [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)],
+            [sys.executable, str(_SCRIPT_PATH), "--csv", str(valid_csv)],
             capture_output=True,
             text=True,
             timeout=120,
         )
         assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}"
 
-    def test_invalid_csv_exit_code_one(self, tmp_path):
-        csv_path = _make_invalid_csv(tmp_path)
+    def test_invalid_csv_exit_code_one(self, invalid_csv):
         result = subprocess.run(  # noqa: S603
-            [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)],
+            [sys.executable, str(_SCRIPT_PATH), "--csv", str(invalid_csv)],
             capture_output=True,
             text=True,
             timeout=120,
         )
         assert result.returncode == 1
 
-    def test_out_json_flag(self, tmp_path):
-        csv_path = _make_valid_csv(tmp_path)
+    def test_out_json_flag(self, valid_csv, tmp_path):
         json_path = tmp_path / "report.json"
         subprocess.run(  # noqa: S603
             [
                 sys.executable,
                 str(_SCRIPT_PATH),
                 "--csv",
-                str(csv_path),
+                str(valid_csv),
                 "--out-json",
                 str(json_path),
             ],
@@ -131,37 +88,33 @@ def test_out_json_flag(self, tmp_path):
             text=True,
             timeout=120,
         )
-        # JSON report should be written regardless of pass/fail
         assert json_path.exists()
         report = json.loads(json_path.read_text())
         assert "passed" in report
         assert "checks" in report
 
-    def test_emit_release_snippet_flag(self, tmp_path):
-        csv_path = _make_valid_csv(tmp_path)
+    def test_emit_release_snippet_flag(self, valid_csv):
         result = subprocess.run(  # noqa: S603
             [
                 sys.executable,
                 str(_SCRIPT_PATH),
                 "--csv",
-                str(csv_path),
+                str(valid_csv),
                 "--emit-release-snippet",
             ],
             capture_output=True,
             text=True,
             timeout=120,
         )
-        # Snippet should be emitted regardless of pass/fail
         assert "RELEASE SNIPPET" in result.stdout
 
-    def test_enforce_1000_flag_fails_on_small(self, tmp_path):
-        csv_path = _make_valid_csv(tmp_path, n=200)
+    def test_enforce_1000_flag_fails_on_small(self, valid_csv):
         result = subprocess.run(  # noqa: S603
             [
                 sys.executable,
                 str(_SCRIPT_PATH),
                 "--csv",
-                str(csv_path),
+                str(valid_csv),
                 "--enforce-1000",
             ],
             capture_output=True,
diff --git a/tests/validation/test_lead_scoring.py b/tests/validation/test_lead_scoring.py
index 131dda1..d0b9023 100644
--- a/tests/validation/test_lead_scoring.py
+++ b/tests/validation/test_lead_scoring.py
@@ -20,102 +20,30 @@
     _check_schema,
     validate_dataset,
 )
+from tests.conftest import make_v5_dataset, save_csv
 
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
 
 
-def _make_dataset(
-    n: int = 200,
-    conversion_rate: float = 0.30,
-    include_leakage: bool = True,
-    deterministic_col: bool = False,
-    seed: int = 99,
-) -> pd.DataFrame:
-    """Build a small synthetic dataset that passes basic checks."""
-    rng = np.random.RandomState(seed)
-    n_pos = int(n * conversion_rate)
-    n_neg = n - n_pos
-
-    converted = np.array([1] * n_pos + [0] * n_neg)
-    rng.shuffle(converted)
-
-    industries = rng.choice(["manufacturing", "logistics", "services", "healthcare"], size=n)
-    regions = rng.choice(["US", "UK"], size=n)
-    sizes = rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n)
-    revenues = rng.choice(["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n)
-    roles = rng.choice(["finance", "ap_manager", "it_director", "procurement"], size=n)
-    seniority = rng.choice(
-        ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n
-    )
-    sources = rng.choice(["inbound_marketing", "sdr_outbound", "partner_referral"], size=n)
-
-    df = pd.DataFrame(
-        {
-            "industry": industries,
-            "region": regions,
-            "company_size": sizes,
-            "company_revenue": revenues,
-            "contact_role": roles,
-            "seniority": seniority,
-            "lead_source": sources,
-            "opportunity_created": rng.randint(0, 2, size=n),
-            "demo_completed": rng.randint(0, 2, size=n),
-            "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0),
-            "inbound_touches": rng.poisson(3, size=n),
-            "outbound_touches": rng.poisson(2, size=n),
-            "touches_week_1": rng.poisson(2, size=n),
-            "days_since_first_touch": rng.uniform(0, 14, size=n).round(1),
-            "web_sessions": rng.poisson(4, size=n).astype(float),
-            "sales_activities": rng.poisson(3, size=n),
-            "days_since_last_touch": rng.uniform(0, 14, size=n).round(1),
-            "converted": converted,
-        }
-    )
-
-    # Inject some missingness
-    miss_idx = rng.choice(n, size=int(n * 0.05), replace=False)
-    df.loc[miss_idx, "web_sessions"] = np.nan
-
-    if include_leakage:
-        # Leakage: positively correlated with target
-        noise = rng.poisson(3, size=n)
-        df["__leakage__total_touches_90d"] = converted * rng.poisson(8, size=n) + noise
-
-    if deterministic_col:
-        # Make a column that perfectly predicts conversion for a large group
-        df["bad_feature"] = "normal"
-        # First 60 rows all converted = 1
-        df.loc[:59, "bad_feature"] = "leaked"
-        df.loc[:59, "converted"] = 1
-
-    return df
-
-
-def _save(df: pd.DataFrame, tmp_path, name: str = "data.csv"):
-    path = tmp_path / name
-    df.to_csv(path, index=False)
-    return path
-
-
 @pytest.fixture
 def good_csv(tmp_path):
     """Write a well-formed synthetic dataset."""
-    return _save(_make_dataset(n=200, include_leakage=True), tmp_path, "good.csv")
+    return save_csv(make_v5_dataset(n=200, include_leakage=True), tmp_path, "good.csv")
 
 
 @pytest.fixture
 def bad_deterministic_csv(tmp_path):
     """Write a dataset with a deterministic group."""
-    return _save(_make_dataset(n=200, deterministic_col=True), tmp_path, "bad.csv")
+    return save_csv(make_v5_dataset(n=200, deterministic_col=True), tmp_path, "bad.csv")
 
 
 @pytest.fixture
 def no_target_csv(tmp_path):
     """Write a dataset missing the target column."""
-    df = _make_dataset(n=200).drop(columns=["converted"])
-    return _save(df, tmp_path, "no_target.csv")
+    df = make_v5_dataset(n=200).drop(columns=["converted"])
+    return save_csv(df, tmp_path, "no_target.csv")
 
 
 # ---------------------------------------------------------------------------
@@ -154,9 +82,9 @@ def test_missing_target_fails(self, no_target_csv):
         assert not target_check.passed
 
     def test_nan_target_short_circuits(self, tmp_path):
-        df = _make_dataset(n=200)
+        df = make_v5_dataset(n=200)
         df.loc[0, "converted"] = np.nan
-        path = _save(df, tmp_path, "nan_target.csv")
+        path = save_csv(df, tmp_path, "nan_target.csv")
         report = validate_dataset(path)
         # target_no_missing should fail
         no_miss = next(c for c in report.checks if c.name == "target_no_missing")
@@ -165,18 +93,18 @@ def test_nan_target_short_circuits(self, tmp_path):
         assert report.baseline is None
 
     def test_nonbinary_target_short_circuits(self, tmp_path):
-        df = _make_dataset(n=200)
+        df = make_v5_dataset(n=200)
         df.loc[0, "converted"] = 2
-        path = _save(df, tmp_path, "nonbinary.csv")
+        path = save_csv(df, tmp_path, "nonbinary.csv")
         report = validate_dataset(path)
         binary_check = next(c for c in report.checks if c.name == "target_binary")
         assert not binary_check.passed
         assert report.baseline is None
 
     def test_single_class_target_short_circuits(self, tmp_path):
-        df = _make_dataset(n=200)
+        df = make_v5_dataset(n=200)
         df["converted"] = 0  # all negatives
-        path = _save(df, tmp_path, "single_class.csv")
+        path = save_csv(df, tmp_path, "single_class.csv")
         report = validate_dataset(path)
         both = next(c for c in report.checks if c.name == "target_both_classes")
         assert not both.passed
@@ -188,7 +116,7 @@ def test_target_both_classes_passes(self, good_csv):
         assert both.passed
 
     def test_banned_columns_detected(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df["current_stage"] = "active"
         cfg = ValidationConfig(enforce_row_count=False)
         checks = _check_schema(df, cfg)
@@ -197,7 +125,7 @@ def test_banned_columns_detected(self, tmp_path):
         assert "current_stage" in banned.details
 
     def test_id_columns_detected(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df["lead_id"] = range(len(df))
         cfg = ValidationConfig(enforce_row_count=False)
         checks = _check_schema(df, cfg)
@@ -205,21 +133,21 @@ def test_id_columns_detected(self, tmp_path):
         assert not id_check.passed
 
     def test_enforce_row_count(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         cfg = ValidationConfig(enforce_row_count=True, expected_rows=1000)
         checks = _check_schema(df, cfg)
         rc = next(c for c in checks if c.name == "row_count")
         assert not rc.passed
 
     def test_exact_row_count_passes(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         cfg = ValidationConfig(enforce_row_count=True, expected_rows=200)
         checks = _check_schema(df, cfg)
         rc = next(c for c in checks if c.name == "row_count")
         assert rc.passed
 
     def test_duplicate_rows_detected(self, tmp_path):
-        df = _make_dataset(n=50, include_leakage=False)
+        df = make_v5_dataset(n=50, include_leakage=False)
         # Duplicate a lot of rows
         df = pd.concat([df, df], ignore_index=True)
         cfg = ValidationConfig(enforce_row_count=False)
@@ -237,7 +165,7 @@ def test_missing_expected_features_warned(self):
         assert "missing" in feat.details
 
     def test_total_touches_all_naming(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df["total_touches_all"] = 5
         cfg = ValidationConfig(enforce_row_count=False)
         checks = _check_schema(df, cfg)
@@ -245,7 +173,7 @@ def test_total_touches_all_naming(self):
         assert not naming.passed
 
     def test_no_leakage_columns(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         cfg = ValidationConfig(enforce_row_count=False)
         checks = _check_schema(df, cfg)
         naming = next(c for c in checks if c.name == "leakage_naming")
@@ -253,7 +181,7 @@ def test_no_leakage_columns(self):
         assert "no leakage" in naming.details
 
     def test_multiple_leakage_columns(self):
-        df = _make_dataset(n=200, include_leakage=True)
+        df = make_v5_dataset(n=200, include_leakage=True)
         df["__leakage__another"] = 1
         cfg = ValidationConfig(enforce_row_count=False)
         checks = _check_schema(df, cfg)
@@ -269,14 +197,14 @@ def test_multiple_leakage_columns(self):
 
 class TestMissingness:
     def test_high_missingness_fails(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df.loc[:40, "inbound_touches"] = np.nan  # >20% missing
         cfg = ValidationConfig(max_col_missing_rate=0.10)
         checks, miss_map = _check_missingness(df, cfg)
         assert not checks[0].passed
 
     def test_low_missingness_passes(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         cfg = ValidationConfig(max_col_missing_rate=0.10)
         checks, _ = _check_missingness(df, cfg)
         assert checks[0].passed
@@ -296,7 +224,7 @@ def test_deterministic_group_fails(self, bad_deterministic_csv):
 
     def test_low_conversion_group_fails(self, tmp_path):
         """A group where conversion rate is near 0% should also fail."""
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df["bad_feature"] = "normal"
         # First 60 rows all converted = 0 for this group
         df.loc[:59, "bad_feature"] = "zero_group"
@@ -320,12 +248,12 @@ def test_good_dataset_passes_determinism(self, good_csv):
 class TestConversionRate:
     def test_rate_outside_range_fails(self):
         # 5% conversion rate — below 15%
-        df = _make_dataset(n=200, conversion_rate=0.05, include_leakage=False)
+        df = make_v5_dataset(n=200, conversion_rate=0.05, include_leakage=False)
         checks = _check_conversion_rate(df)
         assert not checks[0].passed
 
     def test_rate_in_range_passes(self):
-        df = _make_dataset(n=200, conversion_rate=0.30, include_leakage=False)
+        df = make_v5_dataset(n=200, conversion_rate=0.30, include_leakage=False)
         checks = _check_conversion_rate(df)
         assert checks[0].passed
 
@@ -337,31 +265,31 @@ def test_rate_in_range_passes(self):
 
 class TestACVRange:
     def test_no_acv_column_skips(self):
-        df = _make_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"])
+        df = make_v5_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"])
         checks = _check_acv_range(df)
         assert checks[0].passed
         assert "skip" in checks[0].details
 
     def test_acv_all_nan_fails(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df["expected_acv"] = np.nan
         checks = _check_acv_range(df)
         assert not checks[0].passed
 
     def test_acv_below_floor_fails(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df.loc[0, "expected_acv"] = 1000  # way below 18k
         checks = _check_acv_range(df)
         assert not checks[0].passed
 
     def test_acv_above_cap_fails(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df.loc[0, "expected_acv"] = 200_000  # way above 120k
         checks = _check_acv_range(df)
         assert not checks[0].passed
 
     def test_acv_in_range_passes(self):
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         checks = _check_acv_range(df)
         assert checks[0].passed
 
@@ -414,8 +342,8 @@ def test_baseline_deterministic(self, good_csv):
 
     def test_k_larger_than_test_set_skipped(self, tmp_path):
         """If k > test set size, that k is skipped."""
-        df = _make_dataset(n=20, include_leakage=False)
-        path = _save(df, tmp_path)
+        df = make_v5_dataset(n=20, include_leakage=False)
+        path = save_csv(df, tmp_path)
         # ks=(25, 50) but test set is only ~6 rows
         report = validate_dataset(path, ValidationConfig(enforce_row_count=False))
         assert report.baseline is not None
@@ -438,8 +366,8 @@ def test_trap_detected(self, good_csv):
         assert tm.mean_delta_auc > 0
 
     def test_no_trap_columns_skips(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False)
-        path = _save(df, tmp_path)
+        df = make_v5_dataset(n=200, include_leakage=False)
+        path = save_csv(df, tmp_path)
         report = validate_dataset(path, ValidationConfig(enforce_row_count=False))
         trap_check = [c for c in report.checks if c.name.startswith("leakage_trap")]
         assert len(trap_check) == 1
@@ -448,11 +376,11 @@ def test_no_trap_columns_skips(self, tmp_path):
 
     def test_weak_trap_fails_checks(self, tmp_path):
         """A trap column with no signal should fail threshold checks."""
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         rng = np.random.RandomState(42)
         # Add a random column with no leakage signal
         df["__leakage__noise"] = rng.poisson(5, size=len(df))
-        path = _save(df, tmp_path)
+        path = save_csv(df, tmp_path)
         cfg = ValidationConfig(
             enforce_row_count=False,
             trap_mean_delta=0.05,  # high threshold
@@ -480,17 +408,17 @@ def test_value_metrics_computed(self, good_csv):
 
     def test_value_metrics_with_nan_acv(self, tmp_path):
         """NaN in expected_acv should not propagate NaN into value metrics."""
-        df = _make_dataset(n=200, include_leakage=False)
+        df = make_v5_dataset(n=200, include_leakage=False)
         df.loc[:9, "expected_acv"] = np.nan
-        path = _save(df, tmp_path)
+        path = save_csv(df, tmp_path)
         report = validate_dataset(path, ValidationConfig(enforce_row_count=False))
         for vm in report.value_metrics:
             assert not np.isnan(vm.captured_acv_by_prob)
             assert not np.isnan(vm.captured_acv_by_ev)
 
     def test_no_acv_column_returns_empty(self, tmp_path):
-        df = _make_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"])
-        path = _save(df, tmp_path)
+        df = make_v5_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"])
+        path = save_csv(df, tmp_path)
         report = validate_dataset(path, ValidationConfig(enforce_row_count=False))
         assert report.value_metrics == []