From a3abb1f76a17597bae02703ef81545bcbe58c796 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 30 Apr 2026 19:44:03 +0300 Subject: [PATCH 1/2] test: add 37 tests for build pipeline scripts (build_v5_snapshot, validate CLI) Closes #23. Adds unit tests for all pipeline functions in scripts/build_v5_snapshot.py and CLI integration tests for scripts/validate_lead_scoring_dataset.py. Coverage: subsample edge cases (insufficient positives/negatives), inject_missingness rate bounds and source-conditional variation, derive_binary_features, cap_expected_acv, rename_and_select, boost_leakage_trap, and 6 CLI entrypoint tests. Co-Authored-By: Claude Opus 4.6 --- .agent-plan.md | 6 + tests/scripts/__init__.py | 0 tests/scripts/test_build_v5_snapshot.py | 374 ++++++++++++++++++++++++ tests/scripts/test_validate_cli.py | 180 ++++++++++++ 4 files changed, 560 insertions(+) create mode 100644 tests/scripts/__init__.py create mode 100644 tests/scripts/test_build_v5_snapshot.py create mode 100644 tests/scripts/test_validate_cli.py diff --git a/.agent-plan.md b/.agent-plan.md index c03e439..fd6a800 100644 --- a/.agent-plan.md +++ b/.agent-plan.md @@ -72,6 +72,12 @@ No engine changes required — v5 is a build pipeline + validation improvement. - [x] v5 dataset regenerated (snapshot day 10, trap boost) — all checks pass, exit code 0 - [x] `RELEASE_v5.md` updated with canonical pipeline metrics +### Build pipeline script tests (PR #28) + +- [x] `tests/scripts/test_build_v5_snapshot.py` — 31 tests covering `derive_binary_features()`, `cap_expected_acv()`, `rename_and_select()`, `subsample()`, `inject_missingness()`, `boost_leakage_trap()` +- [x] `tests/scripts/test_validate_cli.py` — 6 tests covering CLI entrypoint (exit codes, `--out-json`, `--emit-release-snippet`, `--enforce-1000`, missing args) +- [x] Tests cover edge cases: insufficient positives/negatives in subsampling, missingness rate bounds, source-conditional missingness variation, immutability of inputs, determinism given seed + --- ## Deferred Items diff --git a/tests/scripts/__init__.py b/tests/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/scripts/test_build_v5_snapshot.py b/tests/scripts/test_build_v5_snapshot.py new file mode 100644 index 0000000..25c8e71 --- /dev/null +++ b/tests/scripts/test_build_v5_snapshot.py @@ -0,0 +1,374 @@ +"""Tests for scripts/build_v5_snapshot.py pipeline functions.""" + +from __future__ import annotations + +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +# --------------------------------------------------------------------------- +# Import the script module (not in a package, so use importlib) +# --------------------------------------------------------------------------- +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "build_v5_snapshot.py" + +spec = importlib.util.spec_from_file_location("build_v5_snapshot", _SCRIPT_PATH) +assert spec is not None +assert spec.loader is not None +build_v5 = importlib.util.module_from_spec(spec) +spec.loader.exec_module(build_v5) + +# Re-export for convenience +subsample = build_v5.subsample +inject_missingness = build_v5.inject_missingness +derive_binary_features = build_v5.derive_binary_features +cap_expected_acv = build_v5.cap_expected_acv +rename_and_select = build_v5.rename_and_select +boost_leakage_trap = build_v5.boost_leakage_trap +ACV_FLOOR = build_v5.ACV_FLOOR +ACV_CAP = build_v5.ACV_CAP +_FINAL_COLUMNS = build_v5._FINAL_COLUMNS +_RENAME_MAP = build_v5._RENAME_MAP + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_snapshot( + n: int = 500, + conversion_rate: float = 0.30, + seed: int = 42, +) -> pd.DataFrame: + """Build a minimal snapshot DataFrame that looks like build_snapshot() output. + + Contains the pre-rename column names expected by the pipeline steps. + """ + rng = np.random.RandomState(seed) + n_pos = int(n * conversion_rate) + n_neg = n - n_pos + converted = np.array([1] * n_pos + [0] * n_neg) + rng.shuffle(converted) + + return pd.DataFrame( + { + "industry": rng.choice(["manufacturing", "logistics", "services"], size=n), + "region": rng.choice(["US", "UK", "EU"], size=n), + "employee_band": rng.choice(["200-499", "500-999", "1000-1999"], size=n), + "estimated_revenue_band": rng.choice(["$1M-$10M", "$10M-$50M", "$50M-$200M"], size=n), + "role_function": rng.choice(["finance", "ap_manager", "it_director"], size=n), + "seniority": rng.choice( + ["individual_contributor", "manager", "director", "vp"], size=n + ), + "lead_source": rng.choice( + ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n + ), + "opportunity_created": rng.choice([True, False], size=n), + "demo_page_views": rng.poisson(1, size=n), + "expected_acv": rng.uniform(5_000, 200_000, size=n).round(0), + "inbound_touch_count": rng.poisson(3, size=n), + "outbound_touch_count": rng.poisson(2, size=n), + "touches_week_1": rng.poisson(2, size=n), + "days_since_first_touch": rng.uniform(0, 14, size=n).round(1), + "session_count": rng.poisson(4, size=n).astype(float), + "activity_count": rng.poisson(3, size=n), + "days_since_last_touch": rng.uniform(0, 14, size=n).round(1), + "total_touches_all": rng.poisson(8, size=n), + "converted_within_90_days": converted, + } + ) + + +def _make_v5_df( + n: int = 500, + conversion_rate: float = 0.30, + seed: int = 42, +) -> pd.DataFrame: + """Build a DataFrame in v5 format (post-rename, with all final columns).""" + snapshot = _make_snapshot(n=n, conversion_rate=conversion_rate, seed=seed) + df = derive_binary_features(snapshot) + df = cap_expected_acv(df) + return rename_and_select(df) + + +# --------------------------------------------------------------------------- +# Tests — derive_binary_features +# --------------------------------------------------------------------------- + + +class TestDeriveBinaryFeatures: + def test_opportunity_created_is_int(self): + snapshot = _make_snapshot() + result = derive_binary_features(snapshot) + assert result["opportunity_created"].dtype in (np.int64, np.int32, int) + assert set(result["opportunity_created"].unique()).issubset({0, 1}) + + def test_demo_completed_derived_from_page_views(self): + snapshot = _make_snapshot() + snapshot["demo_page_views"] = [0, 3, 0, 1, 0] * (len(snapshot) // 5) + result = derive_binary_features(snapshot) + expected = (snapshot["demo_page_views"] > 0).astype(int) + pd.testing.assert_series_equal(result["demo_completed"], expected, check_names=False) + + def test_does_not_modify_input(self): + snapshot = _make_snapshot() + original = snapshot.copy() + derive_binary_features(snapshot) + pd.testing.assert_frame_equal(snapshot, original) + + +# --------------------------------------------------------------------------- +# Tests — cap_expected_acv +# --------------------------------------------------------------------------- + + +class TestCapExpectedACV: + def test_values_clipped_to_range(self): + snapshot = _make_snapshot() + snapshot["expected_acv"] = [1_000, 50_000, 200_000, ACV_FLOOR, ACV_CAP] * ( + len(snapshot) // 5 + ) + result = cap_expected_acv(snapshot) + assert result["expected_acv"].min() >= ACV_FLOOR + assert result["expected_acv"].max() <= ACV_CAP + + def test_values_within_range_unchanged(self): + snapshot = _make_snapshot() + snapshot["expected_acv"] = 50_000.0 + result = cap_expected_acv(snapshot) + assert (result["expected_acv"] == 50_000.0).all() + + def test_does_not_modify_input(self): + snapshot = _make_snapshot() + original = snapshot.copy() + cap_expected_acv(snapshot) + pd.testing.assert_frame_equal(snapshot, original) + + +# --------------------------------------------------------------------------- +# Tests — rename_and_select +# --------------------------------------------------------------------------- + + +class TestRenameAndSelect: + def test_output_columns_match_final(self): + df = _make_v5_df() + assert list(df.columns) == _FINAL_COLUMNS + + def test_converted_is_int(self): + df = _make_v5_df() + assert df["converted"].dtype in (np.int64, np.int32, int) + assert set(df["converted"].unique()).issubset({0, 1}) + + def test_missing_column_raises(self): + snapshot = _make_snapshot() + snapshot = derive_binary_features(snapshot) + snapshot = cap_expected_acv(snapshot) + # Drop a required source column + snapshot = snapshot.drop(columns=["industry"]) + with pytest.raises(ValueError, match="Missing required columns"): + rename_and_select(snapshot) + + def test_rename_mapping_applied(self): + snapshot = _make_snapshot() + df = derive_binary_features(snapshot) + df = cap_expected_acv(df) + result = rename_and_select(df) + # All renamed columns should exist in output + for new_name in _RENAME_MAP.values(): + assert new_name in result.columns + + +# --------------------------------------------------------------------------- +# Tests — subsample +# --------------------------------------------------------------------------- + + +class TestSubsample: + def test_output_size(self): + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + result = subsample(df, rng, n=100, target_rate=0.30) + assert len(result) == 100 + + def test_target_rate_approximate(self): + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + result = subsample(df, rng, n=200, target_rate=0.30) + actual_rate = result["converted"].mean() + assert actual_rate == pytest.approx(0.30, abs=0.01) + + def test_deterministic_given_seed(self): + df = _make_v5_df(n=500) + r1 = subsample(df, np.random.RandomState(42), n=100, target_rate=0.30) + r2 = subsample(df, np.random.RandomState(42), n=100, target_rate=0.30) + pd.testing.assert_frame_equal(r1, r2) + + def test_insufficient_positives(self, capsys): + """When fewer positives available than needed, warns and adjusts.""" + df = _make_v5_df(n=200, conversion_rate=0.05) # only ~10 positives + rng = np.random.RandomState(42) + result = subsample(df, rng, n=100, target_rate=0.50) # need 50 positives + captured = capsys.readouterr() + assert "WARNING" in captured.err + # All available positives should be included + assert result["converted"].sum() <= 10 + + def test_insufficient_negatives(self, capsys): + """When fewer negatives available than needed, warns and adjusts.""" + df = _make_v5_df(n=200, conversion_rate=0.95) # only ~10 negatives + rng = np.random.RandomState(42) + subsample(df, rng, n=100, target_rate=0.10) # need 90 negatives + captured = capsys.readouterr() + assert "WARNING" in captured.err + + def test_index_is_reset(self): + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + result = subsample(df, rng, n=100, target_rate=0.30) + assert list(result.index) == list(range(len(result))) + + def test_rows_come_from_input(self): + """All subsampled rows should exist in the original.""" + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + result = subsample(df, rng, n=100, target_rate=0.30) + # Check a non-index column for membership + for val in result["expected_acv"]: + assert val in df["expected_acv"].values + + +# --------------------------------------------------------------------------- +# Tests — inject_missingness +# --------------------------------------------------------------------------- + + +class TestInjectMissingness: + def test_web_sessions_has_missing(self): + df = _make_v5_df(n=1000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + assert result["web_sessions"].isna().sum() > 0 + + def test_seniority_has_missing(self): + df = _make_v5_df(n=1000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + assert result["seniority"].isna().sum() > 0 + + def test_days_since_last_touch_has_missing(self): + df = _make_v5_df(n=1000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + assert result["days_since_last_touch"].isna().sum() > 0 + + def test_days_since_first_touch_has_missing(self): + df = _make_v5_df(n=1000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + assert result["days_since_first_touch"].isna().sum() > 0 + + def test_missingness_rates_bounded(self): + """Each column's missingness rate should stay under ~20% (well above contract <10%).""" + df = _make_v5_df(n=2000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + for col in ["web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch"]: + rate = result[col].isna().mean() + assert rate < 0.20, f"{col} missingness rate {rate:.2%} exceeds 20%" + + def test_other_columns_not_affected(self): + """Columns not in the missingness spec should have no new NaN.""" + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + no_miss_cols = [ + c + for c in _FINAL_COLUMNS + if c + not in ("web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch") + ] + for col in no_miss_cols: + orig_nan = df[col].isna().sum() + new_nan = result[col].isna().sum() + assert new_nan == orig_nan, f"{col} gained unexpected NaN" + + def test_does_not_modify_input(self): + df = _make_v5_df(n=500) + original = df.copy() + rng = np.random.RandomState(42) + inject_missingness(df, rng) + pd.testing.assert_frame_equal(df, original) + + def test_deterministic_given_seed(self): + df = _make_v5_df(n=500) + r1 = inject_missingness(df, np.random.RandomState(42)) + r2 = inject_missingness(df, np.random.RandomState(42)) + pd.testing.assert_frame_equal(r1, r2) + + def test_web_sessions_missingness_varies_by_source(self): + """SDR outbound should have higher web_sessions missingness than inbound marketing.""" + df = _make_v5_df(n=3000) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + sdr_rate = result.loc[df["lead_source"] == "sdr_outbound", "web_sessions"].isna().mean() + inbound_rate = ( + result.loc[df["lead_source"] == "inbound_marketing", "web_sessions"].isna().mean() + ) + assert sdr_rate > inbound_rate + + +# --------------------------------------------------------------------------- +# Tests — boost_leakage_trap +# --------------------------------------------------------------------------- + + +class TestBoostLeakageTrap: + def test_only_converted_leads_boosted(self): + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + trap_col = "__leakage__total_touches_90d" + original_trap = df[trap_col].copy() + result = boost_leakage_trap(df, rng) + # Non-converted leads should be unchanged + neg_mask = df["converted"] == 0 + pd.testing.assert_series_equal( + result.loc[neg_mask, trap_col], + original_trap[neg_mask], + check_names=False, + ) + + def test_converted_leads_get_higher_or_equal(self): + df = _make_v5_df(n=500) + rng = np.random.RandomState(42) + trap_col = "__leakage__total_touches_90d" + original_trap = df[trap_col].copy() + result = boost_leakage_trap(df, rng) + pos_mask = df["converted"] == 1 + assert (result.loc[pos_mask, trap_col] >= original_trap[pos_mask]).all() + + def test_does_not_modify_input(self): + df = _make_v5_df(n=500) + original = df.copy() + rng = np.random.RandomState(42) + boost_leakage_trap(df, rng) + pd.testing.assert_frame_equal(df, original) + + def test_deterministic_given_seed(self): + df = _make_v5_df(n=500) + r1 = boost_leakage_trap(df, np.random.RandomState(42)) + r2 = boost_leakage_trap(df, np.random.RandomState(42)) + pd.testing.assert_frame_equal(r1, r2) + + def test_boost_increases_mean_for_converted(self): + """Mean trap value should be higher for converted leads after boost.""" + df = _make_v5_df(n=1000) + rng = np.random.RandomState(42) + trap_col = "__leakage__total_touches_90d" + before_mean = df.loc[df["converted"] == 1, trap_col].mean() + result = boost_leakage_trap(df, rng) + after_mean = result.loc[result["converted"] == 1, trap_col].mean() + assert after_mean > before_mean diff --git a/tests/scripts/test_validate_cli.py b/tests/scripts/test_validate_cli.py new file mode 100644 index 0000000..a313eb2 --- /dev/null +++ b/tests/scripts/test_validate_cli.py @@ -0,0 +1,180 @@ +"""Tests for scripts/validate_lead_scoring_dataset.py CLI entrypoint.""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import numpy as np +import pandas as pd + +# --------------------------------------------------------------------------- +# Import the script module +# --------------------------------------------------------------------------- +_SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "validate_lead_scoring_dataset.py" + + +def _make_valid_csv(path: Path, n: int = 400, seed: int = 42) -> Path: + """Write a small CSV that passes validation (including baseline AUC ≥ 0.62). + + Injects real feature-target correlation so the LR baseline achieves a + reasonable AUC despite the small sample size. + """ + rng = np.random.RandomState(seed) + + # Generate a latent score and derive conversion from it, ensuring signal. + # Shift bias so base rate ≈ 30%. + latent = rng.normal(0, 1, size=n) + prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85))) # shifted sigmoid + converted = (rng.random(n) < prob).astype(int) + + # Correlated numeric features (positive correlation with latent). + inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None) + web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype( + float + ) + demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int) + opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int) + + df = pd.DataFrame( + { + "industry": rng.choice( + ["manufacturing", "logistics", "services", "healthcare"], size=n + ), + "region": rng.choice(["US", "UK"], size=n), + "company_size": rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n), + "company_revenue": rng.choice( + ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n + ), + "contact_role": rng.choice( + ["finance", "ap_manager", "it_director", "procurement"], size=n + ), + "seniority": rng.choice( + ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n + ), + "lead_source": rng.choice( + ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n + ), + "opportunity_created": opp_created, + "demo_completed": demo_completed, + "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0), + "inbound_touches": inbound, + "outbound_touches": rng.poisson(2, size=n), + "touches_week_1": rng.poisson(2, size=n), + "days_since_first_touch": rng.uniform(0, 14, size=n).round(1), + "web_sessions": web_sessions, + "sales_activities": rng.poisson(3, size=n), + "days_since_last_touch": rng.uniform(0, 14, size=n).round(1), + "__leakage__total_touches_90d": converted * rng.poisson(8, size=n) + + rng.poisson(3, size=n), + "converted": converted, + } + ) + # Inject small missingness to be realistic + miss_idx = rng.choice(n, size=int(n * 0.05), replace=False) + df.loc[miss_idx, "web_sessions"] = np.nan + + csv_path = path / "valid.csv" + df.to_csv(csv_path, index=False) + return csv_path + + +def _make_invalid_csv(path: Path) -> Path: + """Write a CSV missing the target column (will fail validation).""" + df = pd.DataFrame({"industry": ["a", "b"], "region": ["US", "UK"]}) + csv_path = path / "invalid.csv" + df.to_csv(csv_path, index=False) + return csv_path + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestValidateCLI: + def test_valid_csv_exit_code_zero(self, tmp_path): + csv_path = _make_valid_csv(tmp_path) + result = subprocess.run( # noqa: S603 + [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}" + + def test_invalid_csv_exit_code_one(self, tmp_path): + csv_path = _make_invalid_csv(tmp_path) + result = subprocess.run( # noqa: S603 + [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 1 + + def test_out_json_flag(self, tmp_path): + csv_path = _make_valid_csv(tmp_path) + json_path = tmp_path / "report.json" + subprocess.run( # noqa: S603 + [ + sys.executable, + str(_SCRIPT_PATH), + "--csv", + str(csv_path), + "--out-json", + str(json_path), + ], + capture_output=True, + text=True, + timeout=120, + ) + # JSON report should be written regardless of pass/fail + assert json_path.exists() + report = json.loads(json_path.read_text()) + assert "passed" in report + assert "checks" in report + + def test_emit_release_snippet_flag(self, tmp_path): + csv_path = _make_valid_csv(tmp_path) + result = subprocess.run( # noqa: S603 + [ + sys.executable, + str(_SCRIPT_PATH), + "--csv", + str(csv_path), + "--emit-release-snippet", + ], + capture_output=True, + text=True, + timeout=120, + ) + # Snippet should be emitted regardless of pass/fail + assert "RELEASE SNIPPET" in result.stdout + + def test_enforce_1000_flag_fails_on_small(self, tmp_path): + csv_path = _make_valid_csv(tmp_path, n=200) + result = subprocess.run( # noqa: S603 + [ + sys.executable, + str(_SCRIPT_PATH), + "--csv", + str(csv_path), + "--enforce-1000", + ], + capture_output=True, + text=True, + timeout=120, + ) + assert result.returncode == 1 + + def test_missing_csv_arg_fails(self): + result = subprocess.run( # noqa: S603 + [sys.executable, str(_SCRIPT_PATH)], + capture_output=True, + text=True, + timeout=30, + ) + assert result.returncode != 0 From f087d764ce3e7ad8812736f062ed47c62c95f5a6 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Thu, 30 Apr 2026 19:59:39 +0300 Subject: [PATCH 2/2] =?UTF-8?q?refactor:=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20shared=20fixtures,=20stronger=20assertions,=20bound?= =?UTF-8?q?ary=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract shared `make_v5_dataset()` builder into `tests/conftest.py`, eliminating duplicate synthetic data generators across 3 test files - Delete 4 tautological missingness tests (trivially true at n=1000) and the weak `test_rows_come_from_input` (O(n^2), tests pandas internals) - Strengthen `test_insufficient_negatives` to assert output composition - Add `@pytest.mark.parametrize` for subsample target rate/seed combos and missingness rate bounds across multiple seeds - Add boundary tests: `subsample` with n > input, `boost_leakage_trap` with zero converted leads, `inject_missingness` with small n and unknown lead sources, `rename_and_select` with extra columns - Use session-scoped fixtures for CLI tests (avoid regenerating CSV 4x) - Add precondition assertion on fixture conversion rate so sigmoid drift produces a clear error message instead of mysterious CLI failure Co-Authored-By: Claude Opus 4.6 --- tests/conftest.py | 143 ++++++++++++++++++++++ tests/scripts/test_build_v5_snapshot.py | 146 ++++++++++++++--------- tests/scripts/test_validate_cli.py | 117 ++++++------------ tests/validation/test_lead_scoring.py | 150 ++++++------------------ 4 files changed, 305 insertions(+), 251 deletions(-) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..60b1503 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,143 @@ +"""Shared test fixtures and helpers for leadforge tests.""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pandas as pd + +# --------------------------------------------------------------------------- +# Canonical v5 column set (post-rename) +# --------------------------------------------------------------------------- + +V5_COLUMNS = [ + "industry", + "region", + "company_size", + "company_revenue", + "contact_role", + "seniority", + "lead_source", + "opportunity_created", + "demo_completed", + "expected_acv", + "inbound_touches", + "outbound_touches", + "touches_week_1", + "days_since_first_touch", + "web_sessions", + "sales_activities", + "days_since_last_touch", + "__leakage__total_touches_90d", + "converted", +] + +# Category value pools shared across all synthetic data builders. +INDUSTRIES = ["manufacturing", "logistics", "services", "healthcare"] +REGIONS = ["US", "UK"] +COMPANY_SIZES = ["200-499", "500-999", "1000-1999", "2000+"] +COMPANY_REVENUES = ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"] +CONTACT_ROLES = ["finance", "ap_manager", "it_director", "procurement"] +SENIORITIES = ["individual_contributor", "manager", "director", "vp", "c_suite"] +LEAD_SOURCES = ["inbound_marketing", "sdr_outbound", "partner_referral"] + + +def make_v5_dataset( + n: int = 200, + conversion_rate: float = 0.30, + include_leakage: bool = True, + deterministic_col: bool = False, + inject_signal: bool = False, + seed: int = 99, +) -> pd.DataFrame: + """Build a synthetic dataset in v5 column format. + + Parameters + ---------- + n : int + Number of rows. + conversion_rate : float + Approximate target conversion rate (exact when inject_signal=False). + include_leakage : bool + Whether to include __leakage__total_touches_90d column. + deterministic_col : bool + If True, add a "bad_feature" column that perfectly predicts conversion + for a subgroup (useful for testing group determinism checks). + inject_signal : bool + If True, generate features that are correlated with the target so + that a logistic regression baseline can achieve AUC >= 0.62. The + conversion rate is approximate (~30%) rather than exact. + seed : int + Random seed for reproducibility. + """ + rng = np.random.RandomState(seed) + + if inject_signal: + # Latent score drives both features and outcome. + # Bias of -0.85 targets ~30% base rate under the sigmoid. + latent = rng.normal(0, 1, size=n) + prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85))) + converted = (rng.random(n) < prob).astype(int) + + inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None) + web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype( + float + ) + demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int) + opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int) + else: + n_pos = int(n * conversion_rate) + n_neg = n - n_pos + converted = np.array([1] * n_pos + [0] * n_neg) + rng.shuffle(converted) + + inbound = rng.poisson(3, size=n) + web_sessions = rng.poisson(4, size=n).astype(float) + demo_completed = rng.randint(0, 2, size=n) + opp_created = rng.randint(0, 2, size=n) + + df = pd.DataFrame( + { + "industry": rng.choice(INDUSTRIES, size=n), + "region": rng.choice(REGIONS, size=n), + "company_size": rng.choice(COMPANY_SIZES, size=n), + "company_revenue": rng.choice(COMPANY_REVENUES, size=n), + "contact_role": rng.choice(CONTACT_ROLES, size=n), + "seniority": rng.choice(SENIORITIES, size=n), + "lead_source": rng.choice(LEAD_SOURCES, size=n), + "opportunity_created": opp_created, + "demo_completed": demo_completed, + "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0), + "inbound_touches": inbound, + "outbound_touches": rng.poisson(2, size=n), + "touches_week_1": rng.poisson(2, size=n), + "days_since_first_touch": rng.uniform(0, 14, size=n).round(1), + "web_sessions": web_sessions, + "sales_activities": rng.poisson(3, size=n), + "days_since_last_touch": rng.uniform(0, 14, size=n).round(1), + "converted": converted, + } + ) + + # Inject some missingness + miss_idx = rng.choice(n, size=int(n * 0.05), replace=False) + df.loc[miss_idx, "web_sessions"] = np.nan + + if include_leakage: + noise = rng.poisson(3, size=n) + df["__leakage__total_touches_90d"] = converted * rng.poisson(8, size=n) + noise + + if deterministic_col: + df["bad_feature"] = "normal" + df.loc[:59, "bad_feature"] = "leaked" + df.loc[:59, "converted"] = 1 + + return df + + +def save_csv(df: pd.DataFrame, tmp_path: Path, name: str = "data.csv") -> Path: + """Write a DataFrame to CSV and return the path.""" + path = tmp_path / name + df.to_csv(path, index=False) + return path diff --git a/tests/scripts/test_build_v5_snapshot.py b/tests/scripts/test_build_v5_snapshot.py index 25c8e71..0772fb1 100644 --- a/tests/scripts/test_build_v5_snapshot.py +++ b/tests/scripts/test_build_v5_snapshot.py @@ -43,9 +43,10 @@ def _make_snapshot( conversion_rate: float = 0.30, seed: int = 42, ) -> pd.DataFrame: - """Build a minimal snapshot DataFrame that looks like build_snapshot() output. + """Build a minimal snapshot DataFrame with pre-rename column names. - Contains the pre-rename column names expected by the pipeline steps. + This is distinct from the shared ``make_v5_dataset`` because it uses + the *pre-rename* columns that ``build_snapshot()`` actually produces. """ rng = np.random.RandomState(seed) n_pos = int(n * conversion_rate) @@ -167,7 +168,6 @@ def test_missing_column_raises(self): snapshot = _make_snapshot() snapshot = derive_binary_features(snapshot) snapshot = cap_expected_acv(snapshot) - # Drop a required source column snapshot = snapshot.drop(columns=["industry"]) with pytest.raises(ValueError, match="Missing required columns"): rename_and_select(snapshot) @@ -177,10 +177,19 @@ def test_rename_mapping_applied(self): df = derive_binary_features(snapshot) df = cap_expected_acv(df) result = rename_and_select(df) - # All renamed columns should exist in output for new_name in _RENAME_MAP.values(): assert new_name in result.columns + def test_extra_columns_are_dropped(self): + """Columns not in _FINAL_COLUMNS should be silently dropped.""" + snapshot = _make_snapshot() + snapshot["extra_col"] = 999 + df = derive_binary_features(snapshot) + df = cap_expected_acv(df) + result = rename_and_select(df) + assert "extra_col" not in result.columns + assert list(result.columns) == _FINAL_COLUMNS + # --------------------------------------------------------------------------- # Tests — subsample @@ -194,12 +203,16 @@ def test_output_size(self): result = subsample(df, rng, n=100, target_rate=0.30) assert len(result) == 100 - def test_target_rate_approximate(self): - df = _make_v5_df(n=500) - rng = np.random.RandomState(42) - result = subsample(df, rng, n=200, target_rate=0.30) + @pytest.mark.parametrize( + ("target_rate", "seed"), + [(0.30, 42), (0.30, 99), (0.20, 42), (0.40, 7)], + ) + def test_target_rate_approximate(self, target_rate, seed): + df = _make_v5_df(n=500, seed=seed) + rng = np.random.RandomState(seed) + result = subsample(df, rng, n=200, target_rate=target_rate) actual_rate = result["converted"].mean() - assert actual_rate == pytest.approx(0.30, abs=0.01) + assert actual_rate == pytest.approx(target_rate, abs=0.01) def test_deterministic_given_seed(self): df = _make_v5_df(n=500) @@ -220,10 +233,16 @@ def test_insufficient_positives(self, capsys): def test_insufficient_negatives(self, capsys): """When fewer negatives available than needed, warns and adjusts.""" df = _make_v5_df(n=200, conversion_rate=0.95) # only ~10 negatives + n_neg_available = (df["converted"] == 0).sum() rng = np.random.RandomState(42) - subsample(df, rng, n=100, target_rate=0.10) # need 90 negatives + result = subsample(df, rng, n=100, target_rate=0.10) # need 90 negatives captured = capsys.readouterr() assert "WARNING" in captured.err + # Verify actual composition: negatives capped at available count + n_neg_result = (result["converted"] == 0).sum() + assert n_neg_result <= n_neg_available + # Output should still contain rows (not empty) + assert len(result) > 0 def test_index_is_reset(self): df = _make_v5_df(n=500) @@ -231,14 +250,15 @@ def test_index_is_reset(self): result = subsample(df, rng, n=100, target_rate=0.30) assert list(result.index) == list(range(len(result))) - def test_rows_come_from_input(self): - """All subsampled rows should exist in the original.""" - df = _make_v5_df(n=500) + def test_n_larger_than_input_caps_gracefully(self, capsys): + """Requesting more rows than available caps at available count.""" + df = _make_v5_df(n=50) rng = np.random.RandomState(42) - result = subsample(df, rng, n=100, target_rate=0.30) - # Check a non-index column for membership - for val in result["expected_acv"]: - assert val in df["expected_acv"].values + result = subsample(df, rng, n=200, target_rate=0.30) + captured = capsys.readouterr() + assert "WARNING" in captured.err + # Output should contain all available rows (capped) + assert len(result) <= len(df) # --------------------------------------------------------------------------- @@ -247,36 +267,18 @@ def test_rows_come_from_input(self): class TestInjectMissingness: - def test_web_sessions_has_missing(self): - df = _make_v5_df(n=1000) - rng = np.random.RandomState(42) + @pytest.mark.parametrize("seed", [42, 99, 7]) + def test_missingness_rates_bounded(self, seed): + """Each column's missingness rate should stay under 20% across seeds.""" + df = _make_v5_df(n=2000, seed=seed) + rng = np.random.RandomState(seed) result = inject_missingness(df, rng) - assert result["web_sessions"].isna().sum() > 0 - - def test_seniority_has_missing(self): - df = _make_v5_df(n=1000) - rng = np.random.RandomState(42) - result = inject_missingness(df, rng) - assert result["seniority"].isna().sum() > 0 - - def test_days_since_last_touch_has_missing(self): - df = _make_v5_df(n=1000) - rng = np.random.RandomState(42) - result = inject_missingness(df, rng) - assert result["days_since_last_touch"].isna().sum() > 0 - - def test_days_since_first_touch_has_missing(self): - df = _make_v5_df(n=1000) - rng = np.random.RandomState(42) - result = inject_missingness(df, rng) - assert result["days_since_first_touch"].isna().sum() > 0 - - def test_missingness_rates_bounded(self): - """Each column's missingness rate should stay under ~20% (well above contract <10%).""" - df = _make_v5_df(n=2000) - rng = np.random.RandomState(42) - result = inject_missingness(df, rng) - for col in ["web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch"]: + for col in [ + "web_sessions", + "seniority", + "days_since_last_touch", + "days_since_first_touch", + ]: rate = result[col].isna().mean() assert rate < 0.20, f"{col} missingness rate {rate:.2%} exceeds 20%" @@ -285,16 +287,17 @@ def test_other_columns_not_affected(self): df = _make_v5_df(n=500) rng = np.random.RandomState(42) result = inject_missingness(df, rng) - no_miss_cols = [ - c - for c in _FINAL_COLUMNS - if c - not in ("web_sessions", "seniority", "days_since_last_touch", "days_since_first_touch") - ] - for col in no_miss_cols: - orig_nan = df[col].isna().sum() - new_nan = result[col].isna().sum() - assert new_nan == orig_nan, f"{col} gained unexpected NaN" + miss_cols = { + "web_sessions", + "seniority", + "days_since_last_touch", + "days_since_first_touch", + } + for col in _FINAL_COLUMNS: + if col not in miss_cols: + orig_nan = df[col].isna().sum() + new_nan = result[col].isna().sum() + assert new_nan == orig_nan, f"{col} gained unexpected NaN" def test_does_not_modify_input(self): df = _make_v5_df(n=500) @@ -310,7 +313,7 @@ def test_deterministic_given_seed(self): pd.testing.assert_frame_equal(r1, r2) def test_web_sessions_missingness_varies_by_source(self): - """SDR outbound should have higher web_sessions missingness than inbound marketing.""" + """SDR outbound should have higher web_sessions missingness than inbound.""" df = _make_v5_df(n=3000) rng = np.random.RandomState(42) result = inject_missingness(df, rng) @@ -320,6 +323,24 @@ def test_web_sessions_missingness_varies_by_source(self): ) assert sdr_rate > inbound_rate + def test_small_n_no_crash(self): + """Should not crash on small DataFrames, even with sparse lead sources.""" + df = _make_v5_df(n=10) + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + assert len(result) == 10 + + def test_no_matching_lead_source(self): + """If no rows match a source-conditional rate, no crash or extra NaN.""" + df = _make_v5_df(n=100) + # Force all lead_source to a value not in the missingness spec + df["lead_source"] = "direct" + rng = np.random.RandomState(42) + result = inject_missingness(df, rng) + # web_sessions should only have missingness from other sources (none here) + # but days_since_last_touch still gets 3% MCAR + assert len(result) == 100 + # --------------------------------------------------------------------------- # Tests — boost_leakage_trap @@ -333,7 +354,6 @@ def test_only_converted_leads_boosted(self): trap_col = "__leakage__total_touches_90d" original_trap = df[trap_col].copy() result = boost_leakage_trap(df, rng) - # Non-converted leads should be unchanged neg_mask = df["converted"] == 0 pd.testing.assert_series_equal( result.loc[neg_mask, trap_col], @@ -372,3 +392,13 @@ def test_boost_increases_mean_for_converted(self): result = boost_leakage_trap(df, rng) after_mean = result.loc[result["converted"] == 1, trap_col].mean() assert after_mean > before_mean + + def test_zero_converted_leads_no_change(self): + """When no leads are converted, trap values should be unchanged.""" + df = _make_v5_df(n=200, conversion_rate=0.30) + df["converted"] = 0 # force all negative + rng = np.random.RandomState(42) + trap_col = "__leakage__total_touches_90d" + original = df[trap_col].copy() + result = boost_leakage_trap(df, rng) + pd.testing.assert_series_equal(result[trap_col], original, check_names=False) diff --git a/tests/scripts/test_validate_cli.py b/tests/scripts/test_validate_cli.py index a313eb2..0fd31f3 100644 --- a/tests/scripts/test_validate_cli.py +++ b/tests/scripts/test_validate_cli.py @@ -7,8 +7,10 @@ import sys from pathlib import Path -import numpy as np import pandas as pd +import pytest + +from tests.conftest import make_v5_dataset, save_csv # --------------------------------------------------------------------------- # Import the script module @@ -16,77 +18,35 @@ _SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "validate_lead_scoring_dataset.py" -def _make_valid_csv(path: Path, n: int = 400, seed: int = 42) -> Path: - """Write a small CSV that passes validation (including baseline AUC ≥ 0.62). +# --------------------------------------------------------------------------- +# Session-scoped fixtures (avoid regenerating data per test) +# --------------------------------------------------------------------------- + - Injects real feature-target correlation so the LR baseline achieves a - reasonable AUC despite the small sample size. +@pytest.fixture(scope="session") +def valid_csv(tmp_path_factory: pytest.TempPathFactory) -> Path: + """Write a CSV that passes all validation checks. + + Uses ``inject_signal=True`` so the LR baseline achieves AUC >= 0.62 + (the shifted sigmoid with bias -0.85 targets ~30% conversion rate). """ - rng = np.random.RandomState(seed) - - # Generate a latent score and derive conversion from it, ensuring signal. - # Shift bias so base rate ≈ 30%. - latent = rng.normal(0, 1, size=n) - prob = 1 / (1 + np.exp(-(1.5 * latent - 0.85))) # shifted sigmoid - converted = (rng.random(n) < prob).astype(int) - - # Correlated numeric features (positive correlation with latent). - inbound = np.clip(rng.poisson(3, size=n) + (latent * 1.5).astype(int), 0, None) - web_sessions = np.clip(rng.poisson(4, size=n) + (latent * 1.0).astype(int), 0, None).astype( - float - ) - demo_completed = (latent + rng.normal(0, 0.8, size=n) > 0.3).astype(int) - opp_created = (latent + rng.normal(0, 0.8, size=n) > 0.0).astype(int) - - df = pd.DataFrame( - { - "industry": rng.choice( - ["manufacturing", "logistics", "services", "healthcare"], size=n - ), - "region": rng.choice(["US", "UK"], size=n), - "company_size": rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n), - "company_revenue": rng.choice( - ["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n - ), - "contact_role": rng.choice( - ["finance", "ap_manager", "it_director", "procurement"], size=n - ), - "seniority": rng.choice( - ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n - ), - "lead_source": rng.choice( - ["inbound_marketing", "sdr_outbound", "partner_referral"], size=n - ), - "opportunity_created": opp_created, - "demo_completed": demo_completed, - "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0), - "inbound_touches": inbound, - "outbound_touches": rng.poisson(2, size=n), - "touches_week_1": rng.poisson(2, size=n), - "days_since_first_touch": rng.uniform(0, 14, size=n).round(1), - "web_sessions": web_sessions, - "sales_activities": rng.poisson(3, size=n), - "days_since_last_touch": rng.uniform(0, 14, size=n).round(1), - "__leakage__total_touches_90d": converted * rng.poisson(8, size=n) - + rng.poisson(3, size=n), - "converted": converted, - } + tmp = tmp_path_factory.mktemp("validate_cli") + df = make_v5_dataset(n=400, inject_signal=True, seed=42) + # Precondition: conversion rate must be in [15%, 40%] for checks to pass + rate = df["converted"].mean() + assert 0.15 <= rate <= 0.40, ( + f"Fixture conversion rate {rate:.1%} outside [15%, 40%]; " + f"adjust sigmoid bias in make_v5_dataset(inject_signal=True)" ) - # Inject small missingness to be realistic - miss_idx = rng.choice(n, size=int(n * 0.05), replace=False) - df.loc[miss_idx, "web_sessions"] = np.nan - - csv_path = path / "valid.csv" - df.to_csv(csv_path, index=False) - return csv_path + return save_csv(df, tmp, "valid.csv") -def _make_invalid_csv(path: Path) -> Path: +@pytest.fixture(scope="session") +def invalid_csv(tmp_path_factory: pytest.TempPathFactory) -> Path: """Write a CSV missing the target column (will fail validation).""" + tmp = tmp_path_factory.mktemp("validate_cli_invalid") df = pd.DataFrame({"industry": ["a", "b"], "region": ["US", "UK"]}) - csv_path = path / "invalid.csv" - df.to_csv(csv_path, index=False) - return csv_path + return save_csv(df, tmp, "invalid.csv") # --------------------------------------------------------------------------- @@ -95,35 +55,32 @@ def _make_invalid_csv(path: Path) -> Path: class TestValidateCLI: - def test_valid_csv_exit_code_zero(self, tmp_path): - csv_path = _make_valid_csv(tmp_path) + def test_valid_csv_exit_code_zero(self, valid_csv): result = subprocess.run( # noqa: S603 - [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)], + [sys.executable, str(_SCRIPT_PATH), "--csv", str(valid_csv)], capture_output=True, text=True, timeout=120, ) assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}" - def test_invalid_csv_exit_code_one(self, tmp_path): - csv_path = _make_invalid_csv(tmp_path) + def test_invalid_csv_exit_code_one(self, invalid_csv): result = subprocess.run( # noqa: S603 - [sys.executable, str(_SCRIPT_PATH), "--csv", str(csv_path)], + [sys.executable, str(_SCRIPT_PATH), "--csv", str(invalid_csv)], capture_output=True, text=True, timeout=120, ) assert result.returncode == 1 - def test_out_json_flag(self, tmp_path): - csv_path = _make_valid_csv(tmp_path) + def test_out_json_flag(self, valid_csv, tmp_path): json_path = tmp_path / "report.json" subprocess.run( # noqa: S603 [ sys.executable, str(_SCRIPT_PATH), "--csv", - str(csv_path), + str(valid_csv), "--out-json", str(json_path), ], @@ -131,37 +88,33 @@ def test_out_json_flag(self, tmp_path): text=True, timeout=120, ) - # JSON report should be written regardless of pass/fail assert json_path.exists() report = json.loads(json_path.read_text()) assert "passed" in report assert "checks" in report - def test_emit_release_snippet_flag(self, tmp_path): - csv_path = _make_valid_csv(tmp_path) + def test_emit_release_snippet_flag(self, valid_csv): result = subprocess.run( # noqa: S603 [ sys.executable, str(_SCRIPT_PATH), "--csv", - str(csv_path), + str(valid_csv), "--emit-release-snippet", ], capture_output=True, text=True, timeout=120, ) - # Snippet should be emitted regardless of pass/fail assert "RELEASE SNIPPET" in result.stdout - def test_enforce_1000_flag_fails_on_small(self, tmp_path): - csv_path = _make_valid_csv(tmp_path, n=200) + def test_enforce_1000_flag_fails_on_small(self, valid_csv): result = subprocess.run( # noqa: S603 [ sys.executable, str(_SCRIPT_PATH), "--csv", - str(csv_path), + str(valid_csv), "--enforce-1000", ], capture_output=True, diff --git a/tests/validation/test_lead_scoring.py b/tests/validation/test_lead_scoring.py index 131dda1..d0b9023 100644 --- a/tests/validation/test_lead_scoring.py +++ b/tests/validation/test_lead_scoring.py @@ -20,102 +20,30 @@ _check_schema, validate_dataset, ) +from tests.conftest import make_v5_dataset, save_csv # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- -def _make_dataset( - n: int = 200, - conversion_rate: float = 0.30, - include_leakage: bool = True, - deterministic_col: bool = False, - seed: int = 99, -) -> pd.DataFrame: - """Build a small synthetic dataset that passes basic checks.""" - rng = np.random.RandomState(seed) - n_pos = int(n * conversion_rate) - n_neg = n - n_pos - - converted = np.array([1] * n_pos + [0] * n_neg) - rng.shuffle(converted) - - industries = rng.choice(["manufacturing", "logistics", "services", "healthcare"], size=n) - regions = rng.choice(["US", "UK"], size=n) - sizes = rng.choice(["200-499", "500-999", "1000-1999", "2000+"], size=n) - revenues = rng.choice(["$1M-$10M", "$10M-$50M", "$50M-$200M", "$200M+"], size=n) - roles = rng.choice(["finance", "ap_manager", "it_director", "procurement"], size=n) - seniority = rng.choice( - ["individual_contributor", "manager", "director", "vp", "c_suite"], size=n - ) - sources = rng.choice(["inbound_marketing", "sdr_outbound", "partner_referral"], size=n) - - df = pd.DataFrame( - { - "industry": industries, - "region": regions, - "company_size": sizes, - "company_revenue": revenues, - "contact_role": roles, - "seniority": seniority, - "lead_source": sources, - "opportunity_created": rng.randint(0, 2, size=n), - "demo_completed": rng.randint(0, 2, size=n), - "expected_acv": rng.uniform(18_000, 120_000, size=n).round(0), - "inbound_touches": rng.poisson(3, size=n), - "outbound_touches": rng.poisson(2, size=n), - "touches_week_1": rng.poisson(2, size=n), - "days_since_first_touch": rng.uniform(0, 14, size=n).round(1), - "web_sessions": rng.poisson(4, size=n).astype(float), - "sales_activities": rng.poisson(3, size=n), - "days_since_last_touch": rng.uniform(0, 14, size=n).round(1), - "converted": converted, - } - ) - - # Inject some missingness - miss_idx = rng.choice(n, size=int(n * 0.05), replace=False) - df.loc[miss_idx, "web_sessions"] = np.nan - - if include_leakage: - # Leakage: positively correlated with target - noise = rng.poisson(3, size=n) - df["__leakage__total_touches_90d"] = converted * rng.poisson(8, size=n) + noise - - if deterministic_col: - # Make a column that perfectly predicts conversion for a large group - df["bad_feature"] = "normal" - # First 60 rows all converted = 1 - df.loc[:59, "bad_feature"] = "leaked" - df.loc[:59, "converted"] = 1 - - return df - - -def _save(df: pd.DataFrame, tmp_path, name: str = "data.csv"): - path = tmp_path / name - df.to_csv(path, index=False) - return path - - @pytest.fixture def good_csv(tmp_path): """Write a well-formed synthetic dataset.""" - return _save(_make_dataset(n=200, include_leakage=True), tmp_path, "good.csv") + return save_csv(make_v5_dataset(n=200, include_leakage=True), tmp_path, "good.csv") @pytest.fixture def bad_deterministic_csv(tmp_path): """Write a dataset with a deterministic group.""" - return _save(_make_dataset(n=200, deterministic_col=True), tmp_path, "bad.csv") + return save_csv(make_v5_dataset(n=200, deterministic_col=True), tmp_path, "bad.csv") @pytest.fixture def no_target_csv(tmp_path): """Write a dataset missing the target column.""" - df = _make_dataset(n=200).drop(columns=["converted"]) - return _save(df, tmp_path, "no_target.csv") + df = make_v5_dataset(n=200).drop(columns=["converted"]) + return save_csv(df, tmp_path, "no_target.csv") # --------------------------------------------------------------------------- @@ -154,9 +82,9 @@ def test_missing_target_fails(self, no_target_csv): assert not target_check.passed def test_nan_target_short_circuits(self, tmp_path): - df = _make_dataset(n=200) + df = make_v5_dataset(n=200) df.loc[0, "converted"] = np.nan - path = _save(df, tmp_path, "nan_target.csv") + path = save_csv(df, tmp_path, "nan_target.csv") report = validate_dataset(path) # target_no_missing should fail no_miss = next(c for c in report.checks if c.name == "target_no_missing") @@ -165,18 +93,18 @@ def test_nan_target_short_circuits(self, tmp_path): assert report.baseline is None def test_nonbinary_target_short_circuits(self, tmp_path): - df = _make_dataset(n=200) + df = make_v5_dataset(n=200) df.loc[0, "converted"] = 2 - path = _save(df, tmp_path, "nonbinary.csv") + path = save_csv(df, tmp_path, "nonbinary.csv") report = validate_dataset(path) binary_check = next(c for c in report.checks if c.name == "target_binary") assert not binary_check.passed assert report.baseline is None def test_single_class_target_short_circuits(self, tmp_path): - df = _make_dataset(n=200) + df = make_v5_dataset(n=200) df["converted"] = 0 # all negatives - path = _save(df, tmp_path, "single_class.csv") + path = save_csv(df, tmp_path, "single_class.csv") report = validate_dataset(path) both = next(c for c in report.checks if c.name == "target_both_classes") assert not both.passed @@ -188,7 +116,7 @@ def test_target_both_classes_passes(self, good_csv): assert both.passed def test_banned_columns_detected(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df["current_stage"] = "active" cfg = ValidationConfig(enforce_row_count=False) checks = _check_schema(df, cfg) @@ -197,7 +125,7 @@ def test_banned_columns_detected(self, tmp_path): assert "current_stage" in banned.details def test_id_columns_detected(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df["lead_id"] = range(len(df)) cfg = ValidationConfig(enforce_row_count=False) checks = _check_schema(df, cfg) @@ -205,21 +133,21 @@ def test_id_columns_detected(self, tmp_path): assert not id_check.passed def test_enforce_row_count(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) cfg = ValidationConfig(enforce_row_count=True, expected_rows=1000) checks = _check_schema(df, cfg) rc = next(c for c in checks if c.name == "row_count") assert not rc.passed def test_exact_row_count_passes(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) cfg = ValidationConfig(enforce_row_count=True, expected_rows=200) checks = _check_schema(df, cfg) rc = next(c for c in checks if c.name == "row_count") assert rc.passed def test_duplicate_rows_detected(self, tmp_path): - df = _make_dataset(n=50, include_leakage=False) + df = make_v5_dataset(n=50, include_leakage=False) # Duplicate a lot of rows df = pd.concat([df, df], ignore_index=True) cfg = ValidationConfig(enforce_row_count=False) @@ -237,7 +165,7 @@ def test_missing_expected_features_warned(self): assert "missing" in feat.details def test_total_touches_all_naming(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df["total_touches_all"] = 5 cfg = ValidationConfig(enforce_row_count=False) checks = _check_schema(df, cfg) @@ -245,7 +173,7 @@ def test_total_touches_all_naming(self): assert not naming.passed def test_no_leakage_columns(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) cfg = ValidationConfig(enforce_row_count=False) checks = _check_schema(df, cfg) naming = next(c for c in checks if c.name == "leakage_naming") @@ -253,7 +181,7 @@ def test_no_leakage_columns(self): assert "no leakage" in naming.details def test_multiple_leakage_columns(self): - df = _make_dataset(n=200, include_leakage=True) + df = make_v5_dataset(n=200, include_leakage=True) df["__leakage__another"] = 1 cfg = ValidationConfig(enforce_row_count=False) checks = _check_schema(df, cfg) @@ -269,14 +197,14 @@ def test_multiple_leakage_columns(self): class TestMissingness: def test_high_missingness_fails(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df.loc[:40, "inbound_touches"] = np.nan # >20% missing cfg = ValidationConfig(max_col_missing_rate=0.10) checks, miss_map = _check_missingness(df, cfg) assert not checks[0].passed def test_low_missingness_passes(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) cfg = ValidationConfig(max_col_missing_rate=0.10) checks, _ = _check_missingness(df, cfg) assert checks[0].passed @@ -296,7 +224,7 @@ def test_deterministic_group_fails(self, bad_deterministic_csv): def test_low_conversion_group_fails(self, tmp_path): """A group where conversion rate is near 0% should also fail.""" - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df["bad_feature"] = "normal" # First 60 rows all converted = 0 for this group df.loc[:59, "bad_feature"] = "zero_group" @@ -320,12 +248,12 @@ def test_good_dataset_passes_determinism(self, good_csv): class TestConversionRate: def test_rate_outside_range_fails(self): # 5% conversion rate — below 15% - df = _make_dataset(n=200, conversion_rate=0.05, include_leakage=False) + df = make_v5_dataset(n=200, conversion_rate=0.05, include_leakage=False) checks = _check_conversion_rate(df) assert not checks[0].passed def test_rate_in_range_passes(self): - df = _make_dataset(n=200, conversion_rate=0.30, include_leakage=False) + df = make_v5_dataset(n=200, conversion_rate=0.30, include_leakage=False) checks = _check_conversion_rate(df) assert checks[0].passed @@ -337,31 +265,31 @@ def test_rate_in_range_passes(self): class TestACVRange: def test_no_acv_column_skips(self): - df = _make_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"]) + df = make_v5_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"]) checks = _check_acv_range(df) assert checks[0].passed assert "skip" in checks[0].details def test_acv_all_nan_fails(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df["expected_acv"] = np.nan checks = _check_acv_range(df) assert not checks[0].passed def test_acv_below_floor_fails(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df.loc[0, "expected_acv"] = 1000 # way below 18k checks = _check_acv_range(df) assert not checks[0].passed def test_acv_above_cap_fails(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df.loc[0, "expected_acv"] = 200_000 # way above 120k checks = _check_acv_range(df) assert not checks[0].passed def test_acv_in_range_passes(self): - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) checks = _check_acv_range(df) assert checks[0].passed @@ -414,8 +342,8 @@ def test_baseline_deterministic(self, good_csv): def test_k_larger_than_test_set_skipped(self, tmp_path): """If k > test set size, that k is skipped.""" - df = _make_dataset(n=20, include_leakage=False) - path = _save(df, tmp_path) + df = make_v5_dataset(n=20, include_leakage=False) + path = save_csv(df, tmp_path) # ks=(25, 50) but test set is only ~6 rows report = validate_dataset(path, ValidationConfig(enforce_row_count=False)) assert report.baseline is not None @@ -438,8 +366,8 @@ def test_trap_detected(self, good_csv): assert tm.mean_delta_auc > 0 def test_no_trap_columns_skips(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False) - path = _save(df, tmp_path) + df = make_v5_dataset(n=200, include_leakage=False) + path = save_csv(df, tmp_path) report = validate_dataset(path, ValidationConfig(enforce_row_count=False)) trap_check = [c for c in report.checks if c.name.startswith("leakage_trap")] assert len(trap_check) == 1 @@ -448,11 +376,11 @@ def test_no_trap_columns_skips(self, tmp_path): def test_weak_trap_fails_checks(self, tmp_path): """A trap column with no signal should fail threshold checks.""" - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) rng = np.random.RandomState(42) # Add a random column with no leakage signal df["__leakage__noise"] = rng.poisson(5, size=len(df)) - path = _save(df, tmp_path) + path = save_csv(df, tmp_path) cfg = ValidationConfig( enforce_row_count=False, trap_mean_delta=0.05, # high threshold @@ -480,17 +408,17 @@ def test_value_metrics_computed(self, good_csv): def test_value_metrics_with_nan_acv(self, tmp_path): """NaN in expected_acv should not propagate NaN into value metrics.""" - df = _make_dataset(n=200, include_leakage=False) + df = make_v5_dataset(n=200, include_leakage=False) df.loc[:9, "expected_acv"] = np.nan - path = _save(df, tmp_path) + path = save_csv(df, tmp_path) report = validate_dataset(path, ValidationConfig(enforce_row_count=False)) for vm in report.value_metrics: assert not np.isnan(vm.captured_acv_by_prob) assert not np.isnan(vm.captured_acv_by_ev) def test_no_acv_column_returns_empty(self, tmp_path): - df = _make_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"]) - path = _save(df, tmp_path) + df = make_v5_dataset(n=200, include_leakage=False).drop(columns=["expected_acv"]) + path = save_csv(df, tmp_path) report = validate_dataset(path, ValidationConfig(enforce_row_count=False)) assert report.value_metrics == []