From 159757d8fa96e62a10a749804b34f682d69478c0 Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Wed, 1 Jul 2026 07:22:21 +0200
Subject: [PATCH 1/7] Add CodSpeed perf-regression benchmark suite

Nine modules over the duckdb-python binding hot paths: fetch (OUT-row),
arrow, pandas, produce_numpy (df/fetchnumpy columnar), ingest_native
(values/executemany), ingest_numpy (numpy scan + analyzer bind), udf (native +
vectorized arrow), types_roundtrip (type x direction matrix), cardinality
(LIMIT sweep). Full-consume discipline, warmup, real-null gotchas, tracemalloc
memory guard. See benchmarks/PLAN.md. Standalone (not yet wired into CI).
---
 benchmarks/PLAN.md                      | 178 +++++++++++++++++++++++
 benchmarks/test_arrow_perf.py           | 119 ++++++++++++++++
 benchmarks/test_cardinality_perf.py     |  63 +++++++++
 benchmarks/test_fetch_perf.py           | 141 ++++++++++++++++++
 benchmarks/test_ingest_native_perf.py   |  93 ++++++++++++
 benchmarks/test_ingest_numpy_perf.py    | 109 ++++++++++++++
 benchmarks/test_pandas_perf.py          | 133 +++++++++++++++++
 benchmarks/test_produce_numpy_perf.py   | 181 ++++++++++++++++++++++++
 benchmarks/test_types_roundtrip_perf.py |  78 ++++++++++
 benchmarks/test_udf_perf.py             | 110 ++++++++++++++
 10 files changed, 1205 insertions(+)
 create mode 100644 benchmarks/PLAN.md
 create mode 100644 benchmarks/test_arrow_perf.py
 create mode 100644 benchmarks/test_cardinality_perf.py
 create mode 100644 benchmarks/test_fetch_perf.py
 create mode 100644 benchmarks/test_ingest_native_perf.py
 create mode 100644 benchmarks/test_ingest_numpy_perf.py
 create mode 100644 benchmarks/test_pandas_perf.py
 create mode 100644 benchmarks/test_produce_numpy_perf.py
 create mode 100644 benchmarks/test_types_roundtrip_perf.py
 create mode 100644 benchmarks/test_udf_perf.py

diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
new file mode 100644
index 00000000..c04f4801
--- /dev/null
+++ b/benchmarks/PLAN.md
@@ -0,0 +1,178 @@
+# CodSpeed Benchmark Suite Plan — duckdb-python binding hot paths
+
+Grounded in the binding source on `perf/codspeed` (`src/`). File:line citations are to this tree.
+
+## 0. Conventions (from the existing 3 modules, keep these)
+
+- Function-scoped `con` fixture; module-scoped input-data fixtures.
+- READ = aggregate over the actual column values — `SELECT sum(col)` for numerics, `SELECT sum(length(col))` for strings (never `count(*)`, which is answered from metadata).
+- WRITE = eager materialize or fully drain the lazy reader.
+- Warm the engine once (`con.execute(query).fetchall()`) before `benchmark(...)` so first-call import-cache population is not charged to the measured region.
+- Pin numpy/pandas/pyarrow/polars identically across A/B so deltas are pure binding cost.
+
+Ranking: **P0** = on a known regression path or the cutover-reworked code (narrow-numeric common case); **P1** = high-traffic conversion / per-element Python work; **P2** = correctness-relevant, lower traffic or engine-dominated.
+
+## (a) Prioritized scenarios
+
+### PRODUCE (duckdb -> external) — highest regression risk
+
+Row path: `DuckDBPyResult::Fetchone` (`src/pyresult.cpp:126-151`) builds a `PyUtil::TupleBuilder` (`src/include/duckdb_python/pyutil.hpp:101-125`) per row and calls `PythonObject::FromValue` (`src/native/python_objects.cpp:474`) per cell. O(rows x cols). This is the shape of the historical ~15% fetchall regression.
+
+| # | Scenario | SQL / setup | Measures | Pri |
+|---|----------|-------------|----------|-----|
+| P0-1 | fetchall int64 1col | `SELECT i::BIGINT a FROM range(1_000_000)` | TupleBuilder + FromValue int (`python_objects.cpp:489`) | P0 |
+| P0-2 | fetchall int 2-4col | `SELECT i::BIGINT,(i+1)::BIGINT,(i*2)::INTEGER FROM range(1_000_000)` | TupleBuilder scaling w/ col count | P0 |
+| P0-3 | fetchall double | `SELECT (i*1.5)::DOUBLE FROM range(1_000_000)` | FromValue double | P0 |
+| P0-4 | fetchall varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | FromValue VARCHAR string copy (`python_objects.cpp:515`) | P1 |
+| P0-5 | fetchone loop (overhead) | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(100_000)` | per-call Fetchone + chunk-boundary FetchNext + GIL cycle | P0 |
+| P0-6 | fetchmany batched | as P0-5, `fetchmany(10_000)` loop | Fetchmany loop | P1 |
+| P1-7 | **df() numeric (reworked)** | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(1_000_000)` | FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular, `HAS_NULLS=false/PANDAS=true` branch (`array_wrapper.cpp:415-425`) | P0 |
+| P1-8 | **df() numeric WITH NULLS** | `SELECT CASE WHEN i%10=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | `HAS_NULLS=true` + masked_array build (`array_wrapper.cpp:743-757`) + masked->pd.NA rewrite (`pyresult.cpp:362-393`) | P0 |
+| P1-9 | fetchnumpy numeric | as P1-7 | FetchNumpyInternal without the DataFrame wrap | P1 |
+| P1-10 | df() varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | StringConvert PyUnicode_FromStringAndSize per row (`array_wrapper.cpp:164-181`) | P1 |
+| P1-11 | df() timestamp | `SELECT TIMESTAMP '2020-01-01'+(i*INTERVAL 1 SECOND) FROM range(1_000_000)` | TimestampConvertNano + ConvertDateTimeTypes (`pyresult.cpp:299`) | P1 |
+| P1-13 | to_record_batch_reader drained | `range(1_000_000)`, `to_record_batch_reader(100_000)` | lazy stream (`pyresult.cpp:573`), iterate + sum num_rows | P1 |
+| P2-15 | torch()/tf() numeric | `range(500_000)` | FetchNumpyInternal + per-col from_numpy (`pyresult.cpp:405-421`) | P2 |
+| P2-16 | fetch_df_chunk | large query, loop `fetch_df_chunk()` | FetchDFChunk per chunk (`pyresult.cpp:400`) | P2 |
+| P1-17 | fetchall LIST<int> | `SELECT [i,i+1,i+2] FROM range(200_000)` | FromValue LIST recursion (`python_objects.cpp:651`) | P1 |
+| P1-18 | fetchall STRUCT | `SELECT {'a':i,'b':i+1} FROM range(200_000)` | FromStruct dict build (`python_objects.cpp:390-414`) | P1 |
+| P1-20 | fetchall DECIMAL | `SELECT (i::DECIMAL(18,3))/1000 FROM range(200_000)` | Python `Decimal()(val.ToString())` per row (`python_objects.cpp:507`) | P1 |
+| P1-21 | fetchall TIMESTAMPTZ | `SELECT (TIMESTAMPTZ '2020-01-01'+(i*INTERVAL 1 SECOND)) FROM range(100_000)` | pytz localize+astimezone per row (`python_objects.cpp:567-573`) | P1 |
+| P2-22 | fetchall NULL-heavy | `SELECT CASE WHEN i%2=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | validity branch + nb::none (`pyresult.cpp:142`) | P2 |
+| P2-23 | fetchall BLOB | `SELECT ('blob_'||i)::BLOB FROM range(200_000)` | nb::bytes (`python_objects.cpp:517`) | P2 |
+
+### INGEST (external -> duckdb)
+
+| # | Scenario | Setup | Path | Pri |
+|---|----------|-------|------|-----|
+| I0-1 | **pandas numpy int64/double** | DataFrame 1M | NumpyScan::Scan ScanNumpyMasked zero-copy when stride==sizeof(T); double NaN->NULL loop (`numpy_scan.cpp:76-112,236-246`) reworked | P0 |
+| I0-2 | **pandas numpy object-string** | `pd.array(strings,dtype=object)` 500k | NumpyScan STRING/OBJECT: per-row isinstance, PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode transcode (`numpy_scan.cpp:353-452`) reworked | P0 |
+| I1-3 | pandas object bind-time analyzer | object col 100k+ | Pandas::Bind -> PandasAnalyzer::Analyze samples rows GetItemType ladder (`analyzer.cpp:356-460`). Per-BIND overhead, independent of rows (count(*) ok here) | P1 |
+| I1-4 | pandas arrow-backed | pd.ArrowDtype 1M | ToArrowTable -> arrow scan (`pyconnection.cpp:1799`) | P1 |
+| I0-5 | arrow Table | 1M | CreateArrowScan PythonTableArrowArrayStreamFactory near-zero-copy (`python_replacement_scan.cpp:55-83`) | P1 |
+| I1-6 | arrow RecordBatchReader | from_batches | same factory, streaming (distinct from Table) | P1 |
+| I1-7 | polars DataFrame | 1M | entry.to_arrow() one-time + arrow scan (`replacement_scan.cpp:150-156`) | P2 |
+| I1-8 | numpy ndarray + dict-of-arrays | np.arange | replacement scan -> pandas_scan (`replacement_scan.cpp:163-200`) | P2 |
+| I1-9 | **native values() list-of-tuples** | `con.values([(i,i*1.5,'s') for i in range(100_000)])` | Values -> TransformPythonValue per cell, GetPythonObjectType ladder (`python_conversion.cpp:402-454,1075`) | P1 |
+| I1-10 | native list-of-dicts | list of dicts | TransformDictionaryToStruct recursion (`python_conversion.cpp:119`) | P2 |
+| I1-11 | executemany params | INSERT ?,?  100k sets | ExecuteMany loop, TransformPythonValue per set (`pyconnection.cpp:500-544`) | P2 |
+| I2-12 | read_parquet/csv/json | a file | arg marshal -> TableFunction under GIL-release; engine-dominated | P2 |
+
+### UDF (`src/python_udf.cpp`) — zero coverage today
+
+| # | Scenario | Setup | Path | Pri |
+|---|----------|-------|------|-----|
+| U0-1 | **scalar native 1 int arg** | `def f(x):return x+1`, `SELECT sum(f(i::BIGINT)) FROM range(1_000_000)` | per-row TupleBuilder args + PyObject_CallObject + TransformPythonObject result (`python_udf.cpp:320-384`) | P0 |
+| U0-2 | scalar native 2-3 args | `def f(a,b):return a+b` 2 cols 1M | arg-tuple scaling | P1 |
+| U1-3 | scalar native string | `def f(s):return s.upper()` 500k | VARCHAR in + string out | P1 |
+| U1-4 | scalar native NULL inputs | 50% NULL, DEFAULT handling | SetNull short-circuit (`python_udf.cpp:340-350`) | P1 |
+| U1-6 | **vectorized arrow UDF** | `type='arrow'` pc.add 1M | ConvertDataChunkToPyArrowTable + call + ConvertArrowTableToVector cast (`python_udf.cpp:33-144,225`) | P0 |
+| U2-7 | vectorized NULL slicing | DEFAULT + nulls | selvec compaction/reconstruction (`python_udf.cpp:197-305`) | P2 |
+
+## (b) Type x direction matrix
+
+Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), OUT-arrow.
+
+| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow |
+|------|-----------|----------|---------|---------|-----------|
+| int32/int64 | P1 | **P0** | **P0** | **P0** | P1 |
+| double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 |
+| varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 |
+| bool | P2 | P1 | P2 | P1 | P2 |
+| decimal | P2 | n/a | **P1** (Python Decimal) | P1 | P2 |
+| date | P2 | P1 | P1 | P1 | P2 |
+| timestamp | P1 | **P1** | P1 | P1 | P1 |
+| timestamptz | P2 | P1 | **P1** (pytz/row) | P1 | P2 |
+| time/interval | P2 | P1 | P1 | P1 | P2 |
+| LIST/ARRAY | P2 | P2 | P1 (recursive) | P1 | P2 |
+| STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 |
+| MAP | P2 | P2 | P2 | P2 | P2 |
+| blob | P2 | P2 | P2 | P2 | P2 |
+| NULL-heavy | - | **P1** | P2 | **P0** (masked_array) | P1 |
+| enum/category | - | P1 | P1 | P1 | P2 |
+
+Minimum viable to ship: int64, double, varchar, timestamp, decimal, LIST, STRUCT, NULL-heavy in OUT-row and OUT-col; int64/double/varchar in IN-numpy.
+
+## (c) Gaps vs the existing 3 modules
+
+Covered well: OUT-row narrow numeric, OUT-arrow/polars numeric+string, pandas IN/OUT numpy-vs-arrow numeric+string, fetchone-loop numeric.
+
+Missing:
+1. **PRODUCE columnar reworked path under-covered** — df() only 500k, only numeric/string, never with NULLS (the masked-array branch is exactly what changed). Add df-with-nulls, fetchnumpy, df-timestamp.
+2. **UDFs: zero coverage** — whole subsystem (python_udf.cpp), native per-row is the single biggest untested per-call-overhead path. Add U0-1/U0-2/U1-3/4/U1-6.
+3. **Native Python ingest: zero coverage** — values()/list-of-tuples/list-of-dicts/executemany via TransformPythonValue. Add I1-9/10/11.
+4. **Expensive scalar OUT-row types untested** — decimal, timestamptz, interval, isolated LIST/STRUCT/MAP. Add P1-17..21.
+5. **Object-column bind-time analyzer untested** — PandasAnalyzer sampling, per-bind cost. Add I1-3.
+6. **Size regimes thin** — add 1M throughput AND 1-row overhead variants.
+7. **Arrow ingest only pa.table** — add RecordBatchReader, polars, numpy-ndarray ingest.
+8. **NULL-heavy IN-numpy untested** (ScanNumpyMasked + ApplyMask).
+
+## (d) Suite organization + CodSpeed mechanics
+
+```
+benchmarks/
+  test_fetch_perf.py            # EXISTING — OUT-row. Add: nested, decimal, timestamptz, null-heavy, 1M+1-row
+  test_arrow_perf.py            # EXISTING — add RecordBatchReader ingest, materialized vs stream
+  test_pandas_perf.py           # EXISTING — add df()-with-nulls, datetime, fetchnumpy, analyzer bind
+  test_produce_numpy_perf.py    # NEW — df()/fetchnumpy/fetch_df_chunk reworked columnar, per-type, null vs no-null
+  test_ingest_native_perf.py    # NEW — values()/list-of-tuples/list-of-dicts/executemany
+  test_ingest_numpy_perf.py     # NEW — numpy ndarray / object-string scan / analyzer bind
+  test_udf_perf.py              # NEW — scalar native + vectorized arrow UDFs
+  test_types_roundtrip_perf.py  # NEW — type x direction matrix sweep, parametrized
+```
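+One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable). The cross-cutting result-cardinality (LIMIT) sweep adopted in section (e) lives in its own module, `test_cardinality_perf.py`, which is not shown in the tree above.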
+
+### Walltime vs instruction-count
+
+- **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`.
+- **CI gate: instruction-count / simulation (Linux + Callgrind)**, deterministic — gate PRs with this.
+
+Instruction-count is ideal for, AND should gate, the GIL-held single-threaded overhead paths: fetchone loop, fetchall/fetchmany, native UDF per-call, native values() ingest, analyzer bind, all per-element converters (FromValue, TransformPythonValue, NumpyScan object/string, ArrayWrapper fill). The historical fetchall regression would be caught cleanly here.
+
+Noisy under instruction-count — keep walltime-only, informational, do NOT hard-gate:
+- to_arrow_table / pl() on materialized results: PromoteMaterializedToArrow re-runs the query in parallel with the GIL released (`pyresult.cpp:450-477`).
+- Large 1M+ SELECT sum() ingest reads: engine parallel aggregate dominates.
+- read_csv/parquet/json: engine + I/O dominated.
+- GIL-per-chunk streaming (FetchNextRaw, to_record_batch_reader drain).
+
+Gate tactic: pair each large-throughput scenario with a small/1-row variant (e.g. fetchall range(1_000_000) walltime + fetchall range(2048) instruction-count gate) so binding fixed-cost is measured noise-free.
+
+### Two code-grounded gotchas
+- **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing.
+- **IN-numpy string benchmarks need mixed ASCII + non-ASCII + a NaN/pd.NA/None sentinel**: the scan zero-copies compact-ASCII (`numpy_scan.cpp:416-418`) but transcodes otherwise (`numpy_scan.cpp:429-446`); ASCII-only misses the transcode + null-detection ladder.
+
+## (e) Cross-check vs iqmo-org/bareduckdb
+
+Source read live from `iqmo-org/bareduckdb` `main`, subdir `benchmark/` (GitHub API + raw files).
+
+### What their suite covers / how it is organized
+
+A **SQL-file-driven A/B harness comparing two clients** — production `duckdb` vs `bareduckdb` (the C-API / free-threading prototype) — not a binding micro-bench.
+
+- `benchmark.py` orchestrates: discovers `cases/**/*.sql`, picks the matching `data/DATA*` dir, and runs each `(sql x parquet-file x db_mode)` as a fresh `uv run run_case.py` **subprocess**. `DBMODES=[duckdb, bareduckdb_capsule, bareduckdb_arrow]`; active `READ_MODES=[arrow_table]` (parquet/arrow_reader present but off).
+- `run_case.py` per case: fresh `connect()`, `pyarrow.parquet.read_table(file)` + `conn.register(name, table)`, then `conn.sql(query).to_arrow_table()`, timed with `time.perf_counter()` and peak RSS via `resource.getrusage`. **No warmup, single run, result discarded.** Universal ingest = register(arrow table); universal produce = `to_arrow_table()`.
+- `data/`: `DATA_RANGE` = single BIGINT `range(N)` at 5M / 100M; `DATA_CATEGORY_DATE_PRICE` = (VARCHAR category, DATE, BIGINT price) cross-join at 36M / 3.6B.
+- `cases/`: `types/` (decimal `DECIMAL(28,12)`, hugeint `HUGEINT`, mixed_types `HUGEINT+uuid()+DECIMAL(28,6)+VARCHAR` in one row, timestamp `TIMESTAMP+INTERVAL`, varchar_long ~100-char), `limit/` (LIMIT 100 / 1k / 10k / 100k top-N — a result-cardinality sweep), `filter/`, `groups/`, `window/`, `threading/` (parallel group/window/self-join/registered-arrow-scan), plus a separate `stats/` harness.
+
+Their INGEST is arrow-only and their PRODUCE is arrow-only; they have **no** fetchall/fetchone, df()/numpy, pandas/numpy/native/polars ingest, or UDF coverage — so our binding suite is far broader on binding-specific surfaces. Their genuine deltas are concentrated in the PRODUCE/types dimension and in engine/threading workloads.
+
+### DELTA — actionable additions/changes
+
+- **[BINDING] Add HUGEINT to the produce matrix (currently absent).** `types/hugeint.sql`, `mixed_types.sql`. OUT-row `FromValue` HUGEINT does `PyLong_FromString(val.GetValue<string>())` — a per-value string round-trip (`python_objects.cpp:500`), unlike narrow int; OUT-col casts hugeint->double (`array_wrapper.cpp:662`); OUT-arrow is a distinct decimal128/int128 export. Scenario: `SELECT i::HUGEINT FROM range(1_000_000)` through fetchall / df / to_arrow_table. Add a `hugeint` row to the type x direction matrix.
+- **[BINDING] Add UUID to the produce matrix (absent).** `mixed_types.sql` selects `uuid()`. OUT-row builds a Python `uuid.UUID` per row (`python_objects.cpp:708-711`); OUT-col uses `UUIDConvert` (`array_wrapper.cpp:230-244`). Scenario: `SELECT gen_random_uuid() FROM range(200_000)` through fetchall / df / to_arrow_table. Add a `uuid` row to the matrix.
+- **[BINDING] Add a 128-bit-internal DECIMAL variant.** Our P1-20 uses `DECIMAL(18,3)` (int64 internal); bareduckdb uses `DECIMAL(28,12)` / `(28,6)` (int128 internal), hitting `ConvertDecimalInternal<hugeint_t>` (`array_wrapper.cpp:571`) and the wider `PyDecimalCastSwitch`/`Decimal()` round-trip. Run both an int64-internal and an int128-internal decimal.
+- **[BINDING] Add a heterogeneous mixed-type row (new scenario).** `SELECT i::HUGEINT, gen_random_uuid(), (i*1.5)::DECIMAL(28,6), ('string_'||i) FROM range(200_000)` through fetchall and df. Exercises per-cell type dispatch in the `Fetchone` column loop (`pyresult.cpp:140-148`) — a different branch/cache profile than our homogeneous columns (P0-1..3 are single-type).
+- **[BINDING] Add a long-varchar (>64 char) variant** alongside the short `'str_value_'||i`. `'...'||repeat('data ',10)||i::VARCHAR` (~100 chars). Short strings are copy-cheap/overhead-bound; long strings shift OUT-row/OUT-col string copy and the IN-numpy `DecodePythonUnicode` transcode (`numpy_scan.cpp:429-446`) toward copy-bound. Apply to OUT-row, OUT-col, IN-numpy varchar scenarios.
+- **[BINDING] Adopt their result-cardinality (top-N) sweep as a produce axis.** `SELECT * FROM <fixed source> ORDER BY k DESC LIMIT n` for n in {100, 1k, 10k, 100k}, fetched via fetchall / df / to_arrow_table with the source held constant. Holds engine work ~constant while sweeping rows-materialized-to-Python → a clean per-row conversion slope, and the small-n end is an ideal noise-free instruction-count gate (overhead regime). Cleaner than varying `range()` (which also changes scan cost).
+- **[BINDING] Broaden the OUT-arrow column of the matrix.** Their entire produce path is `to_arrow_table`, and they push hugeint / decimal128 / uuid / timestamp / long-varchar / mixed-row through it — exactly the arrow-export converters (ArrowConverter/appender for int128/uuid/decimal128) our OUT-arrow column currently leaves at P1/P2 numeric+string. Add these types to OUT-arrow.
+- **[BINDING, hard to gate] registered-arrow-scan under parallelism.** `threading/registered_arrow_scan.sql` pulls batches from `PythonTableArrowArrayStreamFactory::Produce` (binding code in `arrow/arrow_array_stream.cpp`) across engine threads holding/releasing the GIL — a real binding-contention risk. Keep as walltime-informational only; too noisy for an instruction-count gate.
+- **[ENGINE] `filter` / `groups` / `window` / `self_join` pure-engine workloads** — out of scope for a binding gate; the binding only wraps them with register + to_arrow_table, and their consume (a small aggregate) is trivial so the measurement is ~pure engine. Note, do not add to the binding suite.
+- **[ENGINE] 100M / 3.6B-row scale** — too slow / IO+engine-dominated / walltime-noisy for a codspeed gate; keep our regimes <= ~1M.
+- **[ENGINE] threading / free-threading category** — the production client does not support free-threading (CLAUDE.md); deprioritize for this suite.
+
+### Methodology notes for our codspeed mechanics
+
+- **Adopt: result-cardinality (LIMIT) axis** (above) — a clean per-row conversion-cost slope and a natural small/large pairing for the instruction-count-gate-vs-walltime split already in (d).
+- **Consider adopting: a peak-memory guard** for the O(rows) produce paths. bareduckdb tracks `getrusage` max RSS; codspeed walltime tracks neither memory nor allocations. A conversion regression is often memory-shaped (cf. the recorded fetchall +8% list->tuple edge-copy; the df() masked_array branch) — add a separate `getrusage`/memray delta assertion on `fetchall` and `df()`-with-nulls as a secondary signal, since a pure-timing gate can miss it.
+- **Do NOT adopt their anti-patterns:** no-warmup + single subprocess run charges one-time import-cache population into the measurement and yields no statistics — bad for steady-state binding isolation. Our warmup + codspeed repeated rounds are correct; keep them.
+- **Consistent with us:** their full-consume is eager `to_arrow_table()` and never `count(*)` — matches our discipline. Caveat: for their aggregate cases the arrow output is tiny, so the consume is trivial and the run is engine-only; our produce benchmarks must keep the materialization the heavy part (large output / top-N with large LIMIT).
diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py
new file mode 100644
index 00000000..e6fc43e0
--- /dev/null
+++ b/benchmarks/test_arrow_perf.py
@@ -0,0 +1,119 @@
+"""Standalone CodSpeed benchmark module for the Arrow read/write binding paths — NOT yet integrated
+(not in pyproject, not wired into CI). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-cutover
+  $M -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+  $C -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+
+DESIGN — the data must be FULLY MOVED, not lazily wrapped, or the benchmark measures nothing:
+  * READ (arrow -> duckdb): the duckdb ENGINE must scan every value. We aggregate over the actual
+    columns (sum/length), NOT count(*) -- count(*) is answered from arrow metadata without touching data.
+  * WRITE (duckdb -> arrow): the CONSUMER must materialize everything.
+      - to_arrow_table() / pl() are EAGER (the full table / polars DataFrame is built).
+      - to_arrow_reader() is LAZY -- duckdb only produces a batch when it is pulled -- so we iterate the
+        whole stream to actually exercise and consume the write path.
+
+pyarrow/polars are pinned to the SAME version in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import pyarrow as pa
+import pytest
+
+N = 500_000  # row count shared by the arrow input fixtures and both WRITE queries below
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def arrow_numeric():
+    return pa.table(
+        {
+            "a": pa.array(range(N), type=pa.int64()),
+            "b": pa.array([i * 1.5 for i in range(N)], type=pa.float64()),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def arrow_string():
+    return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())})
+
+
+@pytest.fixture(scope="module")
+def arrow_numeric_batches(arrow_numeric):
+    # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round
+    return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)
+
+
+# --------------------------------------------------------------------------- #
+# READ: arrow -> duckdb. The engine must scan every value (sum/length force it).
+# --------------------------------------------------------------------------- #
+
+
+def test_read_arrow_numeric(benchmark, con, arrow_numeric):
+    con.register("t_num", arrow_numeric)
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())
+
+
+def test_read_arrow_string(benchmark, con, arrow_string):
+    con.register("t_str", arrow_string)
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())
+
+
+# RecordBatchReader ingest -- the SAME PythonTableArrowArrayStreamFactory but STREAMING (distinct from
+# the materialized Table read above). A fresh reader is built per round (the engine drains it); sum() forces a
+# full scan of every value.
+
+
+def test_read_arrow_reader_numeric(benchmark, con, arrow_numeric_batches):
+    schema, batches = arrow_numeric_batches
+
+    def run():
+        reader = pa.RecordBatchReader.from_batches(schema, iter(batches))
+        con.register("t_rdr", reader)
+        return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall()
+
+    run()  # warm
+    benchmark(run)
+
+
+# --------------------------------------------------------------------------- #
+# WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream.
+# --------------------------------------------------------------------------- #
+
+
+def test_write_arrow_table_numeric(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table())
+
+
+def test_write_arrow_table_string(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table())
+
+
+def test_write_arrow_reader_consumed(benchmark, con):
+    def run():
+        reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000)
+        rows = 0
+        for batch in reader:  # drain the lazy stream so duckdb actually produces every batch
+            rows += batch.num_rows
+        return rows
+
+    benchmark(run)
+
+
+def test_write_polars_numeric(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_NUM).pl())
+
+
+def test_write_polars_string(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_STR).pl())
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
new file mode 100644
index 00000000..d4edda03
--- /dev/null
+++ b/benchmarks/test_cardinality_perf.py
@@ -0,0 +1,63 @@
+"""Standalone CodSpeed benchmark module: the RESULT-CARDINALITY (top-N) sweep — NOT yet integrated (not in
+pyproject, not wired into CI). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE (adopted from iqmo-org/bareduckdb): hold the SOURCE fixed and sweep only the number of rows
+materialized to Python via ORDER BY ... LIMIT n for n in {100, 1k, 10k, 100k}, through fetchall / df /
+to_arrow_table. The engine cost (scan the fixed SRC + top-N heap) stays ~constant, so the walltime delta
+across n is dominated by the per-row binding conversion -> a clean per-row slope. The n=100 end is the
+noise-free overhead regime (the natural instruction-count-gate point); the n=100k end is throughput.
+
+A clean monotone slope (and ~parity slope between the two builds) is the signal we report; a build whose slope
+is steeper has a per-row conversion regression. Source held constant rules out scan-cost as the confound (a
+cleaner axis than varying range(), which also changes scan cost).
+
+numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import pytest
+
+SRC = 200_000  # fixed source size -> constant engine scan + top-N across all n
+LIMITS = [100, 1_000, 10_000, 100_000]
+
+# 3 columns (BIGINT, DOUBLE, VARCHAR) so the per-row conversion is non-trivial; source is a fixed inline
+# subquery (no table state) and ORDER BY forces a full scan + top-N of the same SRC rows every time.
+_SRC_SUBQ = f"(SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC}) t(i))"
+
+
+def _query(n):
+    return f"SELECT a, b, s FROM {_SRC_SUBQ} ORDER BY a DESC LIMIT {n}"
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_fetchall(benchmark, con, n):
+    q = _query(n)
+    con.execute(q).fetchall()  # warm
+    benchmark(lambda: con.execute(q).fetchall())
+
+
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_df(benchmark, con, n):
+    q = _query(n)
+    con.sql(q).df()  # warm
+    benchmark(lambda: con.sql(q).df())
+
+
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_to_arrow(benchmark, con, n):
+    q = _query(n)
+    con.sql(q).to_arrow_table()  # warm
+    benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py
new file mode 100644
index 00000000..8c8ef20a
--- /dev/null
+++ b/benchmarks/test_fetch_perf.py
@@ -0,0 +1,141 @@
+"""Standalone CodSpeed benchmark module — NOT integrated (not in pyproject, not yet wired into CI).
+
+Purpose: A/B the binding-layer perf between the two builds (pybind11 `main` vs nanobind cutover), in particular
+the narrow-column `fetchall` regression. Run the SAME file under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-cutover
+  $M -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+  $C -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+
+NOTE: macOS arm64 has no Valgrind, so only `--codspeed-mode=walltime` works locally (wall-clock stats). The
+deterministic instruction-count mode (`--codspeed-mode=simulation`) needs Linux + the CodSpeed instrument
+(CI, or `codspeed run` in a Linux container). In CI/cloud, CodSpeed compares each run against a git baseline;
+locally we get the same benchmark workflow but A/B by running the file under the two interpreters by hand.
+"""
+
+import duckdb
+import pytest
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+def _bench_fetchall(benchmark, con, query):
+    con.execute(query).fetchall()  # warm the engine before measuring
+    benchmark(lambda: con.execute(query).fetchall())
+
+
+def test_fetchall_int(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)")
+
+
+def test_fetchall_smallint(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)")
+
+
+def test_fetchall_double(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM range(200000) t(i)")
+
+
+def test_fetchall_2int(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)")
+
+
+def test_fetchall_str(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)")
+
+
+def test_fetchall_mixed(benchmark, con):
+    query = (
+        "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, "
+        "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)"
+    )
+    _bench_fetchall(benchmark, con, query)
+
+
+def test_fetchone_iter(benchmark, con):
+    query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
+
+    def run():
+        rel = con.execute(query)
+        while rel.fetchone() is not None:
+            pass
+
+    benchmark(run)
+
+
+# --------------------------------------------------------------------------- #
+# ADDED: small-N instruction-count-gate variants (the narrow-numeric fixed-cost path, noise-free at range(2048)
+# under simulation mode in CI), expensive scalar OUT-row types (timestamptz pytz-per-row, blob, null-heavy), a
+# heterogeneous per-cell-dispatch row (hugeint+uuid+decimal128+varchar, distinct from homogeneous columns), and
+# the batched fetchmany loop.
+# --------------------------------------------------------------------------- #
+
+
+def test_fetchall_int_gate(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)")
+
+
+def test_fetchall_2int_gate(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)")
+
+
+def test_fetchall_null_heavy(benchmark, con):
+    _bench_fetchall(
+        benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)"
+    )
+
+
+def test_fetchall_timestamptz(benchmark, con):
+    _bench_fetchall(
+        benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)"
+    )
+
+
+def test_fetchall_decimal128(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)")
+
+
+def test_fetchall_blob(benchmark, con):
+    _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)")
+
+
+def test_fetchall_mixed_wide(benchmark, con):
+    # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile
+    # from the homogeneous single-type columns above)
+    query = (
+        "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, "
+        "((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range(100000) t(i)"
+    )
+    _bench_fetchall(benchmark, con, query)
+
+
+def test_fetchmany_batched(benchmark, con):
+    query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
+
+    def run():
+        rel = con.execute(query)
+        while True:
+            rows = rel.fetchmany(10_000)
+            if not rows:
+                break
+
+    benchmark(run)
+
+
+def test_expr_many(benchmark):
+    def run():
+        out = []
+        for i in range(2000):
+            col = duckdb.ColumnExpression(f"col_{i}")
+            const = duckdb.ConstantExpression(i)
+            out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}"))
+        return len(out)
+
+    benchmark(run)
diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py
new file mode 100644
index 00000000..4fca641a
--- /dev/null
+++ b/benchmarks/test_ingest_native_perf.py
@@ -0,0 +1,93 @@
+"""Standalone CodSpeed benchmark module for NATIVE Python-object ingest (Python list/tuple/dict -> duckdb) —
+NOT integrated (not in pyproject, not yet wired into CI). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: native Python-object ingest had ZERO coverage. Every cell goes through TransformPythonValue
+and the GetPythonObjectType ladder (python_conversion.cpp); dicts recurse through TransformDictionaryToStruct;
+executemany re-binds a parameter set per row (pyconnection.cpp ExecuteMany loop).
+
+FULL MATERIALIZE: executemany lands N rows in a real table (CREATE OR REPLACE each round so the table does not
+grow across codspeed's repeated invocations). values() builds the value vectors EAGERLY inside the call
+(TransformPythonParamList), and we drain the resulting relation with fetchall so the round-trip is complete.
+
+NOTE on values() shape: a single list argument to values() becomes ONE row whose COLUMNS are the list items
+(see DuckDBPyConnection::Values, pyconnection.cpp) -- so a list of N scalars is 1 row x N columns and runs
+TransformPythonValue N times; a list of N tuples is 1 row x N nested(LIST) columns; a list of N dicts is
+1 row x N STRUCT columns (TransformDictionaryToStruct). All three exercise the per-cell transform N times.
+"""
+
+import duckdb
+import pytest
+
+EXECMANY_N = 20_000  # executemany re-binds + executes per row, keep moderate
+WIDE_N = 10_000  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def rows_3col():
+    return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]
+
+
+@pytest.fixture(scope="module")
+def scalars_wide():
+    return list(range(WIDE_N))  # WIDE_N plain ints; values() turns the list into 1 row x WIDE_N columns
+
+
+@pytest.fixture(scope="module")
+def tuples_wide():
+    return [(i, i + 1, i + 2) for i in range(WIDE_N)]
+
+
+@pytest.fixture(scope="module")
+def dicts_wide():
+    return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]
+
+
+# --------------------------------------------------------------------------- #
+# executemany: bind + execute one parameter set per row, into a real table.
+# --------------------------------------------------------------------------- #
+
+
+def test_ingest_executemany_3col(benchmark, con, rows_3col):
+    con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
+    con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm
+
+    def run():
+        con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
+        con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)
+
+    benchmark(run)
+
+
+# --------------------------------------------------------------------------- #
+# values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip.
+# --------------------------------------------------------------------------- #
+
+
+def test_ingest_values_scalars(benchmark, con, scalars_wide):
+    con.values(scalars_wide).fetchall()  # warm
+    benchmark(lambda: con.values(scalars_wide).fetchall())
+
+
+def test_ingest_values_tuples(benchmark, con, tuples_wide):
+    # each tuple cell -> LIST value (TransformPythonValue recursion)
+    con.values(tuples_wide).fetchall()  # warm
+    benchmark(lambda: con.values(tuples_wide).fetchall())
+
+
+def test_ingest_values_dicts(benchmark, con, dicts_wide):
+    # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion)
+    con.values(dicts_wide).fetchall()  # warm
+    benchmark(lambda: con.values(dicts_wide).fetchall())
diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py
new file mode 100644
index 00000000..bb5fc1e8
--- /dev/null
+++ b/benchmarks/test_ingest_numpy_perf.py
@@ -0,0 +1,109 @@
+"""Standalone CodSpeed benchmark module for the NUMPY ingest paths (numpy / numpy-backed pandas -> duckdb)
+— NOT integrated (not in pyproject, not yet wired into CI). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: the numpy scan (NumpyScan / NumpyArray facade / RawArrayWrapper / pandas-bind / analyzer) is
+the IN-numpy half the nanobind cutover reworked, and several of its branches were untested:
+  * I0-2 object-string scan: the per-row isinstance + PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode
+    transcode ladder (numpy_scan.cpp). GOTCHA (encoded): a meaningful benchmark MUST mix ASCII + non-ASCII +
+    a null sentinel -- ASCII-only misses the transcode + null-detection ladder entirely.
+  * I0-1 double NaN->NULL loop (numpy_scan.cpp) -- the reworked float path.
+  * NULL-heavy masked scan: ScanNumpyMasked + ApplyMask (pandas nullable Int64).
+  * I1-3 analyzer bind: PandasAnalyzer::Analyze samples rows through the GetItemType ladder. This is a per-BIND
+    cost, independent of row count, so it is the ONE place count(*) is the correct consume (the cost is at bind,
+    not scan); every other READ here aggregates over real columns (sum/length) to force a full engine scan.
+  * I1-8 numpy ndarray / dict-of-arrays via the replacement scan (resolved from a module global).
+
+numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import numpy as np
+import pandas as pd
+import pytest
+
+N = 500_000
+ANALYZER_N = 200_000
+
+# Module-global for the replacement-scan-from-variable path (frame resolution finds f_globals reliably).
+NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}
+
+# Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only).
+_MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None]
+_MIXED_STRINGS = [None if _MIXED[i % 5] is None else f"{_MIXED[i % 5]}{i}" for i in range(N)]
+
+# Mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind.
+_MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)]
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def df_double_with_nan():
+    a = np.arange(N, dtype="float64") * 1.5
+    a[::10] = np.nan  # real NaNs -> NaN->NULL conversion loop
+    return pd.DataFrame({"a": a})
+
+
+@pytest.fixture(scope="module")
+def df_object_string_mixed():
+    return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)})
+
+
+@pytest.fixture(scope="module")
+def df_masked_int():
+    # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask
+    arr = pd.array(np.arange(N), dtype="Int64")
+    arr[::10] = pd.NA
+    return pd.DataFrame({"a": arr})
+
+
+@pytest.fixture(scope="module")
+def df_object_mixed_types():
+    return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)})
+
+
+# --------------------------------------------------------------------------- #
+# READ: numpy -> duckdb. Engine scans every value (sum/length force it).
+# --------------------------------------------------------------------------- #
+
+
+def test_read_numpy_dict_numeric(benchmark, con):
+    benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall())
+
+
+def test_read_numpy_double_with_nan(benchmark, con, df_double_with_nan):
+    con.register("t", df_double_with_nan)
+    benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
+
+
+def test_read_numpy_masked_int(benchmark, con, df_masked_int):
+    con.register("t", df_masked_int)
+    benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
+
+
+def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed):
+    con.register("t", df_object_string_mixed)
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
+
+
+# --------------------------------------------------------------------------- #
+# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY -- the cost is at bind, not scan, so we
+# must NOT force a scan (that would drown the per-bind analyzer signal). Re-binds the object column each call.
+# --------------------------------------------------------------------------- #
+
+
+def test_bind_analyzer_object(benchmark, con, df_object_mixed_types):
+    con.register("t", df_object_mixed_types)
+    con.execute("SELECT count(*) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall())
diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py
new file mode 100644
index 00000000..34a0948d
--- /dev/null
+++ b/benchmarks/test_pandas_perf.py
@@ -0,0 +1,133 @@
+"""Standalone CodSpeed benchmark module for the pandas read/write binding paths, comparing NUMPY-backed vs
+ARROW-backed DataFrames — NOT integrated (not in pyproject, not yet wired into CI). Run under each build:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-cutover
+  $M -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+  $C -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts=
+
+WHY BOTH BACKINGS: when duckdb scans a pandas DataFrame, the binding path depends on each column's backing:
+  * numpy-backed columns (dtype int64 / float64 / object) -> the NUMPY scan path (NumpyArray facade,
+    RawArrayWrapper, pandas/bind.cpp, analyzer.cpp) -- this is the path the nanobind cutover reworked
+    NON-TRIVIALLY, so it gets first-class coverage here.
+  * arrow-backed columns (pandas ArrowDtype, e.g. int64[pyarrow]) -> the ARROW scan path (near zero-copy).
+On the WRITE side, duckdb's native pandas output (rel.df()) is NUMPY-backed; an arrow-backed pandas frame is
+produced via duckdb-arrow + pyarrow.to_pandas(ArrowDtype) (pyarrow.to_pandas is identical on both builds, so
+the A/B delta is still the duckdb binding).
+
+FULL CONSUME (same discipline as the arrow module): READ aggregates over the actual columns (sum/length, NOT
+count(*) which is answered from metadata), and WRITE materializes the entire DataFrame.
+
+numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+
+N = 500_000
+WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)"
+WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)"
+_STRINGS = [f"str_value_{i}" for i in range(N)]
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def df_numpy_numeric():
+    return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5})
+
+
+@pytest.fixture(scope="module")
+def df_numpy_string():
+    # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path)
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)})
+
+
+@pytest.fixture(scope="module")
+def df_arrow_numeric():
+    return pd.DataFrame(
+        {
+            "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())),
+            "b": pd.array(np.arange(N) * 1.5, dtype=pd.ArrowDtype(pa.float64())),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def df_arrow_string():
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))})
+
+
+# --------------------------------------------------------------------------- #
+# READ: pandas -> duckdb. Engine scans every value (sum/length force it).
+# --------------------------------------------------------------------------- #
+
+
+def test_read_pandas_numpy_numeric(benchmark, con, df_numpy_numeric):
+    con.register("t", df_numpy_numeric)
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
+
+
+def test_read_pandas_numpy_string(benchmark, con, df_numpy_string):
+    con.register("t", df_numpy_string)
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
+
+
+def test_read_pandas_arrow_numeric(benchmark, con, df_arrow_numeric):
+    con.register("t", df_arrow_numeric)
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
+
+
+def test_read_pandas_arrow_string(benchmark, con, df_arrow_string):
+    con.register("t", df_arrow_string)
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
+
+
+# --------------------------------------------------------------------------- #
+# WRITE: duckdb -> pandas. df() is NUMPY-backed (the reworked production path);
+# the arrow-backed frame goes via duckdb-arrow + pyarrow.to_pandas(ArrowDtype).
+# Both eagerly materialize the whole DataFrame.
+# --------------------------------------------------------------------------- #
+
+
+def test_write_pandas_numpy_numeric(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_NUM).df())
+
+
+def test_write_pandas_numpy_string(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_STR).df())
+
+
+# ADDED: the numpy-backed df() WRITE with REAL nulls -> the masked_array build + masked->pd.NA rewrite that the
+# cutover reworked (a no-null column takes the cheap std::move path and would measure the wrong thing), plus a
+# datetime column (TimestampConvert + ConvertDateTimeTypes).
+
+
+def test_write_pandas_numpy_numeric_with_nulls(benchmark, con):
+    q = (
+        "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
+        "CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range(500000) t(i)"
+    )
+    benchmark(lambda: con.sql(q).df())
+
+
+def test_write_pandas_numpy_timestamp(benchmark, con):
+    q = "SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range(500000) t(i)"
+    benchmark(lambda: con.sql(q).df())
+
+
+def test_write_pandas_arrow_numeric(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
+
+
+def test_write_pandas_arrow_string(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
new file mode 100644
index 00000000..cfe5c281
--- /dev/null
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -0,0 +1,181 @@
+"""Standalone CodSpeed benchmark module for the COLUMNAR produce paths (duckdb -> numpy/pandas), i.e. df(),
+fetchnumpy(), fetch_df_chunk() — NOT integrated (not in pyproject, not yet wired into CI). Run under each
+build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: the columnar OUT path (FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular) is exactly
+what the nanobind cutover reworked. The under-covered cases are: (1) the WITH-NULLS branch (HAS_NULLS=true ->
+masked_array build -> masked->pd.NA rewrite, array_wrapper.cpp / pyresult.cpp) -- NEVER previously benchmarked
+and the most-changed code; (2) datetime; (3) fetchnumpy without the DataFrame wrap; (4) fetch_df_chunk; and
+the wide-internal types HUGEINT (->double cast), UUID (UUIDConvert), DECIMAL(28,x) (ConvertDecimalInternal
+<hugeint_t>) that exercise distinct OUT-col converters.
+
+GOTCHA (encoded below): OUT-col NULL benchmarks use REAL DuckDB nulls (CASE WHEN .. THEN NULL). A no-null
+column silently takes the cheap std::move path and the masked-array branch never triggers, so it would measure
+the wrong thing.
+
+FULL CONSUME: df() / fetchnumpy() eagerly materialize the whole column set; fetch_df_chunk is drained in a loop.
+
+numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import gc
+import sys
+import tracemalloc
+
+import duckdb
+import numpy as np  # noqa: F401  (pinned identically A/B; imported so the env matches the other modules)
+import pytest
+
+N = 500_000
+TYPE_N = 200_000  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
+
+Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+Q_NUM_NULLS = (
+    "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
+    f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
+)
+Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+Q_TS = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
+Q_HUGEINT = f"SELECT (i::HUGEINT * 1000000000000) AS h FROM range({TYPE_N}) t(i)"
+Q_UUID = f"SELECT gen_random_uuid() AS u FROM range({TYPE_N}) t(i)"
+Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)"
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+def _bench_df(benchmark, con, query):
+    con.sql(query).df()  # warm
+    benchmark(lambda: con.sql(query).df())
+
+
+def _bench_numpy(benchmark, con, query):
+    con.sql(query).fetchnumpy()  # warm
+    benchmark(lambda: con.sql(query).fetchnumpy())
+
+
+# --------------------------------------------------------------------------- #
+# df(): the production NUMPY-backed columnar path. no-null vs REAL-null vs string vs timestamp.
+# --------------------------------------------------------------------------- #
+
+
+def test_df_numeric(benchmark, con):
+    _bench_df(benchmark, con, Q_NUM)
+
+
+def test_df_numeric_with_nulls(benchmark, con):
+    # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch)
+    _bench_df(benchmark, con, Q_NUM_NULLS)
+
+
+def test_df_string(benchmark, con):
+    _bench_df(benchmark, con, Q_STR)
+
+
+def test_df_timestamp(benchmark, con):
+    _bench_df(benchmark, con, Q_TS)
+
+
+def test_df_hugeint(benchmark, con):
+    _bench_df(benchmark, con, Q_HUGEINT)
+
+
+def test_df_uuid(benchmark, con):
+    _bench_df(benchmark, con, Q_UUID)
+
+
+def test_df_decimal128(benchmark, con):
+    _bench_df(benchmark, con, Q_DEC128)
+
+
+# --------------------------------------------------------------------------- #
+# fetchnumpy(): same FetchNumpyInternal without the DataFrame wrap.
+# --------------------------------------------------------------------------- #
+
+
+def test_fetchnumpy_numeric(benchmark, con):
+    _bench_numpy(benchmark, con, Q_NUM)
+
+
+def test_fetchnumpy_numeric_with_nulls(benchmark, con):
+    _bench_numpy(benchmark, con, Q_NUM_NULLS)
+
+
+# --------------------------------------------------------------------------- #
+# fetch_df_chunk(): per-chunk DataFrame production, drained in a loop.
+# --------------------------------------------------------------------------- #
+
+
+def test_fetch_df_chunk_loop(benchmark, con):
+    def run():
+        rel = con.sql(Q_NUM)
+        rows = 0
+        while True:
+            chunk = rel.fetch_df_chunk()
+            if len(chunk) == 0:
+                break
+            rows += len(chunk)
+        return rows
+
+    con.sql(Q_NUM).fetch_df_chunk()  # warm
+    benchmark(run)
+
+
+# --------------------------------------------------------------------------- #
+# torch(): FetchNumpyInternal + per-column from_numpy. SKIPPED cleanly if torch is absent (identical A/B).
+# --------------------------------------------------------------------------- #
+
+
+def test_torch_numeric(benchmark, con):
+    pytest.importorskip("torch")
+    q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)"
+    con.sql(q).torch()  # warm
+    benchmark(lambda: con.sql(q).torch())
+
+
+# --------------------------------------------------------------------------- #
+# MEMORY GUARD (secondary signal, not a codspeed benchmark). codspeed walltime tracks neither memory nor
+# allocations, and conversion regressions are often memory-shaped (the recorded fetchall list->tuple edge-copy;
+# the df() masked_array branch). We use tracemalloc to capture the PEAK Python-tracked allocation of ONE
+# df()-with-nulls call. Correctness notes:
+#   * reset_peak() is called AFTER the warm (and after freeing the warm result) so the warm does not establish
+#     a high-water mark that swallows the measured call -- the prior getrusage(ru_maxrss) version was broken
+#     precisely because ru_maxrss is monotonic and the warm pre-set the peak, making the delta ~0.
+#   * tracemalloc reports BYTES on every platform (no macOS-bytes / Linux-KiB skew that the getrusage version
+#     had), so the ceiling is portable to the Linux CI target.
+# CAVEAT: tracemalloc sees Python-allocator traffic plus numpy-owned array data (numpy registers its buffers
+# under its own tracemalloc domain); DuckDB's native C++ allocations are NOT visible here. So this catches a
+# gross Python/numpy-shaped blowup (the masked->pd.NA rewrite / a per-row object materialization regression)
+# but is not a total-RSS gate -- the authoritative CI gate for native memory is --codspeed-mode=memory.
+# --------------------------------------------------------------------------- #
+
+
+def test_mem_df_with_nulls():
+    con = duckdb.connect()
+    try:
+        tracemalloc.start()
+        warm = con.sql(Q_NUM_NULLS).df()  # populate one-time import / type caches
+        del warm
+        gc.collect()
+        tracemalloc.reset_peak()  # discount the warm's transient peak BEFORE the measured call
+        out = con.sql(Q_NUM_NULLS).df()
+        _current, peak = tracemalloc.get_traced_memory()  # read the peak while tracing is still active
+        del out
+    finally:
+        tracemalloc.stop()  # ALWAYS stop: a failed df() must not leave tracing on for the later benchmarks
+        con.close()
+    print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr)
+    # Python-tracked allocations for a 500k x 2-col masked df are a few MB; a gross conversion-memory blowup
+    # (e.g. a per-row Python object list, the masked->pd.NA rewrite gone wrong) is tens+ MB. 100 MB ceiling
+    # catches that without flaking, and is bytes on all platforms.
+    assert peak < 100_000_000
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
new file mode 100644
index 00000000..3e92f12d
--- /dev/null
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -0,0 +1,78 @@
+"""Standalone CodSpeed benchmark module: the TYPE x DIRECTION produce matrix — NOT integrated (not in
+pyproject, not yet wired into CI). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: a single systematic sweep of one logical type per column across the three produce directions
+  * OUT-row   = fetchall()          -> FromValue per cell (python_objects.cpp)
+  * OUT-col   = df()                -> ArrayWrapper / ConvertColumn (array_wrapper.cpp)
+  * OUT-arrow = to_arrow_table()    -> arrow export converters
+so a regression localizes to (type, direction). Includes the iqmo/bareduckdb cross-check breadth that the
+narrow-numeric homogeneous benchmarks miss: HUGEINT (PyLong_FromString / hugeint->double / int128 export),
+UUID (uuid.UUID per row / UUIDConvert), DECIMAL(28,6) int128-internal (ConvertDecimalInternal<hugeint_t>),
+and a long-varchar (>64 chars) that shifts the string paths from overhead-bound to copy-bound.
+
+FULL CONSUME: fetchall and df materialize everything; to_arrow_table is eager. NOTE: to_arrow_table on a
+materialized result re-runs the query with the GIL released (PromoteMaterializedToArrow), so the OUT-arrow
+column is engine-parallel and walltime-NOISY -- treat it as informational, not a hard gate.
+
+numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import pytest
+
+N = 100_000
+
+# one logical type per column; long-varchar is intentionally > 64 chars
+TYPE_EXPR = {
+    "int64": "i::BIGINT",
+    "double": "(i * 1.5)::DOUBLE",
+    "varchar_short": "('str_' || i)",
+    "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))",
+    "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)",
+    "decimal64": "((i::DECIMAL(18, 3)) / 1000)",
+    "decimal128": "((i * 1.5)::DECIMAL(28, 6))",
+    "hugeint": "(i::HUGEINT * 1000000000000)",
+    "uuid": "gen_random_uuid()",
+    "struct": "{'a': i, 'b': i + 1}",
+    "list": "[i, i + 1, i + 2]",
+}
+TYPES = list(TYPE_EXPR)
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+def _query(type_name):
+    return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
+
+
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_row_fetchall(benchmark, con, type_name):
+    q = _query(type_name)
+    con.execute(q).fetchall()  # warm
+    benchmark(lambda: con.execute(q).fetchall())
+
+
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_col_df(benchmark, con, type_name):
+    q = _query(type_name)
+    con.sql(q).df()  # warm
+    benchmark(lambda: con.sql(q).df())
+
+
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_arrow_table(benchmark, con, type_name):
+    # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy)
+    q = _query(type_name)
+    con.sql(q).to_arrow_table()  # warm
+    benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
new file mode 100644
index 00000000..ef398ebb
--- /dev/null
+++ b/benchmarks/test_udf_perf.py
@@ -0,0 +1,110 @@
+"""Standalone CodSpeed benchmark module for the Python UDF binding paths (src/python_udf.cpp) — NOT integrated
+(not in pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+
+  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  $M -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  $C -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: the whole UDF subsystem had ZERO benchmark coverage. The NATIVE scalar UDF is the single
+biggest untested per-call-overhead path in the binding -- per row it builds a TupleBuilder of args, calls
+PyObject_CallObject, and runs TransformPythonObject on the result (python_udf.cpp). The ARROW (vectorized) UDF
+is the columnar counterpart: ConvertDataChunkToPyArrowTable + the Python call + ConvertArrowTableToVector cast.
+
+FULL CONSUME (same discipline as the other modules): every UDF benchmark wraps the call in a sum()/length()
+aggregate so the ENGINE evaluates the UDF on every row (count(*) would skip it). The aggregate output is a
+single row, so the measured cost is the per-row (native) / per-chunk (arrow) UDF invocation, not the fetch.
+
+numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import pytest
+from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR
+
+pa = pytest.importorskip("pyarrow")
+pc = pytest.importorskip("pyarrow.compute")
+
+NATIVE_N = 200_000  # native = one Python call per row, keep moderate
+ARROW_N = 1_000_000  # arrow = one Python call per chunk (vectorized), can be large
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+def _bench(benchmark, con, query):
+    con.execute(query).fetchall()  # warm the engine + import caches before measuring
+    benchmark(lambda: con.execute(query).fetchall())
+
+
+# --------------------------------------------------------------------------- #
+# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result).
+# --------------------------------------------------------------------------- #
+
+
+def test_udf_native_int_1arg(benchmark, con):
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+def test_udf_native_int_2arg(benchmark, con):
+    con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+def test_udf_native_double_1arg(benchmark, con):
+    con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
+    _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
+
+
+def test_udf_native_string(benchmark, con):
+    con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
+    _bench(
+        benchmark,
+        con,
+        f"SELECT sum(length(up(s))) FROM (SELECT ('str_value_' || i) AS s FROM range({NATIVE_N}) t(i))",
+    )
+
+
+def test_udf_native_null_inputs(benchmark, con):
+    # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the
+    # validity short-circuit, not the Python call, so the UDF only ever sees non-NULL rows.
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({NATIVE_N}) t(i))",
+    )
+
+
+# --------------------------------------------------------------------------- #
+# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast.
+# --------------------------------------------------------------------------- #
+
+
+def test_udf_arrow_int(benchmark, con):
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
+
+
+def test_udf_arrow_double(benchmark, con):
+    con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
+
+
+def test_udf_arrow_null_inputs(benchmark, con):
+    # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call
+    # and reconstructs the result vector afterwards -- this is the selvec compaction/reconstruction cost.
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(arrow_add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({ARROW_N}) t(i))",
+    )

From 177d99b2f9a4593a42845904a9632f3419555128 Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Wed, 1 Jul 2026 08:04:26 +0200
Subject: [PATCH 2/7] Redesign cardinality benchmark + add tokenless CodSpeed
 CI

Cardinality: the ORDER BY ... LIMIT n sweep let the engine top-N sort dominate
and swamp the per-row conversion signal (numbers were non-monotone). Replace it
with a pre-materialized fixed source + plain LIMIT n (no sort): the scan
early-stops at n rows, so rows-to-Python conversion is the dominant n-varying
cost and the slope is monotone; the A/B delta at each n isolates the binding.

CI: .github/workflows/codspeed.yml runs the suite under CodSpeed simulation
(instruction-count) mode on Linux, tokenless (no dashboard upload; enable the
hosted gate later via a CodSpeed project + OIDC/token). Instruction counts are
deterministic for every benchmark, so no gated/informational split is needed.
Not yet run in CI; the build steps mirror the dev build and need a shakeout.
---
 .github/workflows/codspeed.yml      | 74 +++++++++++++++++++++++++++++
 benchmarks/test_cardinality_perf.py | 49 ++++++++++---------
 2 files changed, 100 insertions(+), 23 deletions(-)
 create mode 100644 .github/workflows/codspeed.yml

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
new file mode 100644
index 00000000..fcd6b7ca
--- /dev/null
+++ b/.github/workflows/codspeed.yml
@@ -0,0 +1,74 @@
+# Performance-regression benchmarks via CodSpeed, in deterministic instruction-count (simulation) mode.
+#
+# TOKENLESS FOR NOW: the CodSpeed action's token is only needed to UPLOAD results to the CodSpeed
+# dashboard (the hosted PR-gate). Without it the action still RUNS every benchmark under Valgrind and
+# reports the instruction counts in the job log. To turn on the hosted regression gate later: create a
+# CodSpeed project for the repo and either (public repo) rely on the OIDC `id-token: write` permission
+# below, or add a `CODSPEED_TOKEN` repo secret and pass `token: ${{ secrets.CODSPEED_TOKEN }}` to the action.
+#
+# Why simulation (instruction-count) and not walltime: instruction counts are deterministic even for the
+# multi-threaded / engine-heavy paths (Valgrind serializes and counts), so the whole suite is gate-able and
+# there is no need to split "gated vs informational" the way noisy local walltime required. Instruction count
+# is exactly the signal that would have caught the LIST/ARRAY df() regression cleanly.
+#
+# NOTE: this workflow has not been run in CI yet; the build steps mirror the documented dev build (CLAUDE.md)
+# and will likely need a shakeout run. Valgrind is slow (~20-50x); if the full suite is too slow, trim the
+# largest-N benchmarks or run a curated subset.
+
+name: Benchmarks
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+  workflow_dispatch:
+
+concurrency:
+  group: codspeed-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  benchmarks:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      id-token: write # enables tokenless (OIDC) upload once a CodSpeed project is linked; harmless otherwise
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive # the DuckDB engine submodule is needed to build
+          fetch-depth: 0 # setuptools_scm needs history for version detection
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.13"
+
+      - name: Cache sccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/sccache
+          key: sccache-codspeed-${{ hashFiles('external/duckdb') }}
+          restore-keys: sccache-codspeed-
+
+      - name: Install sccache
+        run: |
+          curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \
+            | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache
+
+      - name: Build the extension (release) + benchmark deps
+        env:
+          CMAKE_C_COMPILER_LAUNCHER: sccache
+          CMAKE_CXX_COMPILER_LAUNCHER: sccache
+        run: |
+          uv sync --only-group build --no-install-project -p 3.13
+          uv sync --no-build-isolation --no-editable --reinstall -p 3.13
+          # benchmark deps: keep these pinned in lockstep with any baseline you compare against, so the only
+          # cross-run delta is the binding (numpy/pandas/pyarrow/polars/pytz + the codspeed plugin).
+          uv pip install pytest pytest-codspeed numpy pandas pyarrow polars pytz
+
+      - name: Run benchmarks (instruction-count)
+        uses: CodSpeedHQ/action@v4
+        with:
+          mode: simulation
+          run: uv run pytest benchmarks/ --codspeed -o addopts= -p no:cacheprovider
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
index d4edda03..3fe4ee0d 100644
--- a/benchmarks/test_cardinality_perf.py
+++ b/benchmarks/test_cardinality_perf.py
@@ -1,5 +1,4 @@
-"""Standalone CodSpeed benchmark module: the RESULT-CARDINALITY (top-N) sweep — NOT integrated (not in
-pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+"""Standalone CodSpeed benchmark: the RESULT-CARDINALITY (rows-to-Python) sweep. Run under each build:
 
   M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
   C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
@@ -7,41 +6,45 @@
   $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
   $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
 
-WHY THIS MODULE (adopted from iqmo-org/bareduckdb): hold the SOURCE fixed and sweep only the number of rows
-materialized to Python via ORDER BY ... LIMIT n for n in {100, 1k, 10k, 100k}, through fetchall / df /
-to_arrow_table. The engine cost (scan the fixed SRC + top-N heap) stays ~constant, so the walltime delta
-across n is dominated by the per-row binding conversion -> a clean per-row slope. The n=100 end is the
-noise-free overhead regime (the natural instruction-count-gate point); the n=100k end is throughput.
-
-A clean monotone slope (and ~parity slope between the two builds) is the signal we report; a build whose slope
-is steeper has a per-row conversion regression. Source held constant rules out scan-cost as the confound (a
-cleaner axis than varying range(), which also changes scan cost).
-
-numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+REDESIGN NOTE: the first version swept `ORDER BY a DESC LIMIT n` over a fixed source. That was wrong:
+the engine's full top-N SORT (~3-14ms, itself variable) dominated and swamped the per-row conversion
+signal, and the numbers came out non-monotone. This version pre-materializes the fixed source table ONCE
+and sweeps `SELECT * FROM src LIMIT n` with NO ORDER BY: a plain LIMIT early-stops the scan at n rows, so
+the engine cost is cheap and monotone in n, and the rows-to-Python CONVERSION is the dominant n-varying
+cost. That gives a clean, monotone per-row slope; the A/B delta at each n isolates the binding, and a build
+whose slope is steeper has a per-row conversion regression. n=100 is the overhead regime (the natural
+instruction-count-gate point); n=100_000 is throughput.
+
+3 columns (BIGINT, DOUBLE, VARCHAR) so per-row conversion is non-trivial. numpy/pandas/pyarrow are pinned to
+the SAME versions in both .venv-release, so the A/B delta is purely the binding.
 """
 
 import duckdb
 import pytest
 
-SRC = 200_000  # fixed source size -> constant engine scan + top-N across all n
+SRC_ROWS = 200_000
 LIMITS = [100, 1_000, 10_000, 100_000]
 
-# 3 columns (BIGINT, DOUBLE, VARCHAR) so the per-row conversion is non-trivial; source is a fixed inline
-# subquery (no table state) and ORDER BY forces a full scan + top-N of the same SRC rows every time.
-_SRC_SUBQ = f"(SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC}) t(i))"
-
-
-def _query(n):
-    return f"SELECT a, b, s FROM {_SRC_SUBQ} ORDER BY a DESC LIMIT {n}"
 
-
-@pytest.fixture
+@pytest.fixture(scope="module")
 def con():
+    # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be
+    # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows.
     c = duckdb.connect()
+    c.execute(
+        "CREATE TABLE src AS "
+        f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)"
+    )
     yield c
     c.close()
 
 
+def _query(n):
+    # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the
+    # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort).
+    return f"SELECT a, b, s FROM src LIMIT {n}"
+
+
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_fetchall(benchmark, con, n):
     q = _query(n)

From 4f24ed7371e202fed509c741041c9f77eb4aba6c Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Wed, 1 Jul 2026 13:01:05 +0200
Subject: [PATCH 3/7] Fix benchmarks and update CodSpeed workflow

---
 .github/workflows/codspeed.yml          |  21 ++---
 benchmarks/test_arrow_perf.py           |  88 +++++++++++++--------
 benchmarks/test_cardinality_perf.py     |  54 +++++++------
 benchmarks/test_fetch_perf.py           |  98 ++++++++++++++---------
 benchmarks/test_ingest_native_perf.py   |  76 +++++++++++-------
 benchmarks/test_ingest_numpy_perf.py    |  81 +++++++++++--------
 benchmarks/test_pandas_perf.py          | 101 +++++++++++++++---------
 benchmarks/test_produce_numpy_perf.py   |  89 ++++++++++++---------
 benchmarks/test_types_roundtrip_perf.py |  58 +++++++-------
 benchmarks/test_udf_perf.py             |  63 +++++++++------
 pyproject.toml                          |   1 +
 11 files changed, 431 insertions(+), 299 deletions(-)

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index fcd6b7ca..c82465d2 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -1,19 +1,12 @@
-# Performance-regression benchmarks via CodSpeed, in deterministic instruction-count (simulation) mode.
+# Perf-regression benchmarks via CodSpeed in instruction-count (simulation) mode: deterministic, so the whole
+# suite is gate-able (no walltime noise, no gated/informational split).
 #
-# TOKENLESS FOR NOW: the CodSpeed action's token is only needed to UPLOAD results to the CodSpeed
-# dashboard (the hosted PR-gate). Without it the action still RUNS every benchmark under Valgrind and
-# reports the instruction counts in the job log. To turn on the hosted regression gate later: create a
-# CodSpeed project for the repo and either (public repo) rely on the OIDC `id-token: write` permission
-# below, or add a `CODSPEED_TOKEN` repo secret and pass `token: ${{ secrets.CODSPEED_TOKEN }}` to the action.
+# TOKENLESS: the token is only for uploading to the CodSpeed dashboard. Without it the action still runs every
+# benchmark and reports counts in the job log. For the hosted gate later, create a CodSpeed project and rely on
+# the OIDC id-token permission below (public repo), or add a CODSPEED_TOKEN secret and pass token: to the action.
 #
-# Why simulation (instruction-count) and not walltime: instruction counts are deterministic even for the
-# multi-threaded / engine-heavy paths (Valgrind serializes and counts), so the whole suite is gate-able and
-# there is no need to split "gated vs informational" the way noisy local walltime required. Instruction count
-# is exactly the signal that would have caught the LIST/ARRAY df() regression cleanly.
-#
-# NOTE: this workflow has not been run in CI yet; the build steps mirror the documented dev build (CLAUDE.md)
-# and will likely need a shakeout run. Valgrind is slow (~20-50x); if the full suite is too slow, trim the
-# largest-N benchmarks or run a curated subset.
+# Not yet run in CI; the build mirrors the dev build (CLAUDE.md) and will need a shakeout. Valgrind is slow
+# (~20-50x); trim the largest-N benchmarks if the suite is too slow.
 
 name: Benchmarks
 
diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py
index e6fc43e0..244663bc 100644
--- a/benchmarks/test_arrow_perf.py
+++ b/benchmarks/test_arrow_perf.py
@@ -1,41 +1,46 @@
-"""Standalone CodSpeed benchmark module for the Arrow read/write binding paths — NOT integrated
-(not in pyproject, not in CI, not committed). Run under each build's interpreter and compare:
-
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
-  cd /Users/evert/projects/duckdb-python/wt-cutover
-  $M -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-  $C -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-
-DESIGN — the data must be FULLY MOVED, not lazily wrapped, or the benchmark measures nothing:
-  * READ (arrow -> duckdb): the duckdb ENGINE must scan every value. We aggregate over the actual
-    columns (sum/length), NOT count(*) -- count(*) is answered from arrow metadata without touching data.
-  * WRITE (duckdb -> arrow): the CONSUMER must materialize everything.
-      - to_arrow_table() / pl() are EAGER (the full table / polars DataFrame is built).
-      - to_arrow_reader() is LAZY -- duckdb only produces a batch when it is pulled -- so we iterate the
-        whole stream to actually exercise and consume the write path.
-
-pyarrow/polars are pinned to the SAME version in both .venv-release, so the A/B delta is purely the binding.
+"""CodSpeed benchmark: Arrow read/write paths. Standalone, not in CI.
+
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_arrow_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+Data must be fully moved or nothing is measured: READ aggregates over real columns (sum/length, not count(*),
+which arrow answers from metadata); WRITE materializes the result (to_arrow_reader is lazy, so it is drained).
 """
 
-import duckdb
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import pyarrow as pa
 import pytest
 
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
 N = 500_000
 WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)"
 WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)"
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
 @pytest.fixture(scope="module")
-def arrow_numeric():
+def arrow_numeric() -> pa.Table:
+    """Return a two-column numeric arrow table."""
     return pa.table(
         {
             "a": pa.array(range(N), type=pa.int64()),
@@ -45,12 +50,14 @@ def arrow_numeric():
 
 
 @pytest.fixture(scope="module")
-def arrow_string():
+def arrow_string() -> pa.Table:
+    """Return a single-column string arrow table."""
     return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())})
 
 
 @pytest.fixture(scope="module")
-def arrow_numeric_batches(arrow_numeric):
+def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]:
+    """Return the schema and record batches for the numeric table."""
     # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round
     return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)
 
@@ -60,12 +67,16 @@ def arrow_numeric_batches(arrow_numeric):
 # --------------------------------------------------------------------------- #
 
 
-def test_read_arrow_numeric(benchmark, con, arrow_numeric):
+def test_read_arrow_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table
+) -> None:
+    """Benchmark scanning a numeric arrow table."""
     con.register("t_num", arrow_numeric)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())
 
 
-def test_read_arrow_string(benchmark, con, arrow_string):
+def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None:
+    """Benchmark scanning a string arrow table."""
     con.register("t_str", arrow_string)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())
 
@@ -75,10 +86,15 @@ def test_read_arrow_string(benchmark, con, arrow_string):
 # full scan of every value.
 
 
-def test_read_arrow_reader_numeric(benchmark, con, arrow_numeric_batches):
+def test_read_arrow_reader_numeric(
+    benchmark: BenchmarkFixture,
+    con: duckdb.DuckDBPyConnection,
+    arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]],
+) -> None:
+    """Benchmark scanning a streaming record-batch reader."""
     schema, batches = arrow_numeric_batches
 
-    def run():
+    def run() -> list:
         reader = pa.RecordBatchReader.from_batches(schema, iter(batches))
         con.register("t_rdr", reader)
         return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall()
@@ -92,16 +108,20 @@ def run():
 # --------------------------------------------------------------------------- #
 
 
-def test_write_arrow_table_numeric(benchmark, con):
+def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a numeric result to an arrow table."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table())
 
 
-def test_write_arrow_table_string(benchmark, con):
+def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a string result to an arrow table."""
     benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table())
 
 
-def test_write_arrow_reader_consumed(benchmark, con):
-    def run():
+def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark draining a lazy arrow record-batch reader."""
+
+    def run() -> int:
         reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000)
         rows = 0
         for batch in reader:  # drain the lazy stream so duckdb actually produces every batch
@@ -111,9 +131,11 @@ def run():
     benchmark(run)
 
 
-def test_write_polars_numeric(benchmark, con):
+def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a numeric result to a polars frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).pl())
 
 
-def test_write_polars_string(benchmark, con):
+def test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a string result to a polars frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).pl())
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
index 3fe4ee0d..bf49dfc1 100644
--- a/benchmarks/test_cardinality_perf.py
+++ b/benchmarks/test_cardinality_perf.py
@@ -1,33 +1,38 @@
-"""Standalone CodSpeed benchmark: the RESULT-CARDINALITY (rows-to-Python) sweep. Run under each build:
+"""CodSpeed benchmark: the result-cardinality (rows-to-Python) sweep. Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-
-REDESIGN NOTE: the first version swept `ORDER BY a DESC LIMIT n` over a fixed source. That was wrong:
-the engine's full top-N SORT (~3-14ms, itself variable) dominated and swamped the per-row conversion
-signal, and the numbers came out non-monotone. This version pre-materializes the fixed source table ONCE
-and sweeps `SELECT * FROM src LIMIT n` with NO ORDER BY: a plain LIMIT early-stops the scan at n rows, so
-the engine cost is cheap and monotone in n, and the rows-to-Python CONVERSION is the dominant n-varying
-cost. That gives a clean, monotone per-row slope; the A/B delta at each n isolates the binding, and a build
-whose slope is steeper has a per-row conversion regression. n=100 is the overhead regime (the natural
-instruction-count-gate point); n=100_000 is throughput.
-
-3 columns (BIGINT, DOUBLE, VARCHAR) so per-row conversion is non-trivial. numpy/pandas/pyarrow are pinned to
-the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_cardinality_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+Sweeps `SELECT * FROM src LIMIT n` (no ORDER BY) over a pre-materialized 3-column source: a plain LIMIT
+early-stops the scan, so the per-row conversion dominates and the slope is monotone in n. A steeper slope on
+one build is a per-row conversion regression. n=100 is the overhead regime, n=100_000 is throughput.
+(An earlier ORDER BY version was dropped: the top-N sort swamped the signal.)
 """
 
-import duckdb
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import pytest
 
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
 SRC_ROWS = 200_000
 LIMITS = [100, 1_000, 10_000, 100_000]
 
 
 @pytest.fixture(scope="module")
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a connection over a once-materialized source table."""
     # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be
     # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows.
     c = duckdb.connect()
@@ -39,28 +44,31 @@ def con():
     c.close()
 
 
-def _query(n):
+def _query(n: int) -> str:
     # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the
     # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort).
     return f"SELECT a, b, s FROM src LIMIT {n}"
 
 
 @pytest.mark.parametrize("n", LIMITS)
-def test_limit_fetchall(benchmark, con, n):
+def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    """Benchmark fetchall over a LIMIT n sweep."""
     q = _query(n)
     con.execute(q).fetchall()  # warm
     benchmark(lambda: con.execute(q).fetchall())
 
 
 @pytest.mark.parametrize("n", LIMITS)
-def test_limit_df(benchmark, con, n):
+def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    """Benchmark df() over a LIMIT n sweep."""
     q = _query(n)
     con.sql(q).df()  # warm
     benchmark(lambda: con.sql(q).df())
 
 
 @pytest.mark.parametrize("n", LIMITS)
-def test_limit_to_arrow(benchmark, con, n):
+def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    """Benchmark to_arrow_table() over a LIMIT n sweep."""
     q = _query(n)
     con.sql(q).to_arrow_table()  # warm
     benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py
index 8c8ef20a..94a53c30 100644
--- a/benchmarks/test_fetch_perf.py
+++ b/benchmarks/test_fetch_perf.py
@@ -1,57 +1,70 @@
-"""Standalone CodSpeed benchmark module — NOT integrated (not in pyproject, not in CI, not committed).
+"""CodSpeed benchmark: row fetch paths (fetchall, fetchone iteration, expression construction). Standalone, not in CI.
 
-Purpose: A/B the binding-layer perf between the two builds (pybind11 `main` vs nanobind cutover), in particular
-the narrow-column `fetchall` regression. Run the SAME file under each build's interpreter and compare:
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_fetch_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
-  cd /Users/evert/projects/duckdb-python/wt-cutover
-  $M -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-  $C -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-
-NOTE: macOS arm64 has no Valgrind, so only `--codspeed-mode=walltime` works locally (wall-clock stats). The
-deterministic instruction-count mode (`--codspeed-mode=simulation`) needs Linux + the CodSpeed instrument
-(CI, or `codspeed run` in a Linux container). In CI/cloud, CodSpeed compares each run against a git baseline;
-locally we get the same benchmark workflow but A/B by running the file under the two interpreters by hand.
+Only walltime works locally (no Valgrind on macOS arm64); the deterministic instruction-count mode needs Linux (CI).
+Walltime is noisy on sub-ms benchmarks.
 """
 
-import duckdb
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import pytest
 
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
-def _bench_fetchall(benchmark, con, query):
+def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.execute(query).fetchall()  # warm the engine before measuring
     benchmark(lambda: con.execute(query).fetchall())
 
 
-def test_fetchall_int(benchmark, con):
+def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a single BIGINT column."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)")
 
 
-def test_fetchall_smallint(benchmark, con):
+def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a single INTEGER column."""
     _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)")
 
 
-def test_fetchall_double(benchmark, con):
+def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a single DOUBLE column."""
     _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM range(200000) t(i)")
 
 
-def test_fetchall_2int(benchmark, con):
+def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of two BIGINT columns."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)")
 
 
-def test_fetchall_str(benchmark, con):
+def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a single VARCHAR column."""
     _bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)")
 
 
-def test_fetchall_mixed(benchmark, con):
+def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a mixed scalar/list/struct row."""
     query = (
         "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, "
         "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)"
@@ -59,10 +72,11 @@ def test_fetchall_mixed(benchmark, con):
     _bench_fetchall(benchmark, con, query)
 
 
-def test_fetchone_iter(benchmark, con):
+def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark iterating a result one row at a time with fetchone."""
     query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
 
-    def run():
+    def run() -> None:
         rel = con.execute(query)
         while rel.fetchone() is not None:
             pass
@@ -78,35 +92,40 @@ def run():
 # --------------------------------------------------------------------------- #
 
 
-def test_fetchall_int_gate(benchmark, con):
+def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark the small-N BIGINT instruction-count gate."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)")
 
 
-def test_fetchall_2int_gate(benchmark, con):
+def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark the small-N two-BIGINT instruction-count gate."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)")
 
 
-def test_fetchall_null_heavy(benchmark, con):
-    _bench_fetchall(
-        benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)"
-    )
+def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a half-NULL BIGINT column."""
+    _bench_fetchall(benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)")
 
 
-def test_fetchall_timestamptz(benchmark, con):
+def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a TIMESTAMPTZ column."""
     _bench_fetchall(
         benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)"
     )
 
 
-def test_fetchall_decimal128(benchmark, con):
+def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a 128-bit DECIMAL column."""
     _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)")
 
 
-def test_fetchall_blob(benchmark, con):
+def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a BLOB column."""
     _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)")
 
 
-def test_fetchall_mixed_wide(benchmark, con):
+def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchall of a heterogeneous wide-type row."""
     # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile
     # from the homogeneous single-type columns above)
     query = (
@@ -116,10 +135,11 @@ def test_fetchall_mixed_wide(benchmark, con):
     _bench_fetchall(benchmark, con, query)
 
 
-def test_fetchmany_batched(benchmark, con):
+def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark draining a result with batched fetchmany."""
     query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
 
-    def run():
+    def run() -> None:
         rel = con.execute(query)
         while True:
             rows = rel.fetchmany(10_000)
@@ -129,8 +149,10 @@ def run():
     benchmark(run)
 
 
-def test_expr_many(benchmark):
-    def run():
+def test_expr_many(benchmark: BenchmarkFixture) -> None:
+    """Benchmark building many column/constant expressions."""
+
+    def run() -> int:
         out = []
         for i in range(2000):
             col = duckdb.ColumnExpression(f"col_{i}")
diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py
index 4fca641a..e3f232cc 100644
--- a/benchmarks/test_ingest_native_perf.py
+++ b/benchmarks/test_ingest_native_perf.py
@@ -1,57 +1,63 @@
-"""Standalone CodSpeed benchmark module for NATIVE Python-object ingest (Python list/tuple/dict -> duckdb) —
-NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+"""CodSpeed benchmark: native Python-object ingest (list/tuple/dict -> duckdb). Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_ingest_native_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one
+list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells.
+executemany writes to a real table (CREATE OR REPLACE each round so it doesn't grow across repeats).
+"""
 
-WHY THIS MODULE: native Python-object ingest had ZERO coverage. Every cell goes through TransformPythonValue
-and the GetPythonObjectType ladder (python_conversion.cpp); dicts recurse through TransformDictionaryToStruct;
-executemany re-binds a parameter set per row (pyconnection.cpp ExecuteMany loop).
+from __future__ import annotations
 
-FULL MATERIALIZE: executemany lands N rows in a real table (CREATE OR REPLACE each round so the table does not
-grow across codspeed's repeated invocations). values() builds the value vectors EAGERLY inside the call
-(TransformPythonParamList), and we drain the resulting relation with fetchall so the round-trip is complete.
+from typing import TYPE_CHECKING
 
-NOTE on values() shape: a single list argument to values() becomes ONE row whose COLUMNS are the list items
-(see DuckDBPyConnection::Values, pyconnection.cpp) -- so a list of N scalars is 1 row x N columns and runs
-TransformPythonValue N times; a list of N tuples is 1 row x N nested(LIST) columns; a list of N dicts is
-1 row x N STRUCT columns (TransformDictionaryToStruct). All three exercise the per-cell transform N times.
-"""
+import pytest
 
 import duckdb
-import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
 
 EXECMANY_N = 20_000  # executemany re-binds + executes per row, keep moderate
 WIDE_N = 10_000  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
 @pytest.fixture(scope="module")
-def rows_3col():
+def rows_3col() -> list[tuple[int, float, str]]:
+    """Return parameter rows for a 3-column executemany."""
     return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]
 
 
 @pytest.fixture(scope="module")
-def scalars_wide():
-    return [i for i in range(WIDE_N)]
+def scalars_wide() -> list[int]:
+    """Return a wide row of scalar ints for values()."""
+    return list(range(WIDE_N))
 
 
 @pytest.fixture(scope="module")
-def tuples_wide():
+def tuples_wide() -> list[tuple[int, int, int]]:
+    """Return a wide row of tuples for values()."""
     return [(i, i + 1, i + 2) for i in range(WIDE_N)]
 
 
 @pytest.fixture(scope="module")
-def dicts_wide():
+def dicts_wide() -> list[dict[str, int | str]]:
+    """Return a wide row of dicts for values()."""
     return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]
 
 
@@ -60,11 +66,14 @@ def dicts_wide():
 # --------------------------------------------------------------------------- #
 
 
-def test_ingest_executemany_3col(benchmark, con, rows_3col):
+def test_ingest_executemany_3col(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]]
+) -> None:
+    """Benchmark executemany INSERT of 3-column rows."""
     con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
     con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm
 
-    def run():
+    def run() -> None:
         con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
         con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)
 
@@ -76,18 +85,27 @@ def run():
 # --------------------------------------------------------------------------- #
 
 
-def test_ingest_values_scalars(benchmark, con, scalars_wide):
+def test_ingest_values_scalars(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int]
+) -> None:
+    """Benchmark values() over a wide row of scalars."""
     con.values(scalars_wide).fetchall()  # warm
     benchmark(lambda: con.values(scalars_wide).fetchall())
 
 
-def test_ingest_values_tuples(benchmark, con, tuples_wide):
+def test_ingest_values_tuples(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]]
+) -> None:
+    """Benchmark values() over a wide row of tuples."""
     # each tuple cell -> LIST value (TransformPythonValue recursion)
     con.values(tuples_wide).fetchall()  # warm
     benchmark(lambda: con.values(tuples_wide).fetchall())
 
 
-def test_ingest_values_dicts(benchmark, con, dicts_wide):
+def test_ingest_values_dicts(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]]
+) -> None:
+    """Benchmark values() over a wide row of dicts."""
     # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion)
     con.values(dicts_wide).fetchall()  # warm
     benchmark(lambda: con.values(dicts_wide).fetchall())
diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py
index bb5fc1e8..abbe2a4d 100644
--- a/benchmarks/test_ingest_numpy_perf.py
+++ b/benchmarks/test_ingest_numpy_perf.py
@@ -1,31 +1,32 @@
-"""Standalone CodSpeed benchmark module for the NUMPY ingest paths (numpy / numpy-backed pandas -> duckdb)
-— NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+"""CodSpeed benchmark: numpy ingest paths (numpy / numpy-backed pandas -> duckdb). Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-
-WHY THIS MODULE: the numpy scan (NumpyScan / NumpyArray facade / RawArrayWrapper / pandas-bind / analyzer) is
-the IN-numpy half the nanobind cutover reworked, and several of its branches were untested:
-  * I0-2 object-string scan: the per-row isinstance + PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode
-    transcode ladder (numpy_scan.cpp). GOTCHA (encoded): a meaningful benchmark MUST mix ASCII + non-ASCII +
-    a null sentinel -- ASCII-only misses the transcode + null-detection ladder entirely.
-  * I0-1 double NaN->NULL loop (numpy_scan.cpp) -- the reworked float path.
-  * NULL-heavy masked scan: ScanNumpyMasked + ApplyMask (pandas nullable Int64).
-  * I1-3 analyzer bind: PandasAnalyzer::Analyze samples rows through the GetItemType ladder. This is a per-BIND
-    cost, independent of row count, so it is the ONE place count(*) is the correct consume (the cost is at bind,
-    not scan); every other READ here aggregates over real columns (sum/length) to force a full engine scan.
-  * I1-8 numpy ndarray / dict-of-arrays via the replacement scan (resolved from a module global).
-
-numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_ingest_numpy_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+Covers the object-string scan (ASCII zero-copy vs transcode ladder), the NaN->NULL float loop, the masked
+scan, and analyzer bind. Gotchas: the object-string benchmark MUST mix ASCII + non-ASCII + a null or it misses
+the ladder; analyzer bind is the one place count(*) is correct (cost is at bind, not scan) while every other
+READ aggregates over real columns.
 """
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
 import duckdb
 import numpy as np
 import pandas as pd
-import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
 
 N = 500_000
 ANALYZER_N = 200_000
@@ -42,26 +43,30 @@
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
 @pytest.fixture(scope="module")
-def df_double_with_nan():
+def df_double_with_nan() -> pd.DataFrame:
+    """Return a numpy-backed double frame with real NaNs."""
     a = np.arange(N, dtype="float64") * 1.5
     a[::10] = np.nan  # real NaNs -> NaN->NULL conversion loop
     return pd.DataFrame({"a": a})
 
 
 @pytest.fixture(scope="module")
-def df_object_string_mixed():
+def df_object_string_mixed() -> pd.DataFrame:
+    """Return an object-string frame mixing ASCII, non-ASCII, and nulls."""
     return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)})
 
 
 @pytest.fixture(scope="module")
-def df_masked_int():
+def df_masked_int() -> pd.DataFrame:
+    """Return a nullable-Int64 frame that takes the masked scan path."""
     # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask
     arr = pd.array(np.arange(N), dtype="Int64")
     arr[::10] = pd.NA
@@ -69,7 +74,8 @@ def df_masked_int():
 
 
 @pytest.fixture(scope="module")
-def df_object_mixed_types():
+def df_object_mixed_types() -> pd.DataFrame:
+    """Return an object frame of mixed python types for analyzer bind."""
     return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)})
 
 
@@ -78,21 +84,31 @@ def df_object_mixed_types():
 # --------------------------------------------------------------------------- #
 
 
-def test_read_numpy_dict_numeric(benchmark, con):
+def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark scanning a dict of numpy arrays via the replacement scan."""
     benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall())
 
 
-def test_read_numpy_double_with_nan(benchmark, con, df_double_with_nan):
+def test_read_numpy_double_with_nan(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame
+) -> None:
+    """Benchmark scanning a numpy double column with NaNs."""
     con.register("t", df_double_with_nan)
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
-def test_read_numpy_masked_int(benchmark, con, df_masked_int):
+def test_read_numpy_masked_int(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame
+) -> None:
+    """Benchmark scanning a masked nullable-int column."""
     con.register("t", df_masked_int)
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
-def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed):
+def test_read_numpy_object_string_mixed(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame
+) -> None:
+    """Benchmark scanning a mixed object-string column."""
     con.register("t", df_object_string_mixed)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
@@ -103,7 +119,10 @@ def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed):
 # --------------------------------------------------------------------------- #
 
 
-def test_bind_analyzer_object(benchmark, con, df_object_mixed_types):
+def test_bind_analyzer_object(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame
+) -> None:
+    """Benchmark the analyzer bind of a mixed-type object column."""
     con.register("t", df_object_mixed_types)
     con.execute("SELECT count(*) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall())
diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py
index 34a0948d..1a4c09f0 100644
--- a/benchmarks/test_pandas_perf.py
+++ b/benchmarks/test_pandas_perf.py
@@ -1,32 +1,32 @@
-"""Standalone CodSpeed benchmark module for the pandas read/write binding paths, comparing NUMPY-backed vs
-ARROW-backed DataFrames — NOT integrated (not in pyproject, not in CI, not committed). Run under each build:
-
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python
-  cd /Users/evert/projects/duckdb-python/wt-cutover
-  $M -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-  $C -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts=
-
-WHY BOTH BACKINGS: when duckdb scans a pandas DataFrame, the binding path depends on each column's backing:
-  * numpy-backed columns (dtype int64 / float64 / object) -> the NUMPY scan path (NumpyArray facade,
-    RawArrayWrapper, pandas/bind.cpp, analyzer.cpp) -- this is the path the nanobind cutover reworked
-    NON-TRIVIALLY, so it gets first-class coverage here.
-  * arrow-backed columns (pandas ArrowDtype, e.g. int64[pyarrow]) -> the ARROW scan path (near zero-copy).
-On the WRITE side, duckdb's native pandas output (rel.df()) is NUMPY-backed; an arrow-backed pandas frame is
-produced via duckdb-arrow + pyarrow.to_pandas(ArrowDtype) (pyarrow.to_pandas is identical on both builds, so
-the A/B delta is still the duckdb binding).
-
-FULL CONSUME (same discipline as the arrow module): READ aggregates over the actual columns (sum/length, NOT
-count(*) which is answered from metadata), and WRITE materializes the entire DataFrame.
-
-numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""CodSpeed benchmark: pandas read/write, numpy-backed vs arrow-backed DataFrames. Standalone, not in CI.
+
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd /Users/evert/projects/duckdb-python/wt-codspeed
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_pandas_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+The binding path depends on column backing: numpy-backed columns take the NumpyArray scan path, arrow-backed
+(pandas ArrowDtype) take the near-zero-copy arrow path. Full consume: READ aggregates over real columns (not
+count(*)), WRITE materializes the whole frame.
 """
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+import pytest
+
 import duckdb
 import numpy as np
 import pandas as pd
-import pyarrow as pa
-import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
 
 N = 500_000
 WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)"
@@ -35,25 +35,29 @@
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
 @pytest.fixture(scope="module")
-def df_numpy_numeric():
+def df_numpy_numeric() -> pd.DataFrame:
+    """Return a numpy-backed numeric frame."""
     return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5})
 
 
 @pytest.fixture(scope="module")
-def df_numpy_string():
+def df_numpy_string() -> pd.DataFrame:
+    """Return a numpy-backed object-string frame."""
     # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path)
     return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)})
 
 
 @pytest.fixture(scope="module")
-def df_arrow_numeric():
+def df_arrow_numeric() -> pd.DataFrame:
+    """Return an arrow-backed numeric frame."""
     return pd.DataFrame(
         {
             "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())),
@@ -63,7 +67,8 @@ def df_arrow_numeric():
 
 
 @pytest.fixture(scope="module")
-def df_arrow_string():
+def df_arrow_string() -> pd.DataFrame:
+    """Return an arrow-backed string frame."""
     return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))})
 
 
@@ -72,22 +77,34 @@ def df_arrow_string():
 # --------------------------------------------------------------------------- #
 
 
-def test_read_pandas_numpy_numeric(benchmark, con, df_numpy_numeric):
+def test_read_pandas_numpy_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame
+) -> None:
+    """Benchmark scanning a numpy-backed numeric frame."""
     con.register("t", df_numpy_numeric)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
-def test_read_pandas_numpy_string(benchmark, con, df_numpy_string):
+def test_read_pandas_numpy_string(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame
+) -> None:
+    """Benchmark scanning a numpy-backed string frame."""
     con.register("t", df_numpy_string)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
-def test_read_pandas_arrow_numeric(benchmark, con, df_arrow_numeric):
+def test_read_pandas_arrow_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame
+) -> None:
+    """Benchmark scanning an arrow-backed numeric frame."""
     con.register("t", df_arrow_numeric)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
-def test_read_pandas_arrow_string(benchmark, con, df_arrow_string):
+def test_read_pandas_arrow_string(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame
+) -> None:
+    """Benchmark scanning an arrow-backed string frame."""
     con.register("t", df_arrow_string)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
@@ -99,11 +116,13 @@ def test_read_pandas_arrow_string(benchmark, con, df_arrow_string):
 # --------------------------------------------------------------------------- #
 
 
-def test_write_pandas_numpy_numeric(benchmark, con):
+def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a numeric result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).df())
 
 
-def test_write_pandas_numpy_string(benchmark, con):
+def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a string result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).df())
 
 
@@ -112,7 +131,8 @@ def test_write_pandas_numpy_string(benchmark, con):
 # datetime column (TimestampConvert + ConvertDateTimeTypes).
 
 
-def test_write_pandas_numpy_numeric_with_nulls(benchmark, con):
+def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a numeric result with NULLs (every 10th row) to a numpy-backed frame."""
     q = (
         "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
         "CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range(500000) t(i)"
@@ -120,14 +140,17 @@ def test_write_pandas_numpy_numeric_with_nulls(benchmark, con):
     benchmark(lambda: con.sql(q).df())
 
 
-def test_write_pandas_numpy_timestamp(benchmark, con):
+def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a timestamp result to a numpy-backed frame."""
     q = "SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range(500000) t(i)"
     benchmark(lambda: con.sql(q).df())
 
 
-def test_write_pandas_arrow_numeric(benchmark, con):
+def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a numeric result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
 
 
-def test_write_pandas_arrow_string(benchmark, con):
+def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark materializing a string result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
index cfe5c281..5ad56254 100644
--- a/benchmarks/test_produce_numpy_perf.py
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -1,36 +1,33 @@
-"""Standalone CodSpeed benchmark module for the COLUMNAR produce paths (duckdb -> numpy/pandas), i.e. df(),
-fetchnumpy(), fetch_df_chunk() — NOT integrated (not in pyproject, not in CI, not committed). Run under each
-build's interpreter and compare:
+"""CodSpeed benchmark: columnar produce paths (df(), fetchnumpy(), fetch_df_chunk()). Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-
-WHY THIS MODULE: the columnar OUT path (FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular) is exactly
-what the nanobind cutover reworked. The under-covered cases are: (1) the WITH-NULLS branch (HAS_NULLS=true ->
-masked_array build -> masked->pd.NA rewrite, array_wrapper.cpp / pyresult.cpp) -- NEVER previously benchmarked
-and the most-changed code; (2) datetime; (3) fetchnumpy without the DataFrame wrap; (4) fetch_df_chunk; and
-the wide-internal types HUGEINT (->double cast), UUID (UUIDConvert), DECIMAL(28,x) (ConvertDecimalInternal
-<hugeint_t>) that exercise distinct OUT-col converters.
-
-GOTCHA (encoded below): OUT-col NULL benchmarks use REAL DuckDB nulls (CASE WHEN .. THEN NULL). A no-null
-column silently takes the cheap std::move path and the masked-array branch never triggers, so it would measure
-the wrong thing.
-
-FULL CONSUME: df() / fetchnumpy() eagerly materialize the whole column set; fetch_df_chunk is drained in a loop.
-
-numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_produce_numpy_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+Covers the with-NULLS branch (masked_array build), datetime, and wide-internal types (hugeint/uuid/decimal128).
+Gotcha: NULL benchmarks use real DuckDB nulls (CASE WHEN); a no-null column takes the cheap path and measures
+the wrong thing. Full consume: df()/fetchnumpy() materialize the columns; fetch_df_chunk is drained in a loop.
 """
 
+from __future__ import annotations
+
 import gc
 import sys
 import tracemalloc
+from typing import TYPE_CHECKING
+
+import pytest
 
 import duckdb
 import numpy as np  # noqa: F401  (pinned identically A/B; imported so the env matches the other modules)
-import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
 
 N = 500_000
 TYPE_N = 200_000  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
@@ -48,18 +45,19 @@
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
-def _bench_df(benchmark, con, query):
+def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.sql(query).df()  # warm
     benchmark(lambda: con.sql(query).df())
 
 
-def _bench_numpy(benchmark, con, query):
+def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.sql(query).fetchnumpy()  # warm
     benchmark(lambda: con.sql(query).fetchnumpy())
 
@@ -69,32 +67,39 @@ def _bench_numpy(benchmark, con, query):
 # --------------------------------------------------------------------------- #
 
 
-def test_df_numeric(benchmark, con):
+def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a numeric result."""
     _bench_df(benchmark, con, Q_NUM)
 
 
-def test_df_numeric_with_nulls(benchmark, con):
+def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a null-heavy numeric result."""
     # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch)
     _bench_df(benchmark, con, Q_NUM_NULLS)
 
 
-def test_df_string(benchmark, con):
+def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a string result."""
     _bench_df(benchmark, con, Q_STR)
 
 
-def test_df_timestamp(benchmark, con):
+def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a timestamp result."""
     _bench_df(benchmark, con, Q_TS)
 
 
-def test_df_hugeint(benchmark, con):
+def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a hugeint result."""
     _bench_df(benchmark, con, Q_HUGEINT)
 
 
-def test_df_uuid(benchmark, con):
+def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a uuid result."""
     _bench_df(benchmark, con, Q_UUID)
 
 
-def test_df_decimal128(benchmark, con):
+def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark df() of a 128-bit decimal result."""
     _bench_df(benchmark, con, Q_DEC128)
 
 
@@ -103,11 +108,13 @@ def test_df_decimal128(benchmark, con):
 # --------------------------------------------------------------------------- #
 
 
-def test_fetchnumpy_numeric(benchmark, con):
+def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchnumpy() of a numeric result."""
     _bench_numpy(benchmark, con, Q_NUM)
 
 
-def test_fetchnumpy_numeric_with_nulls(benchmark, con):
+def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark fetchnumpy() of a null-heavy numeric result."""
     _bench_numpy(benchmark, con, Q_NUM_NULLS)
 
 
@@ -116,8 +123,10 @@ def test_fetchnumpy_numeric_with_nulls(benchmark, con):
 # --------------------------------------------------------------------------- #
 
 
-def test_fetch_df_chunk_loop(benchmark, con):
-    def run():
+def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark draining a result with fetch_df_chunk()."""
+
+    def run() -> int:
         rel = con.sql(Q_NUM)
         rows = 0
         while True:
@@ -136,7 +145,8 @@ def run():
 # --------------------------------------------------------------------------- #
 
 
-def test_torch_numeric(benchmark, con):
+def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark torch() of a numeric result (skipped if torch is absent)."""
     pytest.importorskip("torch")
     q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)"
     con.sql(q).torch()  # warm
@@ -160,7 +170,8 @@ def test_torch_numeric(benchmark, con):
 # --------------------------------------------------------------------------- #
 
 
-def test_mem_df_with_nulls():
+def test_mem_df_with_nulls() -> None:
+    """Guard the Python-tracked peak allocation of a null-heavy df() call."""
     con = duckdb.connect()
     try:
         tracemalloc.start()
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
index 3e92f12d..7fb80c4b 100644
--- a/benchmarks/test_types_roundtrip_perf.py
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -1,31 +1,31 @@
-"""Standalone CodSpeed benchmark module: the TYPE x DIRECTION produce matrix — NOT integrated (not in
-pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+"""CodSpeed benchmark: the type x direction produce matrix (fetchall / df / to_arrow per type). Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-
-WHY THIS MODULE: a single systematic sweep of one logical type per column across the three produce directions
-  * OUT-row   = fetchall()          -> FromValue per cell (python_objects.cpp)
-  * OUT-col   = df()                -> ArrayWrapper / ConvertColumn (array_wrapper.cpp)
-  * OUT-arrow = to_arrow_table()    -> arrow export converters
-so a regression localizes to (type, direction). Includes the iqmo/bareduckdb cross-check breadth that the
-narrow-numeric homogeneous benchmarks miss: HUGEINT (PyLong_FromString / hugeint->double / int128 export),
-UUID (uuid.UUID per row / UUIDConvert), DECIMAL(28,6) int128-internal (ConvertDecimalInternal<hugeint_t>),
-and a long-varchar (>64 chars) that shifts the string paths from overhead-bound to copy-bound.
-
-FULL CONSUME: fetchall and df materialize everything; to_arrow_table is eager. NOTE: to_arrow_table on a
-materialized result re-runs the query with the GIL released (PromoteMaterializedToArrow), so the OUT-arrow
-column is engine-parallel and walltime-NOISY -- treat it as informational, not a hard gate.
-
-numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_types_roundtrip_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+One logical type per column across three directions, so a regression localizes to (type, direction). Includes
+the wide types the narrow-numeric benchmarks miss: hugeint, uuid, decimal128, long varchar. Note: to_arrow on a
+materialized result re-runs the query with the GIL released, so the arrow column is engine-parallel and
+walltime-noisy: informational, not a hard gate.
 """
 
-import duckdb
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
 import pytest
 
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
 N = 100_000
 
 # one logical type per column; long-varchar is intentionally > 64 chars
@@ -46,32 +46,36 @@
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
-def _query(type_name):
+def _query(type_name: str) -> str:
     return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
 
 
 @pytest.mark.parametrize("type_name", TYPES)
-def test_out_row_fetchall(benchmark, con, type_name):
+def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark fetchall of one logical type per column."""
     q = _query(type_name)
     con.execute(q).fetchall()  # warm
     benchmark(lambda: con.execute(q).fetchall())
 
 
 @pytest.mark.parametrize("type_name", TYPES)
-def test_out_col_df(benchmark, con, type_name):
+def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark df() of one logical type per column."""
     q = _query(type_name)
     con.sql(q).df()  # warm
     benchmark(lambda: con.sql(q).df())
 
 
 @pytest.mark.parametrize("type_name", TYPES)
-def test_out_arrow_table(benchmark, con, type_name):
+def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark to_arrow_table() of one logical type per column (informational only)."""
     # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy)
     q = _query(type_name)
     con.sql(q).to_arrow_table()  # warm
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
index ef398ebb..34896bcc 100644
--- a/benchmarks/test_udf_perf.py
+++ b/benchmarks/test_udf_perf.py
@@ -1,28 +1,30 @@
-"""Standalone CodSpeed benchmark module for the Python UDF binding paths (src/python_udf.cpp) — NOT integrated
-(not in pyproject, not in CI, not committed). Run under each build's interpreter and compare:
+"""CodSpeed benchmark: Python UDF paths (native scalar + vectorized arrow). Standalone, not in CI.
 
-  M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
-  C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
   cd /Users/evert/projects/duckdb-python/wt-codspeed
-  $M -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
-  $C -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_udf_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
 
-WHY THIS MODULE: the whole UDF subsystem had ZERO benchmark coverage. The NATIVE scalar UDF is the single
-biggest untested per-call-overhead path in the binding -- per row it builds a TupleBuilder of args, calls
-PyObject_CallObject, and runs TransformPythonObject on the result (python_udf.cpp). The ARROW (vectorized) UDF
-is the columnar counterpart: ConvertDataChunkToPyArrowTable + the Python call + ConvertArrowTableToVector cast.
+Native scalar = one Python call per row (arg build + PyObject_CallObject + result transform); arrow = one call
+per chunk. Full consume: each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row.
+"""
 
-FULL CONSUME (same discipline as the other modules): every UDF benchmark wraps the call in a sum()/length()
-aggregate so the ENGINE evaluates the UDF on every row (count(*) would skip it). The aggregate output is a
-single row, so the measured cost is the per-row (native) / per-chunk (arrow) UDF invocation, not the fetch.
+from __future__ import annotations
 
-numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
-"""
+from typing import TYPE_CHECKING
 
-import duckdb
 import pytest
+
+import duckdb
 from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR
 
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
 pa = pytest.importorskip("pyarrow")
 pc = pytest.importorskip("pyarrow.compute")
 
@@ -31,13 +33,14 @@
 
 
 @pytest.fixture
-def con():
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh connection, closed on teardown."""
     c = duckdb.connect()
     yield c
     c.close()
 
 
-def _bench(benchmark, con, query):
+def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.execute(query).fetchall()  # warm the engine + import caches before measuring
     benchmark(lambda: con.execute(query).fetchall())
 
@@ -47,22 +50,26 @@ def _bench(benchmark, con, query):
 # --------------------------------------------------------------------------- #
 
 
-def test_udf_native_int_1arg(benchmark, con):
+def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a 1-arg native int scalar UDF."""
     con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
-def test_udf_native_int_2arg(benchmark, con):
+def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a 2-arg native int scalar UDF."""
     con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
-def test_udf_native_double_1arg(benchmark, con):
+def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a 1-arg native double scalar UDF."""
     con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
     _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
 
 
-def test_udf_native_string(benchmark, con):
+def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a native string scalar UDF."""
     con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
     _bench(
         benchmark,
@@ -71,7 +78,8 @@ def test_udf_native_string(benchmark, con):
     )
 
 
-def test_udf_native_null_inputs(benchmark, con):
+def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark the validity short-circuit for NULL inputs to a native UDF."""
     # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the
     # validity short-circuit, not the Python call, so the UDF only ever sees non-NULL rows.
     con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
@@ -88,17 +96,20 @@ def test_udf_native_null_inputs(benchmark, con):
 # --------------------------------------------------------------------------- #
 
 
-def test_udf_arrow_int(benchmark, con):
+def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a vectorized arrow int UDF."""
     con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
 
 
-def test_udf_arrow_double(benchmark, con):
+def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark a vectorized arrow double UDF."""
     con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
 
 
-def test_udf_arrow_null_inputs(benchmark, con):
+def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF."""
     # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call
     # and reconstructs the result vector afterwards -- this is the selvec compaction/reconstruction cost.
     con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
diff --git a/pyproject.toml b/pyproject.toml
index 53cfa616..fd0ef328 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -250,6 +250,7 @@ test = [ # dependencies used for running tests
     "pytest-timeout",
     "pytest-timestamper",
     "pytest-xdist", # parallel test execution (-n auto); without this `uv sync --reinstall` prunes a manual install
+    "pytest_codspeed",
     "coverage",
     "gcovr; sys_platform != 'win32' or platform_machine != 'ARM64'",
     "gcsfs; sys_platform != 'win32' or platform_machine != 'ARM64'",

From 049b7e316afcff2b3f551b936eabdfca97ab1d97 Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Wed, 1 Jul 2026 18:29:07 +0200
Subject: [PATCH 4/7] more benchmarking

---
 .github/workflows/codspeed.yml                | 120 +++++--
 benchmarks/PLAN.md                            |  60 +++-
 benchmarks/_scale.py                          |  40 +++
 benchmarks/compare_baseline.py                | 338 ++++++++++++++++++
 benchmarks/conftest.py                        |  63 ++++
 benchmarks/requirements-bench.txt             |  29 ++
 benchmarks/test_arrow_perf.py                 |  54 ++-
 benchmarks/test_cardinality_perf.py           |  12 +-
 benchmarks/test_concurrency_perf.py           | 136 +++++++
 benchmarks/test_engine_control_perf.py        |  68 ++++
 benchmarks/test_fetch_perf.py                 |  76 ++--
 benchmarks/test_ingest_native_perf.py         |  21 +-
 benchmarks/test_ingest_numpy_perf.py          |  37 +-
 benchmarks/test_pandas_perf.py                |  40 ++-
 benchmarks/test_produce_numpy_perf.py         |  30 +-
 .../test_relational_construction_perf.py      |  43 +++
 benchmarks/test_types_roundtrip_perf.py       |  23 +-
 benchmarks/test_udf_perf.py                   |  28 +-
 pyproject.toml                                |  16 +
 tests/fast/test_binding_pressure_leak.py      | 113 ++++++
 20 files changed, 1177 insertions(+), 170 deletions(-)
 create mode 100644 benchmarks/_scale.py
 create mode 100644 benchmarks/compare_baseline.py
 create mode 100644 benchmarks/conftest.py
 create mode 100644 benchmarks/requirements-bench.txt
 create mode 100644 benchmarks/test_concurrency_perf.py
 create mode 100644 benchmarks/test_engine_control_perf.py
 create mode 100644 benchmarks/test_relational_construction_perf.py
 create mode 100644 tests/fast/test_binding_pressure_leak.py

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index c82465d2..2b7b1664 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -1,20 +1,42 @@
-# Perf-regression benchmarks via CodSpeed in instruction-count (simulation) mode: deterministic, so the whole
-# suite is gate-able (no walltime noise, no gated/informational split).
+# Perf-regression benchmarks: instruction-count (Callgrind) gating against a COMMITTED baseline.
 #
-# TOKENLESS: the token is only for uploading to the CodSpeed dashboard. Without it the action still runs every
-# benchmark and reports counts in the job log. For the hosted gate later, create a CodSpeed project and rely on
-# the OIDC id-token permission below (public repo), or add a CODSPEED_TOKEN secret and pass token: to the action.
+# NO CodSpeed account/dashboard/token/runner. pytest-codspeed's hooks call callgrind_dump_stats_at(<uri>) per
+# benchmark, so a self-hosted `valgrind --tool=callgrind` run writes one dump per benchmark, headed by
+# `desc: Trigger: Client Request: <uri>` with the count on `totals:` (events: Ir). benchmarks/compare_baseline.py
+# parses those dumps and diffs each benchmark against benchmarks/baseline.json (the committed instruction-count
+# baseline). Counts are near-deterministic under Callgrind with PYTHONHASHSEED pinned (~0.1% noise observed;
+# often bit-identical), so a 5% default gate threshold sits far above noise. Validated on a Linux+valgrind box.
 #
-# Not yet run in CI; the build mirrors the dev build (CLAUDE.md) and will need a shakeout. Valgrind is slow
-# (~20-50x); trim the largest-N benchmarks if the suite is too slow.
+# TRIGGERS: nightly `schedule` + manual `workflow_dispatch`. No pull_request/push (and no `paths:` -- neither
+# schedule nor dispatch honors it). A dispatch on a feature branch compares that branch's benchmark counts vs the
+# baseline.json committed on the branch (i.e. main's baseline), answering "did my branch regress vs main".
+#
+# MODES (workflow_dispatch input `regen`):
+#   regen=false (default) -> COMPARE: run + diff vs baseline.json, print a report. REPORT-ONLY for now (never
+#                            fails the job); flip compare_baseline.py to --enforce once trusted.
+#   regen=true            -> REGENERATE: run + write a fresh baseline.json (per-bench counts + provenance meta +
+#                            Option-B binding fractions/auto-move) and upload it as an artifact to commit
+#                            deliberately. Bump benchmarks/requirements-bench.txt in a separate commit FIRST if
+#                            the pins should change, then regen so the baseline matches the committed pins.
+#
+# The concurrency module is EXCLUDED from the Callgrind sweep: Callgrind serializes threads, so its signal
+# (wall-clock GIL contention) is meaningless here; it stays a local walltime tool.
+#
+# MEMORY MODE (a second Callgrind sweep for O(rows) produce peak-RSS) is DESIGNED but DEFERRED -- see PLAN.md.
+#
+# Valgrind is slow (~20-50x); timeout-minutes is a conservative guess -- calibrate after the first CI run.
 
 name: Benchmarks
 
 on:
-  pull_request:
-  push:
-    branches: [main]
+  schedule:
+    - cron: "0 3 * * *" # nightly at 03:00 UTC
   workflow_dispatch:
+    inputs:
+      regen:
+        description: "Regenerate benchmarks/baseline.json (upload as artifact) instead of comparing"
+        type: boolean
+        default: false
 
 concurrency:
   group: codspeed-${{ github.ref }}
@@ -23,25 +45,42 @@ concurrency:
 jobs:
   benchmarks:
     runs-on: ubuntu-latest
+    timeout-minutes: 90 # measured: ~25 min Callgrind sweep at BENCH_SCALE=10 (12-core Linux) + cold build ~10 min; margin for CI
     permissions:
       contents: read
-      id-token: write # enables tokenless (OIDC) upload once a CodSpeed project is linked; harmless otherwise
+    env:
+      PYTHONHASHSEED: "0" # pin hash randomization so dict/struct paths give stable instruction counts (INFRA-6)
+      CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks (the callgrind_dump_stats_at markers)
+      # env-gated row counts (INFRA-4): shrink the O(rows)/per-row-object benchmarks so the Callgrind sweep fits
+      # under timeout-minutes. Local runs leave this unset -> full N. Recorded in baseline.json meta.bench_scale;
+      # a baseline is only comparable to a run at the SAME scale. Calibrated on a 12-core Linux+valgrind box:
+      # BENCH_SCALE=10 -> ~25 min full sweep, and the Option-B move-list matches full-N (fractions shift slightly
+      # but stay on the same side of the cutoff). Most benches floor at 20k rows (_scale.FLOOR), still row-dominated.
+      BENCH_SCALE: "10"
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: recursive # the DuckDB engine submodule is needed to build
           fetch-depth: 0 # setuptools_scm needs history for version detection
 
+      - name: Resolve DuckDB submodule SHA
+        id: duckdb_sha
+        # used for the sccache key AND passed to compare_baseline.py for the engine-bump guard
+        run: echo "sha=$(git rev-parse HEAD:external/duckdb)" >> "$GITHUB_OUTPUT"
+
       - name: Install uv
         uses: astral-sh/setup-uv@v5
         with:
           python-version: "3.13"
 
+      - name: Install valgrind
+        run: sudo apt-get update && sudo apt-get install -y valgrind
+
       - name: Cache sccache
         uses: actions/cache@v4
         with:
           path: ~/.cache/sccache
-          key: sccache-codspeed-${{ hashFiles('external/duckdb') }}
+          key: sccache-codspeed-${{ steps.duckdb_sha.outputs.sha }}
           restore-keys: sccache-codspeed-
 
       - name: Install sccache
@@ -49,19 +88,58 @@ jobs:
           curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \
             | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache
 
-      - name: Build the extension (release) + benchmark deps
+      - name: Build the extension (release) + pinned benchmark deps
         env:
           CMAKE_C_COMPILER_LAUNCHER: sccache
           CMAKE_CXX_COMPILER_LAUNCHER: sccache
         run: |
+          # step 1: build deps only (needed for --no-build-isolation), no project
           uv sync --only-group build --no-install-project -p 3.13
-          uv sync --no-build-isolation --no-editable --reinstall -p 3.13
-          # benchmark deps: keep these pinned in lockstep with any baseline you compare against, so the only
-          # cross-run delta is the binding (numpy/pandas/pyarrow/polars/pytz + the codspeed plugin).
-          uv pip install pytest pytest-codspeed numpy pandas pyarrow polars pytz
+          # step 2: build+install the project (release) + build group, WITHOUT the heavy default `dev` group
+          # (torch/tensorflow/pyspark). uv.lock is gitignored, so it is deliberately NOT relied on for bench deps.
+          uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13
+          # step 3: install the FROZEN, committed bench pins (exact ==). Regenerated deliberately with the baseline
+          # (source list: pyproject [dependency-groups] bench), so the only cross-run delta is the binding.
+          uv pip install -r benchmarks/requirements-bench.txt
+
+      - name: Collect gate node-ids
+        # the gate/informational split (conftest markers) classifies which benchmarks are gate-able; regen uses it. Block scalar on purpose: a plain multi-line `run:` folds newlines to spaces, so a trailing `\` would hand pytest a literal ' ' argument.
+        run: |
+          uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider | grep '::' > gate_list.txt || true
+
+      - name: Run benchmarks under Callgrind (per-benchmark instruction counts)
+        # ONE sweep over all gate+informational benchmarks EXCEPT the concurrency module (Callgrind serializes
+        # threads -> its wall-clock signal is meaningless and it is expensive). Each benchmark emits a callgrind
+        # dump keyed by its uri; the pytest-codspeed hooks obj-skip libpython, so counts are clean. --trace-children=yes is required: valgrind launches `uv`, and pytest runs in a child process that would otherwise go uninstrumented.
+        run: |
+          mkdir -p profiles
+          CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \
+            --trace-children=yes --callgrind-out-file="$PWD/profiles/cg.%p.%n" \
+            uv run --no-sync pytest benchmarks/ \
+              --ignore=benchmarks/test_concurrency_perf.py \
+              -m "gate or informational" --codspeed -o addopts= -p no:cacheprovider
+
+      - name: Compare against committed baseline (report-only)
+        if: ${{ !inputs.regen }}
+        # report-only for now: prints the per-benchmark delta table and NEVER fails the job. Add --enforce here
+        # once trusted to fail on a gate regression (informational benches never fail).
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py compare \
+            --profiles profiles --baseline benchmarks/baseline.json \
+            --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
+
+      - name: Regenerate baseline (upload artifact to commit deliberately)
+        if: ${{ inputs.regen }}
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py regen \
+            --profiles profiles --out benchmarks/baseline.json --gate-list gate_list.txt \
+            --git-commit "${{ github.sha }}" --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
 
-      - name: Run benchmarks (instruction-count)
-        uses: CodSpeedHQ/action@v4
+      - name: Upload regenerated baseline
+        if: ${{ inputs.regen }}
+        uses: actions/upload-artifact@v4
         with:
-          mode: simulation
-          run: uv run pytest benchmarks/ --codspeed -o addopts= -p no:cacheprovider
+          name: baseline-update
+          path: benchmarks/baseline.json
diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
index c04f4801..54786083 100644
--- a/benchmarks/PLAN.md
+++ b/benchmarks/PLAN.md
@@ -122,20 +122,58 @@ benchmarks/
 ```
 One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable).
 
+> **Note (reconciled to the implemented model).** The prose below originally described a per-PR CodSpeed
+> commit-diff gate. That is NOT how the suite works now. The implemented model is: **nightly `schedule` +
+> manual `workflow_dispatch`** (no per-PR trigger, no CodSpeed account/token/runner), a **self-hosted
+> `valgrind --tool=callgrind`** sweep that emits one dump per benchmark, and **`compare_baseline.py`** diffing
+> those counts against a **committed `benchmarks/baseline.json`**. See `.github/workflows/codspeed.yml`.
+
 ### Walltime vs instruction-count
 
 - **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`.
-- **CI gate: instruction-count / simulation (Linux + Callgrind)**, deterministic — gate PRs with this.
-
-Instruction-count is ideal AND should gate the GIL-held single-threaded overhead paths: fetchone loop, fetchall/fetchmany, native UDF per-call, native values() ingest, analyzer bind, all per-element converters (FromValue, TransformPythonValue, NumpyScan object/string, ArrayWrapper fill). The historical fetchall regression would be caught cleanly here.
-
-Noisy under instruction-count — keep walltime-only, informational, do NOT hard-gate:
-- to_arrow_table / pl() on materialized results: PromoteMaterializedToArrow re-runs the query parallel with GIL released (`pyresult.cpp:450-477`).
-- Large 1M+ SELECT sum() ingest reads: engine parallel aggregate dominates.
-- read_csv/parquet/json: engine + I/O dominated.
-- GIL-per-chunk streaming (FetchNextRaw, to_record_batch_reader drain).
-
-Gate tactic: pair each large-throughput scenario with a small/1-row variant (e.g. fetchall range(1_000_000) walltime + fetchall range(2048) instruction-count gate) so binding fixed-cost is measured noise-free.
+- **CI: instruction-count via self-hosted Callgrind (Linux)**, near-deterministic (~0.1% noise with
+  `PYTHONHASHSEED=0`; often bit-identical) — compared against the committed baseline, **report-only** for now
+  (flip `compare_baseline.py` to `--enforce` when trusted).
+
+### Marker split + committed-baseline gate (INFRA-1 / Phase-3)
+
+- Every benchmark carries exactly one of `@pytest.mark.gate` / `@pytest.mark.informational` (registered in
+  `conftest.py`). **gate** = binding-dominated, instruction-count-meaningful (fetchone loop, fetchall/fetchmany,
+  df()/fetchnumpy, native UDF per-call, native values()/executemany ingest, analyzer bind, per-element
+  converters). **informational** = engine/library/streaming-diluted, reported but never gated
+  (`to_arrow_table`/`pl()`/`to_pandas` GIL-released re-runs; registered-frame `SELECT sum()` reads;
+  streaming drains; the concurrency module).
+- **Engine floors + Option-B (MEAS-1).** `test_engine_control_perf.py` measures `SELECT sum(...) FROM range(N)`
+  with no Python egress — the engine floor. At baseline **regen**, each mapped numeric-produce gate's binding
+  fraction `= 1 - floor_Ir/bench_Ir` is computed; a gate below the ~25% cutoff is **auto-moved to
+  informational** (a threshold on an engine-diluted total is not meaningful) and the fraction is stored in
+  `baseline.json` for audit. MEAS-1 showed OUT-row fetch and UDFs are ~all binding (stay gate); numeric
+  produce (`df()`/`fetchnumpy`) is a bulk memcpy of ~engine magnitude (auto-move candidate).
+- **Small-N gates are compile+fetch fixed-cost**, not pure fetch (MEAS-1: ~60% compile+engine at `range(2048)`).
+- **Engine-bump guard.** `compare_baseline.py` compares the committed submodule SHA against the baseline's; if
+  they differ, engine-inclusive deltas may reflect the engine bump, so gate deltas are not enforced (regen the
+  baseline for the new engine).
+- **Reproducibility.** `benchmarks/requirements-bench.txt` (frozen `==` pins, from the `[dependency-groups]
+  bench` list) + `benchmarks/baseline.json` are the co-regenerated pair; CI installs the frozen pins (NOT the
+  gitignored `uv.lock`), so the only cross-run delta is the binding.
+
+Still **informational / do NOT gate** (engine/parallel/IO/library dominated):
+- to_arrow_table / pl() on materialized results (PromoteMaterializedToArrow re-runs GIL-released).
+- registered-frame `SELECT sum()` ingest reads (engine aggregate dominates).
+- read_csv/parquet/json; GIL-per-chunk streaming drains.
+
+### New coverage dimensions (beyond the converter surface)
+
+- **Concurrency/GIL** (`test_concurrency_perf.py`, informational/walltime): threads {1,4,8} over a **multi-batch**
+  arrow scan / pandas scan / native + arrow UDF. EXCLUDED from the Callgrind sweep (Callgrind serializes threads
+  → its wall-clock contention signal is meaningless there); it is a local walltime tool.
+- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain psutil RSS + object-count
+  ratio test (not a codspeed benchmark) for the object-pinning paths (register/unregister, UDF create/run/remove,
+  executemany). Runs in the normal test suite.
+- **Memory mode (DEFERRED).** A second Callgrind sweep (`--codspeed-mode=memory`) over the O(rows) produce paths
+  for peak-RSS, feeding the same baseline model, is DESIGNED but not implemented this round (roughly doubles the
+  CI cost; nightly-only when added). The `test_mem_df_with_nulls` tracemalloc guard stays as a local signal until
+  then (convert it to an A/B delta when memory mode lands).
 
 ### Two code-grounded gotchas
 - **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing.
diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py
new file mode 100644
index 00000000..b641662f
--- /dev/null
+++ b/benchmarks/_scale.py
@@ -0,0 +1,40 @@
+"""Env-gated row-count scaling for the benchmark suite (INFRA-4).
+
+Callgrind runs 20-50x slower than native, so the O(rows) / per-row-object benchmarks at full N make the CI sweep
+too slow. `scaled(n)` shrinks those row counts ONLY when an explicit `BENCH_SCALE=<divisor>` env var is set
+(which the CI Callgrind sweep sets). Unset -> full N, so LOCAL walltime A/B keeps the large N unchanged.
+
+CRITICAL: a gate benchmark and the engine-control floor it is compared against (the FLOOR_MAP pairs in
+compare_baseline.py) share the same base N, so routing BOTH through `scaled()` keeps them at an identical scaled
+N -- the Option-B binding_fraction stays valid. Scaling ONLY reduces row counts; it must never change the data
+patterns the benchmarks depend on (real NULLs, mixed ASCII+non-ASCII+null, LIMIT-no-ORDER-BY, warm-before-measure).
+
+A floor keeps a scaled benchmark row-dominated (well above the range(2048) fixed-cost probes), so per-element
+work still dominates and the fraction/signal stay meaningful. The small-N `*_gate` probes are NOT routed through
+this (they are already fast and are the fixed-cost baseline).
+"""
+
+from __future__ import annotations
+
+import os
+
+FLOOR = 20_000  # a scaled bench never drops below this (stays row-dominated, ~10x the range(2048) probes)
+
+
+def bench_scale() -> int:
+    """Read the `BENCH_SCALE` divisor, clamped to >=1; unset, empty or non-integer values mean 1 (no scaling)."""
+    raw = os.environ.get("BENCH_SCALE", "")
+    # int() accepts surrounding whitespace and a sign; whatever it rejects falls back to "no scaling"
+    try:
+        divisor = int(raw) if raw else 1
+    except ValueError:
+        divisor = 1
+    return divisor if divisor > 1 else 1
+
+
+def scaled(n: int) -> int:
+    """Shrink row count `n` by the `BENCH_SCALE` divisor, never below `min(n, FLOOR)`; unscaled -> `n`."""
+    divisor = bench_scale()
+    if divisor <= 1:
+        return n  # scaling disabled: full N
+    return max(min(FLOOR, n), n // divisor)  # the floor is capped at n, so a small N is never inflated
diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py
new file mode 100644
index 00000000..85e96c43
--- /dev/null
+++ b/benchmarks/compare_baseline.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+"""Committed-baseline instruction-count comparison for the CodSpeed benchmark suite.
+
+WHY / HOW (grounded, verified on a Linux+valgrind box):
+  The suite runs under `valgrind --tool=callgrind` with pytest-codspeed. pytest-codspeed's hooks call
+  `callgrind_dump_stats_at(<uri>)` at the end of each benchmark, so callgrind writes ONE dump file per
+  benchmark, headed by `desc: Trigger: Client Request: <uri>` with the instruction count on the `totals:`
+  line (`events: Ir`). The hooks also obj-skip libpython, so counts are clean. NO CodSpeed account, token, or
+  runner binary is involved -- this parses the raw callgrind dumps directly.
+
+  Observed run-to-run noise on that box was ~0.1% (callgrind is near-deterministic, not bit-identical), so the
+  default gate threshold (5%) sits far above noise. PYTHONHASHSEED is pinned in CI to keep dict/struct paths
+  stable.
+
+TWO MODES:
+  regen   -- build benchmarks/baseline.json from a fresh valgrind run: per-benchmark instruction counts +
+             provenance meta + (for the mapped numeric-produce gates) the engine-diluted binding fraction, and
+             the Option-B auto-move of any gate below the cutoff to `informational`.
+  compare -- parse a fresh valgrind run, diff each benchmark against baseline.json, and print a report. GATE
+             benchmarks over their threshold are regressions; `informational` benchmarks are reported only.
+             REPORT-ONLY by default (exit 0; 2 if no dumps found); `--enforce` exits non-zero on a gate regression.
+
+Both are CI-only in practice (no valgrind on macOS arm64). baseline.json and benchmarks/requirements-bench.txt
+are regenerated together (same job) so the counts always correspond to the frozen data-lib pins.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+SCHEMA_VERSION = 1  # written to baseline.json as meta.schema_version (NOTE(review): compare() never checks it)
+GATE_DEFAULT_THRESHOLD_PCT = 5.0  # a gate regresses when its Ir grows by MORE than this percentage
+BINDING_FRACTION_CUTOFF = 0.25  # Option-B: a gate whose isolable binding fraction is below this is auto-moved
+#                                 to informational (a threshold on its engine-diluted total is not meaningful).
+
+# Option-B floor map: the engine-control benchmark whose instruction count is the "engine floor" of a given
+# numeric-produce gate. binding_fraction = 1 - floor_Ir / bench_Ir. ONLY the numeric-produce benches are listed:
+# MEAS-1 showed their per-element binding is a bulk memcpy (~engine magnitude); every other gate (OUT-row fetch
+# of any type, string/nested/decimal/hugeint/uuid produce, UDFs, native ingest, analyzer bind) is high-binding
+# and needs no fraction. Add a mapping (and, if needed, an engine floor) here to evaluate more benches.
+_E = "benchmarks/test_engine_control_perf.py"  # module that holds the engine-floor benchmarks referenced below
+FLOOR_MAP = {
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k",
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": f"{_E}::test_engine_sum_2col_500k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": f"{_E}::test_engine_sum_1col_100k",
+}
+
+_TRIGGER_RE = re.compile(r"^desc:\s*Trigger:\s*Client Request:\s*(?P<uri>.+?)\s*$")  # dump header -> raw uri
+_TOTALS_RE = re.compile(r"^totals:\s*(?P<ir>\d+)\s*$")  # matches a single-column totals line only (events: Ir)
+
+
+# --------------------------------------------------------------------------- #
+# callgrind parsing
+# --------------------------------------------------------------------------- #
+
+
+def _normalize_uri(raw: str) -> str:
+    """Return a repo-relative benchmark key.
+
+    Inside a git repo pytest-codspeed already emits a git-relative uri (e.g. `benchmarks/x.py::test[p]`). If a
+    run outside a git repo yields an absolute path, it is cut at its LAST whole `benchmarks/` component.
+    """
+    raw = raw.strip()
+    path, sep, rest = raw.partition("::")
+    if not sep or path.startswith("benchmarks/"):
+        return raw  # not a node-id, or already repo-relative
+    cut = path.rfind("/benchmarks/")  # whole component only: never split e.g. `mybenchmarks/`
+    if cut >= 0:
+        path = path[cut + 1 :]
+    return f"{path}::{rest}"
+
+
+def parse_profiles(profile_dir: Path) -> dict[str, int]:
+    """Collect `{benchmark_uri: instruction_count}` from all callgrind dumps below `profile_dir`.
+
+    A dump counts only if its Trigger is a benchmark Client Request (the uri contains `::`) and it has a
+    `totals:` line, so metadata and program-termination dumps drop out. Unreadable files are skipped. Should
+    a uri occur in several dumps (not expected), the largest count wins.
+    """
+    result: dict[str, int] = {}
+    if not profile_dir.exists():
+        return result
+    for candidate in sorted(profile_dir.rglob("*")):
+        if not candidate.is_file():
+            continue
+        try:
+            lines = candidate.read_text(errors="replace").splitlines()
+        except (OSError, UnicodeError):
+            continue
+        key: str | None = None
+        total: int | None = None
+        for line in lines:  # no early exit: the LAST trigger / totals line in a file wins
+            if (hit := _TRIGGER_RE.match(line)) is not None:
+                key = _normalize_uri(hit.group("uri"))
+            elif (hit := _TOTALS_RE.match(line)) is not None:
+                total = int(hit.group("ir"))
+        if key is None or total is None or "::" not in key:
+            continue
+        result[key] = max(result.get(key, 0), total)
+    return result
+
+
+# --------------------------------------------------------------------------- #
+# helpers
+# --------------------------------------------------------------------------- #
+
+
+def _sha256(path: Path) -> str:  # hex SHA-256 of a regular file's bytes; "" if `path` is missing or not a file
+    return hashlib.sha256(path.read_bytes()).hexdigest() if path.is_file() else ""
+
+
+def _load_gate_set(gate_list: Path | None) -> set[str]:
+    """Return the normalized gate uris listed in a `pytest -m gate --collect-only -q` node-id file.
+
+    No path, or a path that does not exist, gives an empty set. Only lines containing `::` are node-ids (the
+    workflow pre-filters the collect-only output to those); every other line is ignored.
+    """
+    if not gate_list or not gate_list.exists():
+        return set()
+    entries = (raw.strip() for raw in gate_list.read_text().splitlines())
+    return {_normalize_uri(entry) for entry in entries if "::" in entry}
+
+
+def _pct(base: int, new: int) -> float:  # percent change of `new` relative to `base`; defined as 0.0 if base == 0
+    return (new - base) / base * 100.0 if base != 0 else 0.0
+
+
+# --------------------------------------------------------------------------- #
+# regen
+# --------------------------------------------------------------------------- #
+
+
+def regen(args: argparse.Namespace) -> int:
+    """Write baseline.json (counts, provenance meta, Option-B fractions/auto-move); return 0, or 2 if no dumps."""
+    counts = parse_profiles(Path(args.profiles))
+    if not counts:
+        print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr)
+        return 2  # nothing parsed: refuse to write an empty baseline
+    gate_set = _load_gate_set(Path(args.gate_list) if args.gate_list else None)  # empty set if no --gate-list
+
+    benches: dict[str, dict] = {}
+    auto_moved: list[str] = []
+    for uri, ir in sorted(counts.items()):
+        source_marker = "gate" if uri in gate_set else "informational"  # as listed in the gate node-id file
+        marker = source_marker  # effective marker; may be demoted below
+        binding_fraction = None  # only computed for gates that have a measured FLOOR_MAP floor
+        floor_uri = FLOOR_MAP.get(uri)
+        if source_marker == "gate" and floor_uri and floor_uri in counts and ir > 0:
+            binding_fraction = round(max(0.0, 1.0 - counts[floor_uri] / ir), 4)  # clamped at 0.0
+            if binding_fraction < args.cutoff:
+                marker = "informational"  # Option-B auto-move: engine-diluted, threshold not meaningful
+                auto_moved.append(uri)
+        benches[uri] = {
+            "marker": marker,
+            "source_marker": source_marker,
+            "auto_moved": marker != source_marker,
+            "instructions": ir,
+            "binding_fraction": binding_fraction,
+            "threshold_pct": GATE_DEFAULT_THRESHOLD_PCT if marker == "gate" else None,
+        }
+
+    baseline = {
+        "meta": {  # NOTE(review): "pythonhashseed" below is a literal, not read from the env -- confirm CI pins 0
+            "schema_version": SCHEMA_VERSION,
+            "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+            "git_commit": args.git_commit,
+            "duckdb_submodule_sha": args.submodule_sha,
+            "requirements_bench_sha256": _sha256(Path(args.pins)) if args.pins else "",
+            "measurement": {"tool": "valgrind callgrind", "event": "Ir", "pythonhashseed": "0"},
+            "bench_scale": os.environ.get("BENCH_SCALE", ""),  # counts are only comparable at the same scale
+            "gate_default_threshold_pct": GATE_DEFAULT_THRESHOLD_PCT,
+            "binding_fraction_cutoff": args.cutoff,
+            "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above.",
+        },
+        "benchmarks": benches,
+    }
+    Path(args.out).write_text(json.dumps(baseline, indent=2) + "\n")
+    n_gate = sum(1 for b in benches.values() if b["marker"] == "gate")
+    n_info = len(benches) - n_gate  # every non-gate entry is informational
+    print(f"Wrote {args.out}: {len(benches)} benchmarks ({n_gate} gate, {n_info} informational).")
+    if auto_moved:
+        print(f"Option-B auto-moved {len(auto_moved)} engine-diluted gate(s) to informational:")
+        for uri in auto_moved:
+            print(f"  {uri}  (binding_fraction={benches[uri]['binding_fraction']})")
+        print("Recommend updating these benches' @pytest.mark.gate -> informational so code matches the baseline.")
+    return 0
+
+
+# --------------------------------------------------------------------------- #
+# compare
+# --------------------------------------------------------------------------- #
+
+
+def compare(args: argparse.Namespace) -> int:
+    """Diff a valgrind run against baseline.json and print a report; returns 1 only under --enforce, 2 on no dumps."""
+    new_counts = parse_profiles(Path(args.profiles))
+    if not new_counts:
+        print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr)
+        return 2
+    baseline_path = Path(args.baseline)
+    if not baseline_path.exists():
+        # Bootstrap state: no committed baseline yet. Report the run and instruct to regenerate; never fail.
+        print(f"No baseline at {baseline_path} yet -- run the workflow with regen=true to create it.")
+        print(f"This run produced {len(new_counts)} benchmark instruction counts.")
+        return 0
+    baseline = json.loads(baseline_path.read_text())
+    meta = baseline.get("meta", {})
+    base_benches = baseline.get("benchmarks", {})
+
+    def divisor(raw: object) -> int:  # same rule as benchmarks/_scale.py: unset/invalid/<1 all mean "unscaled" (1)
+        try:
+            return max(int(raw or 1), 1)
+        except (TypeError, ValueError):
+            return 1
+
+    # scale guard: only the EFFECTIVE divisor matters, so unset vs BENCH_SCALE=1 (both unscaled) must not warn.
+    run_scale = os.environ.get("BENCH_SCALE", "")
+    base_scale = meta.get("bench_scale", "")
+    if divisor(run_scale) != divisor(base_scale):
+        print(
+            f"WARNING: BENCH_SCALE differs (run={run_scale!r}, baseline={base_scale!r}) -> instruction counts are "
+            "not comparable. Regenerate the baseline at this scale."
+        )
+
+    # pin-drift guard: the baseline's counts only compare cleanly against the pinned data libs it was built with.
+    if args.pins:
+        cur = _sha256(Path(args.pins))
+        base_pins = meta.get("requirements_bench_sha256", "")
+        if cur and base_pins and cur != base_pins:
+            print(
+                "WARNING: benchmarks/requirements-bench.txt differs from the baseline's pins -> data-lib deltas "
+                "may not be pure binding. Regenerate the baseline with the current pins."
+            )
+
+    # engine-bump guard: engine-inclusive counts shift with the bundled DuckDB submodule for reasons unrelated
+    # to the binding, so when both SHAs are known and differ, gate deltas are reported but never enforced.
+    base_sub = meta.get("duckdb_submodule_sha")
+    engine_changed = bool(args.submodule_sha and base_sub and args.submodule_sha != base_sub)
+
+    regressions: list[str] = []
+    rows: list[tuple[str, str, str]] = []  # (status, uri, detail)
+    for uri, ir in sorted(new_counts.items()):
+        b = base_benches.get(uri)
+        if b is None:
+            rows.append(("NEW", uri, f"{ir} Ir (no baseline)"))
+            continue
+        base_ir = b["instructions"]
+        delta = _pct(base_ir, ir)
+        marker = b.get("marker", "informational")
+        thr = b.get("threshold_pct") or GATE_DEFAULT_THRESHOLD_PCT
+        detail = f"{base_ir} -> {ir} Ir  ({delta:+.2f}%, thr {thr:.1f}%, {marker})"
+        if marker == "gate" and delta > thr:
+            if engine_changed:
+                rows.append(("ENGINE?", uri, detail + "  [submodule changed -> not enforced]"))
+            else:
+                rows.append(("REGRESSION", uri, detail))
+                regressions.append(uri)
+        else:
+            rows.append(("ok" if marker == "gate" else "info", uri, detail))
+    missing = sorted(set(base_benches) - set(new_counts))  # baseline entries this run did not produce
+    rows.extend(("MISSING", uri, "in baseline, absent from run (rename/removal?)") for uri in missing)
+    _print_report(meta, rows, engine_changed=engine_changed, enforce=args.enforce)
+    if not args.enforce:
+        return 0
+    if engine_changed:
+        print("\nNOT ENFORCING: DuckDB submodule differs from the baseline; regenerate the baseline.")
+        return 0
+    return 1 if regressions else 0
+
+
+def _print_report(meta: dict, rows: list[tuple[str, str, str]], *, engine_changed: bool, enforce: bool) -> None:
+    """Print the header, the rows (worst status first, then by uri) and a one-line summary to stdout."""
+    mode = "ENFORCING" if enforce else "REPORT-ONLY (not failing the job)"
+    # `or "?"`: a missing, empty or JSON-null provenance field prints "?" rather than raising on the slice
+    commit, submodule = (str(meta.get(k) or "?")[:12] for k in ("git_commit", "duckdb_submodule_sha"))
+    print("=" * 100)
+    print(f"CodSpeed instruction-count baseline comparison  [{mode}]")
+    print(f"baseline: commit {commit}  submodule {submodule}  generated {meta.get('generated_at_utc') or '?'}")
+    if engine_changed:
+        print(
+            "WARNING: DuckDB submodule SHA differs from the baseline -> engine-inclusive deltas may reflect the "
+            "engine bump, not the binding. Regenerate the baseline for this engine."
+        )
+    print("=" * 100)
+    order = {"REGRESSION": 0, "ENGINE?": 1, "MISSING": 2, "NEW": 3, "ok": 4, "info": 5}
+    for status, uri, detail in sorted(rows, key=lambda r: (order.get(r[0], 9), r[1])):
+        print(f"  [{status:>10}] {uri}\n               {detail}")
+    n_reg = sum(1 for s, _, _ in rows if s == "REGRESSION")
+    print("-" * 100)
+    print(f"Summary: {len(rows)} benchmarks, {n_reg} gate regression(s)" + ("" if enforce else "  (report-only)"))
+
+
+# --------------------------------------------------------------------------- #
+# cli
+# --------------------------------------------------------------------------- #
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse `argv` (default: `sys.argv[1:]`), run the chosen `regen`/`compare` handler, return its exit code."""
+    default_pins = "benchmarks/requirements-bench.txt"
+    parser = argparse.ArgumentParser(description=__doc__)
+    commands = parser.add_subparsers(dest="cmd", required=True)
+
+    regen_p = commands.add_parser("regen", help="write baseline.json from a valgrind run")
+    regen_p.add_argument("--profiles", required=True, help="CODSPEED_PROFILE_FOLDER with callgrind dumps")
+    regen_p.add_argument("--out", default="benchmarks/baseline.json")
+    regen_p.add_argument("--gate-list", help="file of gate node-ids (pytest -m gate --collect-only -q)")
+    regen_p.add_argument("--git-commit", default="")
+    regen_p.add_argument("--submodule-sha", default="")
+    regen_p.add_argument("--pins", default=default_pins)
+    regen_p.add_argument("--cutoff", type=float, default=BINDING_FRACTION_CUTOFF)
+    regen_p.set_defaults(func=regen)
+
+    enforce_help = "exit non-zero on a gate regression (default: report-only)"
+    compare_p = commands.add_parser("compare", help="compare a valgrind run against baseline.json")
+    compare_p.add_argument("--profiles", required=True)
+    compare_p.add_argument("--baseline", default="benchmarks/baseline.json")
+    compare_p.add_argument("--submodule-sha", default="")
+    compare_p.add_argument("--pins", default=default_pins, help="warn if pins differ from the baseline's")
+    compare_p.add_argument("--enforce", action="store_true", help=enforce_help)
+    compare_p.set_defaults(func=compare)
+
+    options = parser.parse_args(argv)
+    return options.func(options)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
new file mode 100644
index 00000000..b1ccd604
--- /dev/null
+++ b/benchmarks/conftest.py
@@ -0,0 +1,63 @@
+"""Shared fixtures + marker registration for the CodSpeed benchmark suite.
+
+Central home (INFRA-6) for the `con` fixture, the `threads=1` isolation default, and the gate/informational
+marker registration (INFRA-1). Markers are registered HERE (not via pyproject `markers=`) to keep the suite
+self-contained. Registration is REQUIRED: pyproject sets `filterwarnings = ["error"]`, so an unregistered
+mark would raise `PytestUnknownMarkWarning` as a collection error.
+
+Marker semantics
+  gate          Binding-dominated, GIL-held, deterministic under Callgrind (instruction-count). These are the
+                paths where a threshold breach means a *binding* regression. Gate-able. (Enforcement against a
+                committed baseline is a later phase; for now they run and report.)
+  informational Engine/parallel/IO/library-diluted, streaming drains, or arrow-export re-run paths. Reported,
+                never gated: their instruction count is dominated by non-binding work (engine aggregate, the
+                bundled DuckDB submodule, pyarrow/polars library code), so gating them would false-positive on
+                engine/submodule bumps rather than catch binding regressions.
+
+Every benchmark (a test using the `benchmark` fixture) must carry EXACTLY ONE of these markers so the two CI
+steps (`-m gate`, `-m informational`) together cover the suite with no overlap. Non-benchmark guards (e.g. the
+tracemalloc assertion in test_produce_numpy_perf.py) are intentionally left unmarked and run in neither step.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import bench_scale, scaled  # noqa: F401  (re-exported here as the shared home; used by the modules)
+
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+
+# ENV-GATED ROW COUNTS (INFRA-4): the O(rows) / per-row-object benchmarks route their N through `scaled()`
+# (benchmarks/_scale.py). Unset `BENCH_SCALE` -> full N (local walltime A/B is unchanged); the CI Callgrind
+# sweep sets `BENCH_SCALE=<divisor>` to shrink N so the sweep fits under the job timeout. A gate benchmark and
+# its engine-control floor (FLOOR_MAP in compare_baseline.py) share a base N, so both scale identically and the
+# Option-B binding fraction stays valid. Scaling changes ONLY row counts, never the Do-NOT-regress data patterns.
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Declare the `gate` / `informational` marks; under filterwarnings=error an undeclared mark is an error."""
+    # Declared in code rather than via pyproject `markers=` so the benchmark suite stays self-contained; the
+    # two CI steps select on exactly these names (`-m gate`, `-m informational`).
+    marker_lines = (
+        "gate: binding-dominated, instruction-count gate-able under Callgrind (deterministic).",
+        "informational: engine/library-diluted or streaming; reported, never gated.",
+    )
+    for marker_line in marker_lines:
+        config.addinivalue_line("markers", marker_line)
+
+
+@pytest.fixture
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Provide each test with its own single-threaded DuckDB connection and close it afterwards.
+
+    The `threads=1` config pins engine parallelism so per-run instruction counts and walltime do not shift
+    with the CI runner core count (INFRA-6). The concurrency module (COV-1) overrides this deliberately.
+    """
+    connection = duckdb.connect(config={"threads": 1})
+    yield connection
+    connection.close()
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
new file mode 100644
index 00000000..e230d715
--- /dev/null
+++ b/benchmarks/requirements-bench.txt
@@ -0,0 +1,29 @@
+# Frozen, exact pins for the CodSpeed benchmark suite (.github/workflows/codspeed.yml).
+#
+# WHY a committed pin file (and NOT the gitignored uv.lock, and NOT a re-resolving `>=` group): CodSpeed
+# compares instruction counts across runs. If a data lib (numpy/pandas/pyarrow/polars) changed version between
+# the baseline run and a later run, that delta would be misattributed to the binding. These pins freeze the data
+# libs so the ONLY cross-run delta is the binding. Regenerate this file DELIBERATELY, together with the baseline.
+#
+# SOURCE OF TRUTH: the human-readable `[dependency-groups] bench` list in pyproject.toml. Regenerate with:
+#   uv pip compile pyproject.toml --group bench \
+#     --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \
+#     --no-annotate --no-header -o benchmarks/requirements-bench.txt
+# (py3.13 / linux-x86_64 is the CI target.) torch/tensorflow are deliberately absent (local-only via importorskip).
+iniconfig==2.3.0
+markdown-it-py==4.2.0
+mdurl==0.1.2
+numpy==2.5.0
+packaging==26.2
+pandas==3.0.3
+pluggy==1.6.0
+polars==1.42.1
+polars-runtime-32==1.42.1
+pyarrow==24.0.0
+pygments==2.20.0
+pytest==9.1.1
+pytest-codspeed==5.0.3
+python-dateutil==2.9.0.post0
+pytz==2026.2
+rich==15.0.0
+six==1.17.0
diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py
index 244663bc..0fd8461f 100644
--- a/benchmarks/test_arrow_perf.py
+++ b/benchmarks/test_arrow_perf.py
@@ -17,25 +17,26 @@
 
 import pyarrow as pa
 import pytest
+from _scale import scaled
 
-import duckdb
+import numpy as np
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-N = 500_000
-WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)"
-WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)"
+    import duckdb
 
+N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+DICT_UNIQUE = [2, 1_000, 50_000]  # cardinality sweep: UNIQUE-value counts (not row counts) -> NOT scaled
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# informational: every benchmark here is engine-parallel or library/streaming dominated -> reported, not gated.
+#   READ (sum over registered arrow) -> engine aggregate dominates; the near-zero-copy scan is a small fraction.
+#   WRITE to_arrow_table/to_arrow_reader/pl() -> PromoteMaterializedToArrow re-runs the query GIL-released
+#   (engine-parallel), and pl() also runs polars library code. Their counts would trip on engine/submodule
+#   bumps, not binding regressions. `con` fixture + threads=1 live in conftest.py.
+pytestmark = pytest.mark.informational
 
 
 @pytest.fixture(scope="module")
@@ -62,6 +63,18 @@ def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.R
     return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)
 
 
+@pytest.fixture(scope="module")
+def arrow_dict_tables() -> dict[int, pa.Table]:
+    """Build one dictionary-encoded arrow table per unique-value count in DICT_UNIQUE (a cardinality sweep)."""
+    row_ids = np.arange(N, dtype="int32")  # deterministic indices (i % unique): reproducible counts, no PRNG
+    by_cardinality: dict[int, pa.Table] = {}
+    for unique in DICT_UNIQUE:
+        codes = pa.array(row_ids % unique, type=pa.int32())
+        dictionary = pa.array([f"category_value_{i}" for i in range(unique)], type=pa.string())
+        by_cardinality[unique] = pa.table({"c": pa.DictionaryArray.from_arrays(codes, dictionary)})
+    return by_cardinality
+
+
 # --------------------------------------------------------------------------- #
 # READ: arrow -> duckdb. The engine must scan every value (sum/length force it).
 # --------------------------------------------------------------------------- #
@@ -72,12 +85,14 @@ def test_read_arrow_numeric(
 ) -> None:
     """Benchmark scanning a numeric arrow table."""
     con.register("t_num", arrow_numeric)
+    con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())
 
 
 def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None:
     """Benchmark scanning a string arrow table."""
     con.register("t_str", arrow_string)
+    con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())
 
 
@@ -103,6 +118,21 @@ def run() -> list:
     benchmark(run)
 
 
+# ADDED (COV-4): dictionary-encoded arrow ingest, cardinality sweep (unique in {2, 1k, high}). Mirrors core's
+# test_arrow_dictionaries_scan. The engine aggregate dominates (hence informational), but the per-value
+# dictionary DECODE in the arrow scan is the binding interest, and its cost slopes with the unique count.
+
+
+@pytest.mark.parametrize("unique", DICT_UNIQUE)
+def test_read_arrow_dictionary(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int
+) -> None:
+    """Benchmark a count + summed-length scan of a dictionary-encoded arrow column with `unique` dictionary values."""
+    con.register("t_dict", arrow_dict_tables[unique])
+    con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()  # warm: same query once, unmeasured
+    benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall())
+
+
 # --------------------------------------------------------------------------- #
 # WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream.
 # --------------------------------------------------------------------------- #
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
index bf49dfc1..6e7af136 100644
--- a/benchmarks/test_cardinality_perf.py
+++ b/benchmarks/test_cardinality_perf.py
@@ -18,6 +18,7 @@
 from typing import TYPE_CHECKING
 
 import pytest
+from _scale import scaled
 
 import duckdb
 
@@ -26,8 +27,10 @@
 
     from pytest_codspeed import BenchmarkFixture
 
-SRC_ROWS = 200_000
-LIMITS = [100, 1_000, 10_000, 100_000]
+# env-gated (INFRA-4): scale the source rows AND the top-N of the sweep by the same factor, keeping the small-N
+# points fixed and SRC_ROWS >= max(LIMITS). Preserves the LIMIT-no-ORDER-BY early-stop pattern (Do-NOT-regress).
+SRC_ROWS = scaled(200_000)
+LIMITS = [100, 1_000, 10_000, scaled(100_000)]
 
 
 @pytest.fixture(scope="module")
@@ -35,7 +38,7 @@ def con() -> Iterator[duckdb.DuckDBPyConnection]:
     """Yield a connection over a once-materialized source table."""
     # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be
     # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows.
-    c = duckdb.connect()
+    c = duckdb.connect(config={"threads": 1})  # pin engine parallelism (INFRA-6); module-scoped source table
     c.execute(
         "CREATE TABLE src AS "
         f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)"
@@ -50,6 +53,7 @@ def _query(n: int) -> str:
     return f"SELECT a, b, s FROM src LIMIT {n}"
 
 
+@pytest.mark.gate  # fetchall materializes n rows to Python -> binding-dominated; small-n end is the noise-free gate
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
     """Benchmark fetchall over a LIMIT n sweep."""
@@ -58,6 +62,7 @@ def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect
     benchmark(lambda: con.execute(q).fetchall())
 
 
+@pytest.mark.gate  # df() materializes n rows to numpy columns -> binding-dominated
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
     """Benchmark df() over a LIMIT n sweep."""
@@ -66,6 +71,7 @@ def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n
     benchmark(lambda: con.sql(q).df())
 
 
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
     """Benchmark to_arrow_table() over a LIMIT n sweep."""
diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py
new file mode 100644
index 00000000..8be28619
--- /dev/null
+++ b/benchmarks/test_concurrency_perf.py
@@ -0,0 +1,136 @@
+"""CodSpeed benchmark: concurrency / GIL pressure (COV-1). informational / WALLTIME. Standalone, not gated.
+
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd path/to/duckdb-python/wt-codspeed  # worktree under test; the other build lives in ../main
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_concurrency_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+This is the ONE dimension the rest of the suite (single-threaded) cannot see: Python objects threading through
+PARALLEL core execution. It varies the `threads` connection config and measures the binding under parallel scan /
+parallel UDF invocation. All benchmarks are `informational` and their PRIMARY signal is LOCAL WALLTIME:
+  * scan benches           -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup.
+  * native UDF             -> ~flat scaling = the GIL tax on per-row Python calls (the engine scan is parallel
+                              but the GIL serializes the calls).
+  * arrow (vectorized) UDF -> observed NEGATIVE scaling (slower with more threads): per-chunk convert + GIL
+                              contention. A regression here would deepen the negative slope.
+
+Under the CI `-m informational` step these run in `simulation` (Callgrind), which SERIALIZES threads -- so the
+wall-clock contention is NOT visible there; instead the deterministic instruction count captures the per-batch
+Produce GIL calls and the UDF dispatch overhead. Never gated either way.
+
+GOTCHA (verified locally, mirrors the suite's other "measure the right thing" traps): a SINGLE-BATCH arrow table
+does NOT parallelize (one batch = one serial scan unit; flat across threads). The arrow scan bench MUST use a
+MULTI-BATCH table (`from_batches` with a modest chunksize) or it silently measures a serial scan. A CPU-heavy
+aggregate is also required: a cheap sum is memory-bandwidth-bound and will not parallelize, so there is nothing
+to contend on.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import duckdb
+from duckdb.sqltypes import BIGINT
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+pa = pytest.importorskip("pyarrow")
+pc = pytest.importorskip("pyarrow.compute")
+import numpy as np  # noqa: E402  (after importorskip, matching the suite convention)
+import pandas as pd  # noqa: E402
+
+# informational: concurrency benchmarks are never gated (walltime-noisy; under Callgrind, thread-serialized).
+pytestmark = pytest.mark.informational
+
+N_SCAN = 1_000_000
+BATCH = 20_000  # -> 50 record batches; MULTI-BATCH is required for the arrow scan to parallelize (see GOTCHA)
+N_UDF_NATIVE = 200_000  # native UDF = one Python call per row; keep modest (Callgrind instruments every call)
+N_UDF_ARROW = 1_000_000  # arrow UDF = one call per chunk (vectorized)
+THREADS = [1, 4, 8]
+
+# CPU-heavy aggregate so the parallel scan actually engages worker threads (a cheap sum is bandwidth-bound and
+# would not parallelize -> no contention to measure). The binding signal is the per-batch Produce GIL handoff.
+HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)"
+
+
+@pytest.fixture(scope="module")
+def arrow_multibatch() -> pa.Table:
+    """Return a MULTI-batch arrow table (single-batch would scan serially -- see module GOTCHA)."""
+    a = pa.array(np.arange(N_SCAN), type=pa.int64())
+    b = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64())
+    return pa.Table.from_batches(pa.table({"a": a, "b": b}).to_batches(max_chunksize=BATCH))
+
+
+@pytest.fixture(scope="module")
+def pandas_frame() -> pd.DataFrame:
+    """Return a numpy-backed pandas frame (its scan parallelizes across worker threads)."""
+    return pd.DataFrame({"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5})
+
+
+# --------------------------------------------------------------------------- #
+# Parallel SCAN: Python objects (arrow batches / pandas chunks) pulled through the binding by engine worker
+# threads under a CPU-heavy aggregate. The scan Produce acquires/releases the GIL per batch across threads.
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None:
+    """Benchmark a parallel aggregate pulling arrow batches across threads."""
+    con = duckdb.connect(config={"threads": threads})
+    try:
+        con.register("t", arrow_multibatch)
+        q = f"SELECT sum({HEAVY}) FROM t"
+        con.execute(q).fetchall()  # warm
+        benchmark(lambda: con.execute(q).fetchall())
+    finally:
+        con.close()
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None:
+    """Benchmark a parallel aggregate pulling pandas chunks across threads."""
+    con = duckdb.connect(config={"threads": threads})
+    try:
+        con.register("t", pandas_frame)
+        q = f"SELECT sum({HEAVY}) FROM t"
+        con.execute(q).fetchall()  # warm
+        benchmark(lambda: con.execute(q).fetchall())
+    finally:
+        con.close()
+
+
+# --------------------------------------------------------------------------- #
+# Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF
+# from multiple worker threads. Native = per-row Python call under the GIL (GIL tax); arrow = per-chunk convert.
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
+    """Benchmark a native Python UDF invoked from parallel worker threads (GIL tax)."""
+    con = duckdb.connect(config={"threads": threads})
+    try:
+        con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)")  # materialized -> parallel scan
+        con.create_function("pyf", lambda x: (x * 2 + 1) % 97, [BIGINT], BIGINT)
+        con.execute("SELECT sum(pyf(a)) FROM t").fetchall()  # warm
+        benchmark(lambda: con.execute("SELECT sum(pyf(a)) FROM t").fetchall())
+    finally:
+        con.close()
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
+    """Benchmark a vectorized arrow UDF invoked from parallel worker threads."""
+    con = duckdb.connect(config={"threads": threads})
+    try:
+        con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)")  # materialized -> parallel scan
+        con.create_function("af", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+        con.execute("SELECT sum(af(a)) FROM t").fetchall()  # warm
+        benchmark(lambda: con.execute("SELECT sum(af(a)) FROM t").fetchall())
+    finally:
+        con.close()
diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py
new file mode 100644
index 00000000..febd7ba4
--- /dev/null
+++ b/benchmarks/test_engine_control_perf.py
@@ -0,0 +1,68 @@
+"""CodSpeed benchmark: pure-ENGINE control (no Python egress). Standalone, not in CI's binding gate.
+
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd path/to/duckdb-python/wt-codspeed  # worktree under test; the other build lives in ../main
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_engine_control_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+These `SELECT sum(...) FROM range(N)` queries aggregate to a single row, so the fetchall of the result is
+negligible: they measure SQL compile + the engine aggregate with (almost) ZERO per-row Python egress. They are
+the "engine floor" reference for MEAS-1: comparing a produce/fetch/ingest benchmark against the matching-N floor
+here quantifies how much of that benchmark's cost is the binding vs the engine. They are `informational` (they
+measure the engine, not the binding, so they must never gate).
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+# informational: pure-engine reference, never gated. `con` fixture + threads=1 live in conftest.py.
+pytestmark = pytest.mark.informational
+
+# Matched to the N of the fetch/produce/ingest/udf benchmarks so the floors line up for MEAS-1 subtraction and,
+# at baseline regen, for the Option-B binding-fraction of the numeric-produce gates (see compare_baseline.py).
+# CRITICAL: these floors go through scaled() with the SAME base N as the benchmarks they floor, so under
+# BENCH_SCALE the floor and its benchmark stay at an identical N and the fraction stays valid. The 2048 small-N
+# floor is NOT scaled (it is the fixed-cost baseline for the *_gate probes).
+Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)"  # small-N gate floor (compile-dominated), NOT scaled
+Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)"  # types-matrix numeric-df floor
+Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)"  # fetch / native-UDF floor
+# produce/ingest floor
+Q_2C_500K = (
+    f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))"
+)
+
+
+def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.execute(query).fetchall()  # warm
+    benchmark(lambda: con.execute(query).fetchall())
+
+
+def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Engine floor: compile + sum over range(2048), no egress."""
+    _bench(benchmark, con, Q_1C_SMALL)
+
+
+def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Engine floor: compile + sum over range(100k), no egress."""
+    _bench(benchmark, con, Q_1C_100K)
+
+
+def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Engine floor: compile + sum over range(200k), no egress."""
+    _bench(benchmark, con, Q_1C_200K)
+
+
+def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Engine floor: compile + 2-col sum over range(500k), no egress."""
+    _bench(benchmark, con, Q_2C_500K)
diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py
index 94a53c30..9820db6d 100644
--- a/benchmarks/test_fetch_perf.py
+++ b/benchmarks/test_fetch_perf.py
@@ -16,21 +16,23 @@
 from typing import TYPE_CHECKING
 
 import pytest
-
-import duckdb
+from _scale import scaled
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
+    import duckdb
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# gate: OUT-row fetch fully materializes every row to Python -> binding-dominated, GIL-held; the engine side is
+# a cheap range() scan. Deterministic under Callgrind -> instruction-count gate-able. (The small-N *_gate tests
+# are the compile+fetch fixed-cost variants; see MEAS-1.) The `con` fixture + threads=1 live in conftest.py.
+pytestmark = pytest.mark.gate
+
+# env-gated row counts (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. The 2048
+# small-N *_gate probes are intentionally NOT scaled (they are the compile+fetch fixed-cost baseline).
+N_ROW = scaled(200_000)  # per-row-object numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128)
+N_STR = scaled(100_000)  # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops
+N_NEST = scaled(50_000)  # heterogeneous scalar/list/struct row
 
 
 def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
@@ -40,41 +42,41 @@ def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection,
 
 def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a single BIGINT column."""
-    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a single INTEGER column."""
-    _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a single DOUBLE column."""
-    _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of two BIGINT columns."""
-    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a single VARCHAR column."""
-    _bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)")
 
 
 def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a mixed scalar/list/struct row."""
     query = (
         "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, "
-        "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)"
+        f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)"
     )
     _bench_fetchall(benchmark, con, query)
 
 
 def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark iterating a result one row at a time with fetchone."""
-    query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
+    query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
 
     def run() -> None:
         rel = con.execute(query)
@@ -85,43 +87,45 @@ def run() -> None:
 
 
 # --------------------------------------------------------------------------- #
-# ADDED: small-N instruction-count-gate variants (the narrow-numeric fixed-cost path, noise-free at range(2048)
-# under simulation mode in CI), expensive scalar OUT-row types (timestamptz pytz-per-row, blob, null-heavy), a
-# heterogeneous per-cell-dispatch row (hugeint+uuid+decimal128+varchar, distinct from homogeneous columns), and
-# the batched fetchmany loop.
+# small-N COMPILE+FETCH FIXED-COST variants: at range(2048) the measured region is dominated by SQL front-end
+# compilation + the engine, NOT fetch. MEAS-1 walltime split (vs the range(2048) engine floor in
+# test_engine_control_perf.py): ~40% fetch fixed-cost, ~60% compile+engine. They still catch a fixed-cost
+# regression, but they are compile+fetch fixed-cost gates, not pure-fetch gates. Plus expensive scalar OUT-row
+# types (timestamptz pytz-per-row, blob, null-heavy), a heterogeneous per-cell-dispatch row
+# (hugeint+uuid+decimal128+varchar, distinct from the homogeneous columns), and the batched fetchmany loop.
 # --------------------------------------------------------------------------- #
 
 
 def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the small-N BIGINT instruction-count gate."""
+    """Benchmark the small-N BIGINT compile+fetch fixed-cost (MEAS-1: ~60% compile+engine, ~40% fetch)."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)")
 
 
 def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the small-N two-BIGINT instruction-count gate."""
+    """Benchmark the small-N two-BIGINT compile+fetch fixed-cost."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)")
 
 
 def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a half-NULL BIGINT column."""
-    _bench_fetchall(benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a TIMESTAMPTZ column."""
     _bench_fetchall(
-        benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)"
+        benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)"
     )
 
 
 def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a 128-bit DECIMAL column."""
-    _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchall of a BLOB column."""
-    _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)")
+    _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)")
 
 
 def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
@@ -130,14 +134,14 @@ def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyCo
     # from the homogeneous single-type columns above)
     query = (
         "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, "
-        "((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range(100000) t(i)"
+        f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)"
     )
     _bench_fetchall(benchmark, con, query)
 
 
 def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark draining a result with batched fetchmany."""
-    query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)"
+    query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
 
     def run() -> None:
         rel = con.execute(query)
@@ -147,17 +151,3 @@ def run() -> None:
                 break
 
     benchmark(run)
-
-
-def test_expr_many(benchmark: BenchmarkFixture) -> None:
-    """Benchmark building many column/constant expressions."""
-
-    def run() -> int:
-        out = []
-        for i in range(2000):
-            col = duckdb.ColumnExpression(f"col_{i}")
-            const = duckdb.ConstantExpression(i)
-            out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}"))
-        return len(out)
-
-    benchmark(run)
diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py
index e3f232cc..c54ddba7 100644
--- a/benchmarks/test_ingest_native_perf.py
+++ b/benchmarks/test_ingest_native_perf.py
@@ -17,24 +17,21 @@
 from typing import TYPE_CHECKING
 
 import pytest
-
-import duckdb
+from _scale import scaled
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-EXECMANY_N = 20_000  # executemany re-binds + executes per row, keep moderate
-WIDE_N = 10_000  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
+    import duckdb
 
+# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep.
+EXECMANY_N = scaled(20_000)  # executemany re-binds + executes per row, keep moderate
+WIDE_N = scaled(10_000)  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# gate: native ingest eagerly transforms every cell (TransformPythonValue) / re-binds per row (executemany);
+# the engine side (a trivial INSERT or a 1-row-wide fetchall drain) is negligible -> binding-dominated, GIL-held,
+# deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py.
+pytestmark = pytest.mark.gate
 
 
 @pytest.fixture(scope="module")
diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py
index abbe2a4d..73b99d0d 100644
--- a/benchmarks/test_ingest_numpy_perf.py
+++ b/benchmarks/test_ingest_numpy_perf.py
@@ -18,20 +18,21 @@
 from typing import TYPE_CHECKING
 
 import pytest
+from _scale import scaled
 
-import duckdb
 import numpy as np
 import pandas as pd
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-N = 500_000
-ANALYZER_N = 200_000
+    import duckdb
+
+# env-gated (INFRA-4): scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below.
+N = scaled(500_000)
+ANALYZER_N = scaled(200_000)
 
-# Module-global for the replacement-scan-from-variable path (frame resolution finds f_globals reliably).
+# Registered explicitly via con.register (MEAS-3) rather than resolved by replacement-scan frame inspection.
 NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}
 
 # Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only).
@@ -42,12 +43,9 @@
 _MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)]
 
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()`/`sum(length())` over a registered
+# frame) are engine-aggregate dominated -> informational. The analyzer BIND (count(*), no scan) is a pure
+# per-bind binding cost -> gate.
 
 
 @pytest.fixture(scope="module")
@@ -84,32 +82,42 @@ def df_object_mixed_types() -> pd.DataFrame:
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.informational
 def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark scanning a numpy-dict replacement scan."""
-    benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall())
+    """Benchmark scanning a registered numpy dict-of-arrays."""
+    # MEAS-3: register explicitly (not frame-inspection replacement scan) and warm the query before measuring.
+    con.register("npdict", NPDICT)
+    con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall())
 
 
+@pytest.mark.informational
 def test_read_numpy_double_with_nan(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame
 ) -> None:
     """Benchmark scanning a numpy double column with NaNs."""
     con.register("t", df_double_with_nan)
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
+@pytest.mark.informational
 def test_read_numpy_masked_int(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame
 ) -> None:
     """Benchmark scanning a masked nullable-int column."""
     con.register("t", df_masked_int)
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
+@pytest.mark.informational
 def test_read_numpy_object_string_mixed(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame
 ) -> None:
     """Benchmark scanning a mixed object-string column."""
     con.register("t", df_object_string_mixed)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
@@ -119,6 +127,7 @@ def test_read_numpy_object_string_mixed(
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.gate  # count(*) forces no scan -> the measured cost is the PandasAnalyzer per-bind sampling (binding)
 def test_bind_analyzer_object(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame
 ) -> None:
diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py
index 1a4c09f0..168f1a3d 100644
--- a/benchmarks/test_pandas_perf.py
+++ b/benchmarks/test_pandas_perf.py
@@ -18,28 +18,24 @@
 
 import pyarrow as pa
 import pytest
+from _scale import scaled
 
-import duckdb
 import numpy as np
 import pandas as pd
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-N = 500_000
-WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)"
-WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)"
-_STRINGS = [f"str_value_{i}" for i in range(N)]
+    import duckdb
 
+N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+_STRINGS = [f"str_value_{i}" for i in range(N)]
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()` over a registered frame) are
+# engine-aggregate dominated -> informational. Only the NUMPY-backed df() WRITE is binding-dominated -> gate.
+# The arrow-backed WRITE goes through to_arrow_table().to_pandas() (pyarrow library code, MEAS-2) -> informational.
 
 
 @pytest.fixture(scope="module")
@@ -77,35 +73,43 @@ def df_arrow_string() -> pd.DataFrame:
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.informational
 def test_read_pandas_numpy_numeric(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame
 ) -> None:
     """Benchmark scanning a numpy-backed numeric frame."""
     con.register("t", df_numpy_numeric)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
+@pytest.mark.informational
 def test_read_pandas_numpy_string(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame
 ) -> None:
     """Benchmark scanning a numpy-backed string frame."""
     con.register("t", df_numpy_string)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
+@pytest.mark.informational
 def test_read_pandas_arrow_numeric(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame
 ) -> None:
     """Benchmark scanning an arrow-backed numeric frame."""
     con.register("t", df_arrow_numeric)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
+@pytest.mark.informational
 def test_read_pandas_arrow_string(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame
 ) -> None:
     """Benchmark scanning an arrow-backed string frame."""
     con.register("t", df_arrow_string)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
@@ -116,11 +120,13 @@ def test_read_pandas_arrow_string(
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.gate
 def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a numeric result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).df())
 
 
+@pytest.mark.gate
 def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a string result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).df())
@@ -131,26 +137,30 @@ def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.Duck
 # datetime column (TimestampConvert + ConvertDateTimeTypes).
 
 
+@pytest.mark.gate
 def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a null-heavy numeric result to a numpy-backed frame."""
     q = (
         "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
-        "CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range(500000) t(i)"
+        f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
     )
     benchmark(lambda: con.sql(q).df())
 
 
+@pytest.mark.gate
 def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a timestamp result to a numpy-backed frame."""
-    q = "SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range(500000) t(i)"
+    q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
     benchmark(lambda: con.sql(q).df())
 
 
+@pytest.mark.informational  # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2)
 def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a numeric result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
 
 
+@pytest.mark.informational  # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2)
 def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark materializing a string result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
index 5ad56254..eb54f91c 100644
--- a/benchmarks/test_produce_numpy_perf.py
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -20,17 +20,16 @@
 from typing import TYPE_CHECKING
 
 import pytest
+from _scale import scaled
 
 import duckdb
 import numpy as np  # noqa: F401  (pinned identically A/B; imported so the env matches the other modules)
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-N = 500_000
-TYPE_N = 200_000  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
+N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+TYPE_N = scaled(200_000)  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
 
 Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
 Q_NUM_NULLS = (
@@ -44,14 +43,8 @@
 Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)"
 
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
-
-
+# gate: df()/fetchnumpy() fully materialize numpy-backed columns -> binding-dominated (ArrayWrapper fill),
+# GIL-held, deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py.
 def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.sql(query).df()  # warm
     benchmark(lambda: con.sql(query).df())
@@ -67,37 +60,44 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.gate
 def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a numeric result."""
     _bench_df(benchmark, con, Q_NUM)
 
 
+@pytest.mark.gate
 def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a null-heavy numeric result."""
     # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch)
     _bench_df(benchmark, con, Q_NUM_NULLS)
 
 
+@pytest.mark.gate
 def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a string result."""
     _bench_df(benchmark, con, Q_STR)
 
 
+@pytest.mark.gate
 def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a timestamp result."""
     _bench_df(benchmark, con, Q_TS)
 
 
+@pytest.mark.gate
 def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a hugeint result."""
     _bench_df(benchmark, con, Q_HUGEINT)
 
 
+@pytest.mark.gate
 def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a uuid result."""
     _bench_df(benchmark, con, Q_UUID)
 
 
+@pytest.mark.gate
 def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark df() of a 128-bit decimal result."""
     _bench_df(benchmark, con, Q_DEC128)
@@ -108,11 +108,13 @@ def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.gate
 def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchnumpy() of a numeric result."""
     _bench_numpy(benchmark, con, Q_NUM)
 
 
+@pytest.mark.gate
 def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark fetchnumpy() of a null-heavy numeric result."""
     _bench_numpy(benchmark, con, Q_NUM_NULLS)
@@ -123,6 +125,7 @@ def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.informational  # per-chunk streaming drain (GIL-per-chunk) -> walltime-informational, not gated
 def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark draining a result with fetch_df_chunk()."""
 
@@ -145,6 +148,7 @@ def run() -> int:
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.informational  # torch is local-only (importorskip -> skipped in CI); torch lib work dilutes it
 def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark torch() of a numeric result (skipped if torch is absent)."""
     pytest.importorskip("torch")
@@ -172,7 +176,7 @@ def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti
 
 def test_mem_df_with_nulls() -> None:
     """Guard the Python-tracked peak allocation of a null-heavy df() call."""
-    con = duckdb.connect()
+    con = duckdb.connect(config={"threads": 1})
     try:
         tracemalloc.start()
         warm = con.sql(Q_NUM_NULLS).df()  # populate one-time import / type caches
diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py
new file mode 100644
index 00000000..5b386da5
--- /dev/null
+++ b/benchmarks/test_relational_construction_perf.py
@@ -0,0 +1,43 @@
+"""CodSpeed benchmark: relational-API expression construction. Standalone, not in CI's binding gate.
+
+A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
+  cd <path-to>/duckdb-python/wt-codspeed
+  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
+    $P -m pytest benchmarks/test_relational_construction_perf.py \
+    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
+  done
+
+SCOPE: this is relational-API *construction* (ColumnExpression / ConstantExpression / operator overloads),
+NOT the binding-pressure surface the rest of the suite targets. It was moved here out of test_fetch_perf.py
+(MEAS-5) because it is out of scope for the binding-pressure gate. It is KEPT because it carries a real signal
+(a measured ~35% expression-construction delta at the cutover), so it stays visible -- but it is marked
+`informational`, so it runs and reports and is NEVER part of the gate.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import duckdb
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+# informational: relational-API construction, deliberately excluded from the binding-pressure gate (MEAS-5).
+pytestmark = pytest.mark.informational
+
+
+def test_expr_many(benchmark: BenchmarkFixture) -> None:
+    """Benchmark building many column/constant expressions."""
+
+    def run() -> int:
+        out = []
+        for i in range(2000):
+            col = duckdb.ColumnExpression(f"col_{i}")
+            const = duckdb.ConstantExpression(i)
+            out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}"))
+        return len(out)
+
+    benchmark(run)
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
index 7fb80c4b..f0323fea 100644
--- a/benchmarks/test_types_roundtrip_perf.py
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -18,15 +18,14 @@
 from typing import TYPE_CHECKING
 
 import pytest
-
-import duckdb
+from _scale import scaled
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
-N = 100_000
+    import duckdb
+
+N = scaled(100_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
 
 # one logical type per column; long-varchar is intentionally > 64 chars
 TYPE_EXPR = {
@@ -34,6 +33,8 @@
     "double": "(i * 1.5)::DOUBLE",
     "varchar_short": "('str_' || i)",
     "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))",
+    "date": "DATE '2020-01-01' + (i % 3650)::INTEGER",
+    "bool": "(i % 2 = 0)",
     "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)",
     "decimal64": "((i::DECIMAL(18, 3)) / 1000)",
     "decimal128": "((i * 1.5)::DECIMAL(28, 6))",
@@ -45,18 +46,12 @@
 TYPES = list(TYPE_EXPR)
 
 
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
-
-
+# `con` fixture + threads=1 live in conftest.py.
 def _query(type_name: str) -> str:
     return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
 
 
+@pytest.mark.gate  # OUT-row fetchall -> binding-dominated per-type dispatch
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
     """Benchmark fetchall of one logical type per column."""
@@ -65,6 +60,7 @@ def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConne
     benchmark(lambda: con.execute(q).fetchall())
 
 
+@pytest.mark.gate  # OUT-col df() -> binding-dominated ArrayWrapper fill per type
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
     """Benchmark df() of one logical type per column."""
@@ -73,6 +69,7 @@ def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection,
     benchmark(lambda: con.sql(q).df())
 
 
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
     """Benchmark to_arrow_table() of one logical type per column (informational only)."""
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
index 34896bcc..a62be815 100644
--- a/benchmarks/test_udf_perf.py
+++ b/benchmarks/test_udf_perf.py
@@ -16,30 +16,24 @@
 from typing import TYPE_CHECKING
 
 import pytest
+from _scale import scaled
 
-import duckdb
 from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
-
     from pytest_codspeed import BenchmarkFixture
 
+    import duckdb
+
 pa = pytest.importorskip("pyarrow")
 pc = pytest.importorskip("pyarrow.compute")
 
-NATIVE_N = 200_000  # native = one Python call per row, keep moderate
-ARROW_N = 1_000_000  # arrow = one Python call per chunk (vectorized), can be large
-
-
-@pytest.fixture
-def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a fresh connection, closed on teardown."""
-    c = duckdb.connect()
-    yield c
-    c.close()
+# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep.
+NATIVE_N = scaled(200_000)  # native = one Python call per row, keep moderate
+ARROW_N = scaled(1_000_000)  # arrow = one Python call per chunk (vectorized), can be large
 
 
+# `con` fixture + threads=1 live in conftest.py.
 def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.execute(query).fetchall()  # warm the engine + import caches before measuring
     benchmark(lambda: con.execute(query).fetchall())
@@ -50,24 +44,28 @@ def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: s
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.gate  # native scalar UDF: one Python call per row dominates; the sum() consume is negligible
 def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a 1-arg native int scalar UDF."""
     con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
+@pytest.mark.gate
 def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a 2-arg native int scalar UDF."""
     con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
+@pytest.mark.gate
 def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a 1-arg native double scalar UDF."""
     con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
     _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
 
 
+@pytest.mark.gate
 def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a native string scalar UDF."""
     con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
@@ -78,6 +76,7 @@ def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConn
     )
 
 
+@pytest.mark.gate
 def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark the validity short-circuit for NULL inputs to a native UDF."""
     # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the
@@ -96,18 +95,21 @@ def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBP
 # --------------------------------------------------------------------------- #
 
 
+@pytest.mark.informational  # vectorized arrow UDF: pyarrow.compute lib work + per-chunk conversion + 1M engine
 def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a vectorized arrow int UDF."""
     con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
 
 
+@pytest.mark.informational
 def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark a vectorized arrow double UDF."""
     con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
 
 
+@pytest.mark.informational
 def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF."""
     # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call
diff --git a/pyproject.toml b/pyproject.toml
index fd0ef328..90218094 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -281,6 +281,22 @@ test = [ # dependencies used for running tests
     "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
     "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
 ]
+bench = [ # minimal, pinned deps for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). Deliberately
+          # NOT the heavy `test` group (no torch/tensorflow/pyspark/adbc). Pinned via requirements-bench.txt, in lockstep
+          # with any baseline compared against, so the only cross-run delta is the binding. Constraints mirror the
+          # `test` group so the lockfile resolves identically. torch/tf produce paths are local-only (importorskip).
+    "pytest",
+    "pytest_codspeed",
+    "polars>=1.33.0",
+    "pytz",
+    "numpy<2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version < '3.12'",
+    "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
+    "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
+    "pandas>=3.0.0; python_version > '3.10'",
+    "pandas<3.0.0; python_version < '3.11'",
+    "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+    "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+]
 scripts = [ # dependencies used for running scripts
     "cxxheaderparser",
     "ipython",
diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py
new file mode 100644
index 00000000..22de87b2
--- /dev/null
+++ b/tests/fast/test_binding_pressure_leak.py
@@ -0,0 +1,113 @@
+"""Sustained-iteration leak guards for the binding object-pinning paths (COV-3).
+
+Sibling of test_relation_dependency_leak.py. CodSpeed measures steady-state PER-CALL cost and structurally cannot
+see a per-call refcount imbalance in the object-pinning graph (ExternalDependency / registered_objects / a UDF's
+retained Python callable) until it OOMs. This is a plain assertion test (NOT a codspeed benchmark, no marker): it
+runs each pinning path ITERS times and asserts that RSS and tracked Python-object growth stay bounded.
+
+Covers the paths the existing leak test does not: register/unregister, native + arrow UDF create/run/remove, and
+executemany. (from_arrow/from_df/replacement-scan pinning is already covered by test_relation_dependency_leak.py.)
+"""
+
+import gc
+import os
+
+import pytest
+
+import numpy as np
+import pandas as pd
+
+try:
+    import pyarrow as pa
+
+    can_arrow = True
+except ImportError:
+    can_arrow = False
+
+from duckdb.sqltypes import BIGINT
+
+psutil = pytest.importorskip("psutil")
+
+ITERS = 100
+ROWS = 100_000
+_EM_ROWS = [(i, i * 1.5, f"s{i}") for i in range(5_000)]
+
+
+def _rss_gb():
+    return psutil.Process(os.getpid()).memory_info().rss / (10**9)
+
+
+def check_flat(fn, cursor, iters=ITERS, obj_slack=20_000):
+    """Assert RSS and tracked-object count stay flat across `iters` calls of `fn`."""
+    fn(cursor)  # warm one-time caches so they are not counted as growth
+    gc.collect()
+    start_rss = _rss_gb()
+    start_obj = len(gc.get_objects())
+    for _ in range(iters):
+        fn(cursor)
+    gc.collect()
+    end_rss = _rss_gb()
+    end_obj = len(gc.get_objects())
+    # RSS ratio bound mirrors test_relation_dependency_leak.py (end RSS must stay below 3x the start RSS)...
+    assert end_rss / 3 < start_rss, f"RSS grew {start_rss:.3f} -> {end_rss:.3f} GB over {iters} iters"
+    # ...plus an object-count bound, which catches a Python-object pin that is too small to move RSS.
+    assert end_obj - start_obj < obj_slack, f"tracked objects grew by {end_obj - start_obj} over {iters} iters"
+
+
+# --------------------------------------------------------------------------- #
+# Pinning paths (one full pin/unpin cycle per call).
+# --------------------------------------------------------------------------- #
+
+
+def register_unregister_arrow(cursor):
+    tbl = pa.table({"a": pa.array(np.arange(ROWS), type=pa.int64())})
+    cursor.register("t_reg", tbl)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def register_unregister_pandas(cursor):
+    df = pd.DataFrame({"a": np.arange(ROWS)})
+    cursor.register("t_reg", df)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def native_udf_cycle(cursor):
+    cursor.create_function("f_leak", lambda x: x + 1, [BIGINT], BIGINT)
+    cursor.execute("SELECT sum(f_leak(i::BIGINT)) FROM range(10000) t(i)").fetchall()
+    cursor.remove_function("f_leak")
+
+
+def arrow_udf_cycle(cursor):
+    import pyarrow.compute as pc
+
+    cursor.create_function("af_leak", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    cursor.execute("SELECT sum(af_leak(i::BIGINT)) FROM range(50000) t(i)").fetchall()
+    cursor.remove_function("af_leak")
+
+
+def executemany_cycle(cursor):
+    cursor.execute("CREATE OR REPLACE TABLE t_em (a BIGINT, b DOUBLE, c VARCHAR)")
+    cursor.executemany("INSERT INTO t_em VALUES (?, ?, ?)", _EM_ROWS)
+
+
+class TestBindingPressureLeak:
+    def test_register_unregister_arrow_leak(self, duckdb_cursor):
+        if not can_arrow:
+            pytest.skip("pyarrow not installed")
+        check_flat(register_unregister_arrow, duckdb_cursor)
+
+    def test_register_unregister_pandas_leak(self, duckdb_cursor):
+        check_flat(register_unregister_pandas, duckdb_cursor)
+
+    def test_native_udf_cycle_leak(self, duckdb_cursor):
+        check_flat(native_udf_cycle, duckdb_cursor)
+
+    def test_arrow_udf_cycle_leak(self, duckdb_cursor):
+        if not can_arrow:
+            pytest.skip("pyarrow not installed")
+        check_flat(arrow_udf_cycle, duckdb_cursor)
+
+    def test_executemany_leak(self, duckdb_cursor):
+        check_flat(executemany_cycle, duckdb_cursor)

From 090e02142b1bca4163c526ad75a4dcc84a5ae374 Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Thu, 2 Jul 2026 07:39:17 +0200
Subject: [PATCH 5/7] less text

---
 .github/workflows/codspeed.yml                |  65 ++--
 benchmarks/PLAN.md                            | 290 +++++-------------
 benchmarks/README.md                          |  34 ++
 benchmarks/_scale.py                          |  23 +-
 benchmarks/compare_baseline.py                |  66 ++--
 benchmarks/conftest.py                        |  37 +--
 benchmarks/requirements-bench.txt             |  12 +-
 benchmarks/test_arrow_perf.py                 |  63 +---
 benchmarks/test_cardinality_perf.py           |  34 +-
 benchmarks/test_concurrency_perf.py           |  63 ++--
 benchmarks/test_engine_control_perf.py        |  36 +--
 benchmarks/test_fetch_perf.py                 |  53 +---
 benchmarks/test_ingest_native_perf.py         |  29 +-
 benchmarks/test_ingest_numpy_perf.py          |  62 ++--
 benchmarks/test_pandas_perf.py                |  67 +---
 benchmarks/test_produce_numpy_perf.py         |  86 ++----
 .../test_relational_construction_perf.py      |  22 +-
 benchmarks/test_types_roundtrip_perf.py       |  30 +-
 benchmarks/test_udf_perf.py                   |  46 +--
 pyproject.toml                                |  10 +-
 tests/fast/test_binding_pressure_leak.py      |  12 +-
 21 files changed, 337 insertions(+), 803 deletions(-)
 create mode 100644 benchmarks/README.md

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index 2b7b1664..b80323fc 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -1,30 +1,18 @@
-# Perf-regression benchmarks: instruction-count (Callgrind) gating against a COMMITTED baseline.
+# Instruction-count (Callgrind) perf-regression gate against a COMMITTED baseline. No CodSpeed account/token/runner:
+# compare_baseline.py parses raw callgrind dumps and diffs each benchmark against benchmarks/baseline.json. Counts
+# are near-deterministic with PYTHONHASHSEED pinned (~0.1% noise), so the 5% gate threshold sits far above it.
+# Details + rationale: benchmarks/README.md and benchmarks/PLAN.md.
 #
-# NO CodSpeed account/dashboard/token/runner. pytest-codspeed's hooks call callgrind_dump_stats_at(<uri>) per
-# benchmark, so a self-hosted `valgrind --tool=callgrind` run writes one dump per benchmark, headed by
-# `desc: Trigger: Client Request: <uri>` with the count on `totals:` (events: Ir). benchmarks/compare_baseline.py
-# parses those dumps and diffs each benchmark against benchmarks/baseline.json (the committed instruction-count
-# baseline). Counts are near-deterministic under Callgrind with PYTHONHASHSEED pinned (~0.1% noise observed;
-# often bit-identical), so a 5% default gate threshold sits far above noise. Validated on a Linux+valgrind box.
+# Triggers: nightly schedule + manual workflow_dispatch (no pull_request/push). A dispatch on a feature branch
+# compares that branch's counts vs the baseline.json committed on it, answering "did my branch regress vs main".
 #
-# TRIGGERS: nightly `schedule` + manual `workflow_dispatch`. No pull_request/push (and no `paths:` -- neither
-# schedule nor dispatch honors it). A dispatch on a feature branch compares that branch's benchmark counts vs the
-# baseline.json committed on the branch (i.e. main's baseline), answering "did my branch regress vs main".
+# Modes (workflow_dispatch input `regen`):
+#   regen=false (default) -> COMPARE + report. Report-only for now (never fails); flip to --enforce once trusted.
+#   regen=true            -> write a fresh baseline.json + upload as an artifact to commit deliberately. Bump
+#                            requirements-bench.txt FIRST (separate commit) if the pins should change.
 #
-# MODES (workflow_dispatch input `regen`):
-#   regen=false (default) -> COMPARE: run + diff vs baseline.json, print a report. REPORT-ONLY for now (never
-#                            fails the job); flip compare_baseline.py to --enforce once trusted.
-#   regen=true            -> REGENERATE: run + write a fresh baseline.json (per-bench counts + provenance meta +
-#                            Option-B binding fractions/auto-move) and upload it as an artifact to commit
-#                            deliberately. Bump benchmarks/requirements-bench.txt in a separate commit FIRST if
-#                            the pins should change, then regen so the baseline matches the committed pins.
-#
-# The concurrency module is EXCLUDED from the Callgrind sweep: Callgrind serializes threads, so its signal
-# (wall-clock GIL contention) is meaningless here; it stays a local walltime tool.
-#
-# MEMORY MODE (a second Callgrind sweep for O(rows) produce peak-RSS) is DESIGNED but DEFERRED -- see PLAN.md.
-#
-# Valgrind is slow (~20-50x); timeout-minutes is a conservative guess -- calibrate after the first CI run.
+# The concurrency module is excluded from the sweep (Callgrind serializes threads, so its signal is meaningless).
+# Memory mode (a second sweep for produce peak-RSS) is deferred (see PLAN.md).
 
 name: Benchmarks
 
@@ -45,17 +33,14 @@ concurrency:
 jobs:
   benchmarks:
     runs-on: ubuntu-latest
-    timeout-minutes: 90 # measured: ~25 min Callgrind sweep at BENCH_SCALE=10 (12-core Linux) + cold build ~10 min; margin for CI
+    timeout-minutes: 90 # ~25 min sweep at BENCH_SCALE=10 (12-core Linux) + ~10 min cold build; margin for CI
     permissions:
       contents: read
     env:
-      PYTHONHASHSEED: "0" # pin hash randomization so dict/struct paths give stable instruction counts (INFRA-6)
-      CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks (the callgrind_dump_stats_at markers)
-      # env-gated row counts (INFRA-4): shrink the O(rows)/per-row-object benchmarks so the Callgrind sweep fits
-      # under timeout-minutes. Local runs leave this unset -> full N. Recorded in baseline.json meta.bench_scale;
-      # a baseline is only comparable to a run at the SAME scale. Calibrated on a 12-core Linux+valgrind box:
-      # BENCH_SCALE=10 -> ~25 min full sweep, and the Option-B move-list matches full-N (fractions shift slightly
-      # but stay the same side of the cutoff). Most benches floor at 20k rows (_scale.FLOOR), still row-dominated.
+      PYTHONHASHSEED: "0" # stable instruction counts for dict/struct paths
+      CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks
+      # shrink the O(rows) benches so the sweep fits under timeout-minutes. Local runs leave this unset -> full N.
+      # Recorded in baseline.json meta.bench_scale; a baseline only compares to a run at the SAME scale.
       BENCH_SCALE: "10"
     steps:
       - uses: actions/checkout@v4
@@ -95,22 +80,19 @@ jobs:
         run: |
           # step 1: build deps only (needed for --no-build-isolation), no project
           uv sync --only-group build --no-install-project -p 3.13
-          # step 2: build+install the project (release) + build group, WITHOUT the heavy default `dev` group
-          # (torch/tensorflow/pyspark). uv.lock is gitignored, so it is deliberately NOT relied on for bench deps.
+          # step 2: build+install the project (release) + build group, without the heavy default `dev` group
           uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13
-          # step 3: install the FROZEN, committed bench pins (exact ==). Regenerated deliberately with the baseline
-          # (source list: pyproject [dependency-groups] bench), so the only cross-run delta is the binding.
+          # step 3: the frozen bench pins (exact ==), so the only cross-run delta is the binding
           uv pip install -r benchmarks/requirements-bench.txt
 
       - name: Collect gate node-ids
-        # the gate/informational split (conftest markers) classifies which benchmarks are gate-able; regen uses it
+        # the gate/informational marker split; regen uses it to classify each benchmark
         run: uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \
              | grep '::' > gate_list.txt || true
 
       - name: Run benchmarks under Callgrind (per-benchmark instruction counts)
-        # ONE sweep over all gate+informational benchmarks EXCEPT the concurrency module (Callgrind serializes
-        # threads -> its wall-clock signal is meaningless and it is expensive). Each benchmark emits a callgrind
-        # dump keyed by its uri. The pytest-codspeed hooks obj-skip libpython, so counts are clean.
+        # ONE sweep over gate+informational EXCEPT the concurrency module (thread-serialized, expensive). Each
+        # benchmark emits a callgrind dump keyed by its uri.
         run: |
           mkdir -p profiles
           CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \
@@ -121,8 +103,7 @@ jobs:
 
       - name: Compare against committed baseline (report-only)
         if: ${{ !inputs.regen }}
-        # report-only for now: prints the per-benchmark delta table and NEVER fails the job. Add --enforce here
-        # once trusted to fail on a gate regression (informational benches never fail).
+        # report-only: prints the delta table, never fails the job. Add --enforce once trusted.
         run: |
           uv run --no-sync python benchmarks/compare_baseline.py compare \
             --profiles profiles --baseline benchmarks/baseline.json \
diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
index 54786083..835aef7d 100644
--- a/benchmarks/PLAN.md
+++ b/benchmarks/PLAN.md
@@ -1,77 +1,40 @@
-# CodSpeed Benchmark Suite Plan — duckdb-python binding hot paths
-
-Grounded in the binding source on `perf/codspeed` (`src/`). File:line citations are to this tree.
-
-## 0. Conventions (from the existing 3 modules, keep these)
-
-- Function-scoped `con` fixture; module-scoped input-data fixtures.
-- READ = `SELECT sum(col) / sum(length(col))` (never `count(*)`, which is answered from metadata).
-- WRITE = eager materialize or fully drain the lazy reader.
-- Warm the engine once (`con.execute(query).fetchall()`) before `benchmark(...)` so first-call import-cache population is not charged to the measured region.
-- Pin numpy/pandas/pyarrow/polars identically across A/B so deltas are pure binding cost.
-
-Ranking: **P0** = on a known regression path or the cutover-reworked code (narrow-numeric common case); **P1** = high-traffic conversion / per-element Python work; **P2** = correctness-relevant, lower traffic or engine-dominated.
-
-## (a) Prioritized scenarios
-
-### PRODUCE (duckdb -> external) — highest regression risk
-
-Row path: `DuckDBPyResult::Fetchone` (`src/pyresult.cpp:126-151`) builds a `PyUtil::TupleBuilder` (`src/include/duckdb_python/pyutil.hpp:101-125`) per row and calls `PythonObject::FromValue` (`src/native/python_objects.cpp:474`) per cell. O(rows x cols). This is the shape of the historical ~15% fetchall regression.
-
-| # | Scenario | SQL / setup | Measures | Pri |
-|---|----------|-------------|----------|-----|
-| P0-1 | fetchall int64 1col | `SELECT i::BIGINT a FROM range(1_000_000)` | TupleBuilder + FromValue int (`python_objects.cpp:489`) | P0 |
-| P0-2 | fetchall int 2-4col | `SELECT i::BIGINT,(i+1)::BIGINT,(i*2)::INTEGER FROM range(1_000_000)` | TupleBuilder scaling w/ col count | P0 |
-| P0-3 | fetchall double | `SELECT (i*1.5)::DOUBLE FROM range(1_000_000)` | FromValue double | P0 |
-| P0-4 | fetchall varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | FromValue VARCHAR string copy (`python_objects.cpp:515`) | P1 |
-| P0-5 | fetchone loop (overhead) | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(100_000)` | per-call Fetchone + chunk-boundary FetchNext + GIL cycle | P0 |
-| P0-6 | fetchmany batched | as P0-5, `fetchmany(10_000)` loop | Fetchmany loop | P1 |
-| P1-7 | **df() numeric (reworked)** | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(1_000_000)` | FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular, `HAS_NULLS=false/PANDAS=true` branch (`array_wrapper.cpp:415-425`) | P0 |
-| P1-8 | **df() numeric WITH NULLS** | `SELECT CASE WHEN i%10=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | `HAS_NULLS=true` + masked_array build (`array_wrapper.cpp:743-757`) + masked->pd.NA rewrite (`pyresult.cpp:362-393`) | P0 |
-| P1-9 | fetchnumpy numeric | as P1-7 | FetchNumpyInternal without the DataFrame wrap | P1 |
-| P1-10 | df() varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | StringConvert PyUnicode_FromStringAndSize per row (`array_wrapper.cpp:164-181`) | P1 |
-| P1-11 | df() timestamp | `SELECT TIMESTAMP '2020-01-01'+(i*INTERVAL 1 SECOND) FROM range(1_000_000)` | TimestampConvertNano + ConvertDateTimeTypes (`pyresult.cpp:299`) | P1 |
-| P1-13 | to_record_batch_reader drained | `range(1_000_000)`, `to_record_batch_reader(100_000)` | lazy stream (`pyresult.cpp:573`), iterate + sum num_rows | P1 |
-| P2-15 | torch()/tf() numeric | `range(500_000)` | FetchNumpyInternal + per-col from_numpy (`pyresult.cpp:405-421`) | P2 |
-| P2-16 | fetch_df_chunk | large query, loop `fetch_df_chunk()` | FetchDFChunk per chunk (`pyresult.cpp:400`) | P2 |
-| P1-17 | fetchall LIST<int> | `SELECT [i,i+1,i+2] FROM range(200_000)` | FromValue LIST recursion (`python_objects.cpp:651`) | P1 |
-| P1-18 | fetchall STRUCT | `SELECT {'a':i,'b':i+1} FROM range(200_000)` | FromStruct dict build (`python_objects.cpp:390-414`) | P1 |
-| P1-20 | fetchall DECIMAL | `SELECT (i::DECIMAL(18,3))/1000 FROM range(200_000)` | Python `Decimal()(val.ToString())` per row (`python_objects.cpp:507`) | P1 |
-| P1-21 | fetchall TIMESTAMPTZ | `SELECT (TIMESTAMPTZ '2020-01-01'+(i*INTERVAL 1 SECOND)) FROM range(100_000)` | pytz localize+astimezone per row (`python_objects.cpp:567-573`) | P1 |
-| P2-22 | fetchall NULL-heavy | `SELECT CASE WHEN i%2=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | validity branch + nb::none (`pyresult.cpp:142`) | P2 |
-| P2-23 | fetchall BLOB | `SELECT ('blob_'||i)::BLOB FROM range(200_000)` | nb::bytes (`python_objects.cpp:517`) | P2 |
-
-### INGEST (external -> duckdb)
-
-| # | Scenario | Setup | Path | Pri |
-|---|----------|-------|------|-----|
-| I0-1 | **pandas numpy int64/double** | DataFrame 1M | NumpyScan::Scan ScanNumpyMasked zero-copy when stride==sizeof(T); double NaN->NULL loop (`numpy_scan.cpp:76-112,236-246`) reworked | P0 |
-| I0-2 | **pandas numpy object-string** | `pd.array(strings,dtype=object)` 500k | NumpyScan STRING/OBJECT: per-row isinstance, PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode transcode (`numpy_scan.cpp:353-452`) reworked | P0 |
-| I1-3 | pandas object bind-time analyzer | object col 100k+ | Pandas::Bind -> PandasAnalyzer::Analyze samples rows GetItemType ladder (`analyzer.cpp:356-460`). Per-BIND overhead, independent of rows (count(*) ok here) | P1 |
-| I1-4 | pandas arrow-backed | pd.ArrowDtype 1M | ToArrowTable -> arrow scan (`pyconnection.cpp:1799`) | P1 |
-| I0-5 | arrow Table | 1M | CreateArrowScan PythonTableArrowArrayStreamFactory near-zero-copy (`python_replacement_scan.cpp:55-83`) | P1 |
-| I1-6 | arrow RecordBatchReader | from_batches | same factory, streaming (distinct from Table) | P1 |
-| I1-7 | polars DataFrame | 1M | entry.to_arrow() one-time + arrow scan (`replacement_scan.cpp:150-156`) | P2 |
-| I1-8 | numpy ndarray + dict-of-arrays | np.arange | replacement scan -> pandas_scan (`replacement_scan.cpp:163-200`) | P2 |
-| I1-9 | **native values() list-of-tuples** | `con.values([(i,i*1.5,'s') for i in range(100_000)])` | Values -> TransformPythonValue per cell, GetPythonObjectType ladder (`python_conversion.cpp:402-454,1075`) | P1 |
-| I1-10 | native list-of-dicts | list of dicts | TransformDictionaryToStruct recursion (`python_conversion.cpp:119`) | P2 |
-| I1-11 | executemany params | INSERT ?,?  100k sets | ExecuteMany loop, TransformPythonValue per set (`pyconnection.cpp:500-544`) | P2 |
-| I2-12 | read_parquet/csv/json | a file | arg marshal -> TableFunction under GIL-release; engine-dominated | P2 |
-
-### UDF (`src/python_udf.cpp`) — zero coverage today
-
-| # | Scenario | Setup | Path | Pri |
-|---|----------|-------|------|-----|
-| U0-1 | **scalar native 1 int arg** | `def f(x):return x+1`, `SELECT sum(f(i::BIGINT)) FROM range(1_000_000)` | per-row TupleBuilder args + PyObject_CallObject + TransformPythonObject result (`python_udf.cpp:320-384`) | P0 |
-| U0-2 | scalar native 2-3 args | `def f(a,b):return a+b` 2 cols 1M | arg-tuple scaling | P1 |
-| U1-3 | scalar native string | `def f(s):return s.upper()` 500k | VARCHAR in + string out | P1 |
-| U1-4 | scalar native NULL inputs | 50% NULL, DEFAULT handling | SetNull short-circuit (`python_udf.cpp:340-350`) | P1 |
-| U1-6 | **vectorized arrow UDF** | `type='arrow'` pc.add 1M | ConvertDataChunkToPyArrowTable + call + ConvertArrowTableToVector cast (`python_udf.cpp:33-144,225`) | P0 |
-| U2-7 | vectorized NULL slicing | DEFAULT + nulls | selvec compaction/reconstruction (`python_udf.cpp:197-305`) | P2 |
-
-## (b) Type x direction matrix
-
-Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), OUT-arrow.
+# Benchmark suite plan
+
+Design rationale for the binding micro-benchmarks. The suite is implemented in `benchmarks/`; CI lives in
+`../.github/workflows/codspeed.yml`; conventions, markers, and the two data-pattern traps are in
+[README.md](README.md).
+
+Priority: **P0** = known-regression or cutover-reworked path (narrow-numeric common case); **P1** = high-traffic
+conversion or per-element Python work; **P2** = correctness-relevant, lower-traffic or engine-dominated.
+
+## Scenarios
+
+PRODUCE (duckdb to Python) is the highest regression risk: `Fetchone` builds a `TupleBuilder` per row and calls
+`FromValue` per cell (O(rows x cols), the shape of the historical ~15% fetchall regression).
+
+- **OUT-row** (`test_fetch_perf`, `test_types_roundtrip_perf`): fetchall / fetchone / fetchmany per type. P0
+  narrow numeric; P1 varchar, list, struct, and the expensive per-row types (decimal `Decimal()`, timestamptz
+  pytz, hugeint string round-trip, uuid). Small-N `*_gate` probes isolate the compile+fetch fixed cost.
+- **OUT-col** (`test_produce_numpy_perf`): df() / fetchnumpy() reworked columnar path. P0 numeric no-null vs
+  REAL-null (the masked_array branch); plus string, timestamp, and wide-internal (hugeint/uuid/decimal128).
+- **OUT-arrow / polars** (`test_arrow_perf`): to_arrow_table / reader / pl(). Informational (engine-parallel,
+  GIL-released).
+- **Cardinality** (`test_cardinality_perf`): a LIMIT-n sweep giving a clean per-row conversion slope.
+
+INGEST (Python to duckdb):
+
+- **numpy / pandas** (`test_ingest_numpy_perf`, `test_pandas_perf`): numpy-backed scan (NaN-to-NULL, masked),
+  object-string transcode ladder, arrow-backed zero-copy, and the per-bind PandasAnalyzer.
+- **arrow** (`test_arrow_perf`): Table + RecordBatchReader + dictionary sweep.
+- **native** (`test_ingest_native_perf`): values() list/tuple/dict per-cell TransformPythonValue, executemany.
+
+UDF (`test_udf_perf`, zero coverage before this suite): native scalar per-row (P0, the biggest untested per-call
+path) and vectorized arrow per-chunk.
+
+## Type x direction matrix
+
+Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper),
+OUT-arrow.
 
 | Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow |
 |------|-----------|----------|---------|---------|-----------|
@@ -79,138 +42,49 @@ Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (Fro
 | double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 |
 | varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 |
 | bool | P2 | P1 | P2 | P1 | P2 |
-| decimal | P2 | n/a | **P1** (Python Decimal) | P1 | P2 |
+| decimal64/128 | P2 | n/a | **P1** (Python Decimal) | P1 | P2 |
 | date | P2 | P1 | P1 | P1 | P2 |
-| timestamp | P1 | **P1** | P1 | P1 | P1 |
-| timestamptz | P2 | P1 | **P1** (pytz/row) | P1 | P2 |
-| time/interval | P2 | P1 | P1 | P1 | P2 |
-| LIST/ARRAY | P2 | P2 | P1 (recursive) | P1 | P2 |
-| STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 |
-| MAP | P2 | P2 | P2 | P2 | P2 |
-| blob | P2 | P2 | P2 | P2 | P2 |
-| NULL-heavy | - | **P1** | P2 | **P0** (masked_array) | P1 |
-| enum/category | - | P1 | P1 | P1 | P2 |
-
-Minimum viable to ship: int64, double, varchar, timestamp, decimal, LIST, STRUCT, NULL-heavy in OUT-row and OUT-col; int64/double/varchar in IN-numpy.
-
-## (c) Gaps vs the existing 3 modules
-
-Covered well: OUT-row narrow numeric, OUT-arrow/polars numeric+string, pandas IN/OUT numpy-vs-arrow numeric+string, fetchone-loop numeric.
-
-Missing:
-1. **PRODUCE columnar reworked path under-covered** — df() only 500k, only numeric/string, never with NULLS (the masked-array branch is exactly what changed). Add df-with-nulls, fetchnumpy, df-timestamp.
-2. **UDFs: zero coverage** — whole subsystem (python_udf.cpp), native per-row is the single biggest untested per-call-overhead path. Add U0-1/U0-2/U1-3/4/U1-6.
-3. **Native Python ingest: zero coverage** — values()/list-of-tuples/list-of-dicts/executemany via TransformPythonValue. Add I1-9/10/11.
-4. **Expensive scalar OUT-row types untested** — decimal, timestamptz, interval, isolated LIST/STRUCT/MAP. Add P1-17..21.
-5. **Object-column bind-time analyzer untested** — PandasAnalyzer sampling, per-bind cost. Add I1-3.
-6. **Size regimes thin** — add 1M throughput AND 1-row overhead variants.
-7. **Arrow ingest only pa.table** — add RecordBatchReader, polars, numpy-ndarray ingest.
-8. **NULL-heavy IN-numpy untested** (ScanNumpyMasked + ApplyMask).
-
-## (d) Suite organization + CodSpeed mechanics
-
-```
-benchmarks/
-  test_fetch_perf.py            # EXISTING — OUT-row. Add: nested, decimal, timestamptz, null-heavy, 1M+1-row
-  test_arrow_perf.py            # EXISTING — add RecordBatchReader ingest, materialized vs stream
-  test_pandas_perf.py           # EXISTING — add df()-with-nulls, datetime, fetchnumpy, analyzer bind
-  test_produce_numpy_perf.py    # NEW — df()/fetchnumpy/fetch_df_chunk reworked columnar, per-type, null vs no-null
-  test_ingest_native_perf.py    # NEW — values()/list-of-tuples/list-of-dicts/executemany
-  test_ingest_numpy_perf.py     # NEW — numpy ndarray / object-string scan / analyzer bind
-  test_udf_perf.py              # NEW — scalar native + vectorized arrow UDFs
-  test_types_roundtrip_perf.py  # NEW — type x direction matrix sweep, parametrized
-```
-One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable).
-
-> **Note (reconciled to the implemented model).** The prose below originally described a per-PR CodSpeed
-> commit-diff gate. That is NOT how the suite works now. The implemented model is: **nightly `schedule` +
-> manual `workflow_dispatch`** (no per-PR trigger, no CodSpeed account/token/runner), a **self-hosted
-> `valgrind --tool=callgrind`** sweep that emits one dump per benchmark, and **`compare_baseline.py`** diffing
-> those counts against a **committed `benchmarks/baseline.json`**. See `.github/workflows/codspeed.yml`.
-
-### Walltime vs instruction-count
-
-- **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`.
-- **CI: instruction-count via self-hosted Callgrind (Linux)**, near-deterministic (~0.1% noise with
-  `PYTHONHASHSEED=0`; often bit-identical) — compared against the committed baseline, **report-only** for now
-  (flip `compare_baseline.py` to `--enforce` when trusted).
-
-### Marker split + committed-baseline gate (INFRA-1 / Phase-3)
-
-- Every benchmark carries exactly one of `@pytest.mark.gate` / `@pytest.mark.informational` (registered in
-  `conftest.py`). **gate** = binding-dominated, instruction-count-meaningful (fetchone loop, fetchall/fetchmany,
-  df()/fetchnumpy, native UDF per-call, native values()/executemany ingest, analyzer bind, per-element
-  converters). **informational** = engine/library/streaming-diluted, reported but never gated
-  (`to_arrow_table`/`pl()`/`to_pandas` GIL-released re-runs; registered-frame `SELECT sum()` reads;
-  streaming drains; the concurrency module).
-- **Engine floors + Option-B (MEAS-1).** `test_engine_control_perf.py` measures `SELECT sum(...) FROM range(N)`
-  with no Python egress — the engine floor. At baseline **regen**, each mapped numeric-produce gate's binding
-  fraction `= 1 - floor_Ir/bench_Ir` is computed; a gate below the ~25% cutoff is **auto-moved to
-  informational** (a threshold on an engine-diluted total is not meaningful) and the fraction is stored in
-  `baseline.json` for audit. MEAS-1 showed OUT-row fetch and UDFs are ~all binding (stay gate); numeric
-  produce (`df()`/`fetchnumpy`) is a bulk memcpy of ~engine magnitude (auto-move candidate).
-- **Small-N gates are compile+fetch fixed-cost**, not pure fetch (MEAS-1: ~60% compile+engine at `range(2048)`).
-- **Engine-bump guard.** `compare_baseline.py` compares the committed submodule SHA against the baseline's; if
-  they differ, engine-inclusive deltas may reflect the engine bump, so gate deltas are not enforced (regen the
-  baseline for the new engine).
-- **Reproducibility.** `benchmarks/requirements-bench.txt` (frozen `==` pins, from the `[dependency-groups]
-  bench` list) + `benchmarks/baseline.json` are the co-regenerated pair; CI installs the frozen pins (NOT the
-  gitignored `uv.lock`), so the only cross-run delta is the binding.
-
-Still **informational / do NOT gate** (engine/parallel/IO/library dominated):
-- to_arrow_table / pl() on materialized results (PromoteMaterializedToArrow re-runs GIL-released).
-- registered-frame `SELECT sum()` ingest reads (engine aggregate dominates).
-- read_csv/parquet/json; GIL-per-chunk streaming drains.
-
-### New coverage dimensions (beyond the converter surface)
-
-- **Concurrency/GIL** (`test_concurrency_perf.py`, informational/walltime): threads {1,4,8} over a **multi-batch**
-  arrow scan / pandas scan / native + arrow UDF. EXCLUDED from the Callgrind sweep (Callgrind serializes threads
-  → its wall-clock contention signal is meaningless there); it is a local walltime tool.
-- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain psutil RSS + object-count
-  ratio test (not a codspeed benchmark) for the object-pinning paths (register/unregister, UDF create/run/remove,
-  executemany). Runs in the normal test suite.
-- **Memory mode (DEFERRED).** A second Callgrind sweep (`--codspeed-mode=memory`) over the O(rows) produce paths
-  for peak-RSS, feeding the same baseline model, is DESIGNED but not implemented this round (roughly doubles the
-  CI cost; nightly-only when added). The `test_mem_df_with_nulls` tracemalloc guard stays as a local signal until
-  then (convert it to an A/B delta when memory mode lands).
-
-### Two code-grounded gotchas
-- **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing.
-- **IN-numpy string benchmarks need mixed ASCII + non-ASCII + a NaN/pd.NA/None sentinel**: the scan zero-copies compact-ASCII (`numpy_scan.cpp:416-418`) but transcodes otherwise (`numpy_scan.cpp:429-446`); ASCII-only misses the transcode + null-detection ladder.
-
-## (e) Cross-check vs iqmo-org/bareduckdb
-
-Source read live from `iqmo-org/bareduckdb` `main`, subdir `benchmark/` (GitHub API + raw files).
-
-### What their suite covers / how it is organized
-
-A **SQL-file-driven A/B harness comparing two clients** — production `duckdb` vs `bareduckdb` (the C-API / free-threading prototype) — not a binding micro-bench.
-
-- `benchmark.py` orchestrates: discovers `cases/**/*.sql`, picks the matching `data/DATA*` dir, and runs each `(sql x parquet-file x db_mode)` as a fresh `uv run run_case.py` **subprocess**. `DBMODES=[duckdb, bareduckdb_capsule, bareduckdb_arrow]`; active `READ_MODES=[arrow_table]` (parquet/arrow_reader present but off).
-- `run_case.py` per case: fresh `connect()`, `pyarrow.parquet.read_table(file)` + `conn.register(name, table)`, then `conn.sql(query).to_arrow_table()`, timed with `time.perf_counter()` and peak RSS via `resource.getrusage`. **No warmup, single run, result discarded.** Universal ingest = register(arrow table); universal produce = `to_arrow_table()`.
-- `data/`: `DATA_RANGE` = single BIGINT `range(N)` at 5M / 100M; `DATA_CATEGORY_DATE_PRICE` = (VARCHAR category, DATE, BIGINT price) cross-join at 36M / 3.6B.
-- `cases/`: `types/` (decimal `DECIMAL(28,12)`, hugeint `HUGEINT`, mixed_types `HUGEINT+uuid()+DECIMAL(28,6)+VARCHAR` in one row, timestamp `TIMESTAMP+INTERVAL`, varchar_long ~100-char), `limit/` (LIMIT 100 / 1k / 10k / 100k top-N — a result-cardinality sweep), `filter/`, `groups/`, `window/`, `threading/` (parallel group/window/self-join/registered-arrow-scan), plus a separate `stats/` harness.
-
-Their INGEST is arrow-only and their PRODUCE is arrow-only; they have **no** fetchall/fetchone, df()/numpy, pandas/numpy/native/polars ingest, or UDF coverage — so our binding suite is far broader on binding-specific surfaces. Their genuine deltas are concentrated in the PRODUCE/types dimension and in engine/threading workloads.
-
-### DELTA — actionable additions/changes
-
-- **[BINDING] Add HUGEINT to the produce matrix (currently absent).** `types/hugeint.sql`, `mixed_types.sql`. OUT-row `FromValue` HUGEINT does `PyLong_FromString(val.GetValue<string>())` — a per-value string round-trip (`python_objects.cpp:500`), unlike narrow int; OUT-col casts hugeint->double (`array_wrapper.cpp:662`); OUT-arrow is a distinct decimal128/int128 export. Scenario: `SELECT i::HUGEINT FROM range(1_000_000)` through fetchall / df / to_arrow_table. Add a `hugeint` row to the type x direction matrix.
-- **[BINDING] Add UUID to the produce matrix (absent).** `mixed_types.sql` selects `uuid()`. OUT-row builds a Python `uuid.UUID` per row (`python_objects.cpp:708-711`); OUT-col uses `UUIDConvert` (`array_wrapper.cpp:230-244`). Scenario: `SELECT gen_random_uuid() FROM range(200_000)` through fetchall / df / to_arrow_table. Add a `uuid` row to the matrix.
-- **[BINDING] Add a 128-bit-internal DECIMAL variant.** Our P1-20 uses `DECIMAL(18,3)` (int64 internal); bareduckdb uses `DECIMAL(28,12)` / `(28,6)` (int128 internal), hitting `ConvertDecimalInternal<hugeint_t>` (`array_wrapper.cpp:571`) and the wider `PyDecimalCastSwitch`/`Decimal()` round-trip. Run both an int64-internal and an int128-internal decimal.
-- **[BINDING] Add a heterogeneous mixed-type row (new scenario).** `SELECT i::HUGEINT, gen_random_uuid(), (i*1.5)::DECIMAL(28,6), ('string_'||i) FROM range(200_000)` through fetchall and df. Exercises per-cell type dispatch in the `Fetchone` column loop (`pyresult.cpp:140-148`) — a different branch/cache profile than our homogeneous columns (P0-1..3 are single-type).
-- **[BINDING] Add a long-varchar (>64 char) variant** alongside the short `'str_value_'||i`. `'...'||repeat('data ',10)||i::VARCHAR` (~100 chars). Short strings are copy-cheap/overhead-bound; long strings shift OUT-row/OUT-col string copy and the IN-numpy `DecodePythonUnicode` transcode (`numpy_scan.cpp:429-446`) toward copy-bound. Apply to OUT-row, OUT-col, IN-numpy varchar scenarios.
-- **[BINDING] Adopt their result-cardinality (top-N) sweep as a produce axis.** `SELECT * FROM <fixed source> ORDER BY k DESC LIMIT n` for n in {100, 1k, 10k, 100k}, fetched via fetchall / df / to_arrow_table with the source held constant. Holds engine work ~constant while sweeping rows-materialized-to-Python → a clean per-row conversion slope, and the small-n end is an ideal noise-free instruction-count gate (overhead regime). Cleaner than varying `range()` (which also changes scan cost).
-- **[BINDING] Broaden the OUT-arrow column of the matrix.** Their entire produce path is `to_arrow_table`, and they push hugeint / decimal128 / uuid / timestamp / long-varchar / mixed-row through it — exactly the arrow-export converters (ArrowConverter/appender for int128/uuid/decimal128) our OUT-arrow column currently leaves at P1/P2 numeric+string. Add these types to OUT-arrow.
-- **[BINDING, hard to gate] registered-arrow-scan under parallelism.** `threading/registered_arrow_scan.sql` pulls batches from `PythonTableArrowArrayStreamFactory::Produce` (binding code in `arrow/arrow_array_stream.cpp`) across engine threads holding/releasing the GIL — a real binding-contention risk. Keep as walltime-informational only; too noisy for an instruction-count gate.
-- **[ENGINE] `filter` / `groups` / `window` / `self_join` pure-engine workloads** — out of scope for a binding gate; the binding only wraps them with register + to_arrow_table, and their consume (a small aggregate) is trivial so the measurement is ~pure engine. Note, do not add to the binding suite.
-- **[ENGINE] 100M / 3.6B-row scale** — too slow / IO+engine-dominated / walltime-noisy for a codspeed gate; keep our regimes <= ~1M.
-- **[ENGINE] threading / free-threading category** — the production client does not support free-threading (CLAUDE.md); deprioritize for this suite.
-
-### Methodology notes for our codspeed mechanics
-
-- **Adopt: result-cardinality (LIMIT) axis** (above) — a clean per-row conversion-cost slope and a natural small/large pairing for the instruction-count-gate-vs-walltime split already in (d).
-- **Consider adopting: a peak-memory guard** for the O(rows) produce paths. bareduckdb tracks `getrusage` max RSS; codspeed walltime tracks neither memory nor allocations. A conversion regression is often memory-shaped (cf. the recorded fetchall +8% list->tuple edge-copy; the df() masked_array branch) — add a separate `getrusage`/memray delta assertion on `fetchall` and `df()`-with-nulls as a secondary signal, since a pure-timing gate can miss it.
-- **Do NOT adopt their anti-patterns:** no-warmup + single subprocess run charges one-time import-cache population into the measurement and yields no statistics — bad for steady-state binding isolation. Our warmup + codspeed repeated rounds are correct; keep them.
-- **Consistent with us:** their full-consume is eager `to_arrow_table()` and never `count(*)` — matches our discipline. Caveat: for their aggregate cases the arrow output is tiny, so the consume is trivial and the run is engine-only; our produce benchmarks must keep the materialization the heavy part (large output / top-N with large LIMIT).
+| timestamp(tz) | P1 | P1 | **P1** (pytz/row) | P1 | P1 |
+| LIST/STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 |
+| hugeint/uuid | P2 | P2 | **P1** (round-trip) | P1 | P2 |
+| blob/map | P2 | P2 | P2 | P2 | P2 |
+| NULL-heavy | n/a | **P1** | P2 | **P0** (masked_array) | P1 |
+
+## Mechanics
+
+- **Walltime vs instruction-count.** Local A/B is walltime only (no Valgrind on macOS arm64). CI is
+  instruction-count via self-hosted Callgrind (near-deterministic, PYTHONHASHSEED pinned), diffed against a
+  committed baseline. Report-only until trusted.
+- **Marker split + auto-move.** Every benchmark is `gate` or `informational` (see README). At baseline regen,
+  each numeric-produce gate's binding fraction `= 1 - floor_Ir / bench_Ir` is computed against its engine floor
+  (`test_engine_control_perf`); a gate below the ~25% cutoff is auto-moved to informational (a threshold on an
+  engine-diluted total is not meaningful). OUT-row fetch and UDFs are ~all binding; numeric produce is a bulk
+  memcpy of ~engine magnitude (auto-move candidate).
+- **Guards.** `compare_baseline.py` warns and stops enforcing when BENCH_SCALE, the pin file, or the DuckDB
+  submodule SHA differs from the baseline's (any of those makes the counts non-comparable).
+- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain RSS + object-count test for the
+  object-pinning paths, since a per-call refcount imbalance is invisible to a steady-state benchmark.
+- **Memory mode** (a second Callgrind sweep for O(rows) produce peak-RSS) is designed but deferred; the
+  `test_mem_df_with_nulls` tracemalloc guard is the local stand-in.
+
+## Cross-check vs iqmo-org/bareduckdb
+
+Their suite is a SQL-file-driven A/B harness comparing two clients (production `duckdb` vs the C-API prototype). It is
+arrow-in / arrow-out only, with no fetchall/df/numpy/native/UDF coverage, so our binding suite is far broader; their
+genuine deltas concentrate in PRODUCE/types. Actionable additions their suite suggests:
+
+- **hugeint / uuid in the produce matrix** (they select both): OUT-row does a per-value string round-trip, distinct
+  from narrow int. Now in `test_produce_numpy_perf` / `test_fetch_perf`.
+- **int128-internal decimal** (`DECIMAL(28,x)`) alongside the int64-internal one: hits a wider cast path. Added.
+- **heterogeneous mixed-type row**: exercises per-cell type dispatch in the Fetchone loop, unlike homogeneous
+  columns. Added as `test_fetchall_mixed_wide`.
+- **long varchar (>64 char)** alongside the short string: shifts string copy / transcode toward copy-bound. Added
+  as `varchar_long` in the matrix.
+- **result-cardinality (top-N) sweep**: holds engine work ~constant while sweeping rows-to-Python. Adopted as
+  `test_cardinality_perf` (plain LIMIT, no ORDER BY; the sort swamped the signal).
+- **peak-memory guard** on the O(rows) produce paths: a conversion regression is often memory-shaped. Partially
+  covered by the tracemalloc guard; full coverage waits on memory mode.
+
+Out of scope (theirs, not adopted): pure-engine filter/group/window workloads; 100M+ row scale (IO/engine
+dominated); the free-threading category (unsupported by this client). Do NOT adopt their no-warmup single-run
+methodology: it charges one-time import-cache population to the measurement and yields no statistics.
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..ca8f8355
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,34 @@
+# Benchmark suite
+
+CodSpeed micro-benchmarks for the binding hot paths (produce, ingest, UDF).
+Design rationale: [PLAN.md](PLAN.md). CI: [../.github/workflows/codspeed.yml](../.github/workflows/codspeed.yml).
+
+## Markers
+
+Every benchmark carries exactly one (registered in `conftest.py`):
+
+- **gate**: binding-dominated, GIL-held, deterministic under Callgrind. A threshold breach is a binding regression.
+- **informational**: engine/library/streaming-diluted. Reported, never gated (would false-positive on engine bumps).
+
+## Local A/B (walltime)
+
+Only walltime runs locally (no Valgrind on macOS arm64; instruction-count gating is Linux/CI-only, and walltime is
+noisy on sub-ms benches). Pin the data libs identically across both builds so the delta is pure binding:
+
+```bash
+for P in ../main/.venv-release/bin/python .venv-release/bin/python; do
+  $P -m pytest benchmarks/<module>.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+done
+```
+
+## Conventions
+
+- READ aggregates real columns (`sum(col)` / `sum(length(col))`), never `count(*)` (answered from metadata).
+- WRITE fully materializes the result or drains the lazy reader.
+- Warm once before measuring.
+- `con` fixture pins `threads=1` (see `conftest.py`).
+
+Two traps (a benchmark that skips these silently measures the wrong thing):
+
+- OUT-col null benches need REAL nulls (`CASE WHEN ... THEN NULL`), else the cheap `std::move` path is taken.
+- IN-numpy string benches need mixed ASCII + non-ASCII + a null sentinel, else the transcode/null ladder is skipped.
diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py
index b641662f..a4049aa9 100644
--- a/benchmarks/_scale.py
+++ b/benchmarks/_scale.py
@@ -1,17 +1,12 @@
-"""Env-gated row-count scaling for the benchmark suite (INFRA-4).
-
-Callgrind is 20-50x, and the O(rows) / per-row-object benchmarks at full N make the CI sweep too slow. `scaled(n)`
-shrinks those row counts ONLY when an explicit `BENCH_SCALE=<divisor>` env var is set (which the CI Callgrind
-sweep sets). Unset -> full N, so LOCAL walltime A/B keeps the large N unchanged.
-
-CRITICAL: a gate benchmark and the engine-control floor it is compared against (the FLOOR_MAP pairs in
-compare_baseline.py) share the same base N, so routing BOTH through `scaled()` keeps them at an identical scaled
-N -- the Option-B binding_fraction stays valid. Scaling ONLY reduces row counts; it must never change the data
-patterns the benchmarks depend on (real NULLs, mixed ASCII+non-ASCII+null, LIMIT-no-ORDER-BY, warm-before-measure).
-
-A floor keeps a scaled benchmark row-dominated (well above the range(2048) fixed-cost probes), so per-element
-work still dominates and the fraction/signal stay meaningful. The small-N `*_gate` probes are NOT routed through
-this (they are already fast and are the fixed-cost baseline).
+"""Env-gated row-count scaling for the benchmark suite.
+
+Callgrind is 20-50x slower, so the O(rows) benches at full N make the CI sweep too slow. `scaled(n)` shrinks row counts
+ONLY when `BENCH_SCALE=<divisor>` is set (which the CI sweep sets); unset -> full N, so local walltime A/B is
+unchanged. A gate bench and the engine floor it is compared against share a base N, so routing BOTH through
+`scaled()` keeps them at an identical scaled N and the binding fraction stays valid. Scaling reduces row counts
+only; it must never change the data patterns the benches depend on (real nulls, mixed ASCII + non-ASCII strings,
+LIMIT-no-ORDER-BY). A row-count floor keeps a scaled bench row-dominated so per-element work still dominates; the
+small-N `*_gate` probes are already fast and are NOT scaled.
 """
 
 from __future__ import annotations
diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py
index 85e96c43..ab9773f9 100644
--- a/benchmarks/compare_baseline.py
+++ b/benchmarks/compare_baseline.py
@@ -1,27 +1,17 @@
 #!/usr/bin/env python3
-"""Committed-baseline instruction-count comparison for the CodSpeed benchmark suite.
-
-WHY / HOW (grounded, verified on a Linux+valgrind box):
-  The suite runs under `valgrind --tool=callgrind` with pytest-codspeed. pytest-codspeed's hooks call
-  `callgrind_dump_stats_at(<uri>)` at the end of each benchmark, so callgrind writes ONE dump file per
-  benchmark, headed by `desc: Trigger: Client Request: <uri>` with the instruction count on the `totals:`
-  line (`events: Ir`). The hooks also obj-skip libpython, so counts are clean. NO CodSpeed account, token, or
-  runner binary is involved -- this parses the raw callgrind dumps directly.
-
-  Observed run-to-run noise on that box was ~0.1% (callgrind is near-deterministic, not bit-identical), so the
-  default gate threshold (5%) sits far above noise. PYTHONHASHSEED is pinned in CI to keep dict/struct paths
-  stable.
-
-TWO MODES:
-  regen   -- build benchmarks/baseline.json from a fresh valgrind run: per-benchmark instruction counts +
-             provenance meta + (for the mapped numeric-produce gates) the engine-diluted binding fraction, and
-             the Option-B auto-move of any gate below the cutoff to `informational`.
-  compare -- parse a fresh valgrind run, diff each benchmark against baseline.json, and print a report. GATE
-             benchmarks over their threshold are regressions; `informational` benchmarks are reported only.
-             REPORT-ONLY by default (always exit 0); `--enforce` exits non-zero on a gate regression.
-
-Both are CI-only in practice (no valgrind on macOS arm64). baseline.json and benchmarks/requirements-bench.txt
-are regenerated together (same job) so the counts always correspond to the frozen data-lib pins.
+"""Committed-baseline instruction-count comparison for the benchmark suite. See benchmarks/README.md.
+
+pytest-codspeed's hooks call `callgrind_dump_stats_at(<uri>)` per benchmark, so callgrind writes ONE dump each,
+headed by `desc: Trigger: Client Request: <uri>` with the count on `totals:` (`events: Ir`). This parses those
+raw dumps directly (no CodSpeed account/token/runner). Run-to-run noise is ~0.1%, so the 5% gate threshold sits
+far above it (PYTHONHASHSEED pinned in CI).
+
+Two modes (CI-only; no valgrind on macOS arm64):
+  regen:   write baseline.json from a fresh run: counts + provenance + binding fractions + auto-move.
+  compare: diff a fresh run against baseline.json. Gate benches over threshold are regressions; informational
+           are reported only. Report-only by default; `--enforce` exits non-zero on a gate regression.
+
+baseline.json and benchmarks/requirements-bench.txt are regenerated together so counts match the frozen pins.
 """
 
 from __future__ import annotations
@@ -37,14 +27,13 @@
 
 SCHEMA_VERSION = 1
 GATE_DEFAULT_THRESHOLD_PCT = 5.0
-BINDING_FRACTION_CUTOFF = 0.25  # Option-B: a gate whose isolable binding fraction is below this is auto-moved
-#                                 to informational (a threshold on its engine-diluted total is not meaningful).
-
-# Option-B floor map: the engine-control benchmark whose instruction count is the "engine floor" of a given
-# numeric-produce gate. binding_fraction = 1 - floor_Ir / bench_Ir. ONLY the numeric-produce benches are listed:
-# MEAS-1 showed their per-element binding is a bulk memcpy (~engine magnitude); every other gate (OUT-row fetch
-# of any type, string/nested/decimal/hugeint/uuid produce, UDFs, native ingest, analyzer bind) is high-binding
-# and needs no fraction. Add a mapping (and, if needed, an engine floor) here to evaluate more benches.
+BINDING_FRACTION_CUTOFF = 0.25  # a gate whose isolable binding fraction is below this is auto-moved to
+#                                 informational (a threshold on its engine-diluted total is not meaningful).
+
+# Floor map: the engine-control bench that is the "engine floor" of a numeric-produce gate.
+# binding_fraction = 1 - floor_Ir / bench_Ir. ONLY numeric-produce benches are listed (their per-element binding
+# is a bulk memcpy of ~engine magnitude); every other gate is high-binding and needs no fraction. Add a mapping
+# (and, if needed, a floor) to evaluate more benches.
 _E = "benchmarks/test_engine_control_perf.py"
 FLOOR_MAP = {
     "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k",
@@ -65,11 +54,7 @@
 
 
 def _normalize_uri(raw: str) -> str:
-    """Return a repo-relative benchmark key.
-
-    Inside a git repo pytest-codspeed already emits a git-relative uri (e.g. `benchmarks/x.py::test[p]`); this
-    defensively strips a leading absolute path if the run happened outside a git repo.
-    """
+    """Return a repo-relative benchmark key (strip a leading absolute path if the run was outside a git repo)."""
     raw = raw.strip()
     if "::" not in raw:
         return raw
@@ -83,8 +68,8 @@ def _normalize_uri(raw: str) -> str:
 def parse_profiles(profile_dir: Path) -> dict[str, int]:
     """Parse every callgrind dump in `profile_dir`; return {benchmark_uri: instruction_count}.
 
-    Only dumps whose Trigger is a benchmark Client Request (contains `::`) are kept; the metadata and
-    program-termination dumps are skipped. If a uri appears more than once (should not happen) the max is kept.
+    Keeps only dumps whose Trigger is a benchmark Client Request (contains `::`); skips metadata/termination
+    dumps. If a uri appears more than once (should not happen) the max is kept.
     """
     counts: dict[str, int] = {}
     files = sorted(profile_dir.rglob("*")) if profile_dir.exists() else []
@@ -236,9 +221,8 @@ def compare(args: argparse.Namespace) -> int:
                 "may not be pure binding. Regenerate the baseline with the current pins."
             )
 
-    # engine-bump guard: engine-inclusive counts shift when the bundled DuckDB submodule changes, for reasons
-    # unrelated to the binding. If the current submodule SHA differs from the baseline's, do not treat gate
-    # deltas as hard failures (they may reflect the engine bump); warn to regenerate the baseline.
+    # engine-bump guard: engine-inclusive counts shift when the DuckDB submodule changes. If the SHA differs from
+    # the baseline's, don't treat gate deltas as hard failures (they may reflect the bump); warn to regenerate.
     engine_changed = bool(
         args.submodule_sha and meta.get("duckdb_submodule_sha") and args.submodule_sha != meta["duckdb_submodule_sha"]
     )
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index b1ccd604..07cba4e4 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -1,22 +1,8 @@
-"""Shared fixtures + marker registration for the CodSpeed benchmark suite.
-
-Central home (INFRA-6) for the `con` fixture, the `threads=1` isolation default, and the gate/informational
-marker registration (INFRA-1). Markers are registered HERE (not via pyproject `markers=`) to keep the suite
-self-contained. Registration is REQUIRED: pyproject sets `filterwarnings = ["error"]`, so an unregistered
-mark would raise `PytestUnknownMarkWarning` as a collection error.
-
-Marker semantics
-  gate          Binding-dominated, GIL-held, deterministic under Callgrind (instruction-count). These are the
-                paths where a threshold breach means a *binding* regression. Gate-able. (Enforcement against a
-                committed baseline is a later phase; for now they run and report.)
-  informational Engine/parallel/IO/library-diluted, streaming drains, or arrow-export re-run paths. Reported,
-                never gated: their instruction count is dominated by non-binding work (engine aggregate, the
-                bundled DuckDB submodule, pyarrow/polars library code), so gating them would false-positive on
-                engine/submodule bumps rather than catch binding regressions.
-
-Every benchmark (a test using the `benchmark` fixture) must carry EXACTLY ONE of these markers so the two CI
-steps (`-m gate`, `-m informational`) together cover the suite with no overlap. Non-benchmark guards (e.g. the
-tracemalloc assertion in test_produce_numpy_perf.py) are intentionally left unmarked and run in neither step.
+"""Shared fixtures + marker registration for the benchmark suite. See benchmarks/README.md.
+
+Markers are registered here (not via pyproject `markers=`) because pyproject sets `filterwarnings = ["error"]`,
+so an unregistered mark would fail at collection. Every benchmark must carry EXACTLY ONE of `gate` /
+`informational` so the two CI steps (`-m gate`, `-m informational`) cover the suite with no overlap.
 """
 
 from __future__ import annotations
@@ -24,7 +10,7 @@
 from typing import TYPE_CHECKING
 
 import pytest
-from _scale import bench_scale, scaled  # noqa: F401  (re-exported here as the shared home; used by the modules)
+from _scale import bench_scale, scaled  # noqa: F401  (re-exported as the shared home; used by the modules)
 
 import duckdb
 
@@ -32,13 +18,6 @@
     from collections.abc import Iterator
 
 
-# ENV-GATED ROW COUNTS (INFRA-4): the O(rows) / per-row-object benchmarks route their N through `scaled()`
-# (benchmarks/_scale.py). Unset `BENCH_SCALE` -> full N (local walltime A/B is unchanged); the CI Callgrind
-# sweep sets `BENCH_SCALE=<divisor>` to shrink N so the sweep fits under the job timeout. A gate benchmark and
-# its engine-control floor (FLOOR_MAP in compare_baseline.py) share a base N, so both scale identically and the
-# Option-B binding fraction stays valid. Scaling changes ONLY row counts, never the Do-NOT-regress data patterns.
-
-
 def pytest_configure(config: pytest.Config) -> None:
     """Register the gate/informational markers (required under filterwarnings=error)."""
     config.addinivalue_line(
@@ -55,8 +34,8 @@ def pytest_configure(config: pytest.Config) -> None:
 def con() -> Iterator[duckdb.DuckDBPyConnection]:
     """Yield a fresh single-threaded connection, closed on teardown.
 
-    `threads=1` pins engine parallelism so per-run instruction counts and walltime do not shift with the CI
-    runner core count (INFRA-6). The concurrency module (COV-1, a later phase) overrides this deliberately.
+    `threads=1` pins engine parallelism so counts/walltime don't shift with the runner core count. The
+    concurrency module overrides this deliberately.
     """
     c = duckdb.connect(config={"threads": 1})
     yield c
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
index e230d715..8a9f49e6 100644
--- a/benchmarks/requirements-bench.txt
+++ b/benchmarks/requirements-bench.txt
@@ -1,15 +1,9 @@
-# Frozen, exact pins for the CodSpeed benchmark suite (.github/workflows/codspeed.yml).
-#
-# WHY a committed pin file (and NOT the gitignored uv.lock, and NOT a re-resolving `>=` group): CodSpeed
-# compares instruction counts across runs. If a data lib (numpy/pandas/pyarrow/polars) changed version between
-# the baseline run and a later run, that delta would be misattributed to the binding. These pins freeze the data
-# libs so the ONLY cross-run delta is the binding. Regenerate this file DELIBERATELY, together with the baseline.
-#
-# SOURCE OF TRUTH: the human-readable `[dependency-groups] bench` list in pyproject.toml. Regenerate with:
+# Frozen pins for the benchmark suite: freezing the data libs means the only cross-run delta is the binding.
+# Regenerate DELIBERATELY, together with the baseline. Source of truth: the `[dependency-groups] bench` list in
+# pyproject.toml (torch/tensorflow deliberately absent, local-only via importorskip). Regenerate with:
 #   uv pip compile pyproject.toml --group bench \
 #     --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \
 #     --no-annotate --no-header -o benchmarks/requirements-bench.txt
-# (py3.13 / linux-x86_64 is the CI target.) torch/tensorflow are deliberately absent (local-only via importorskip).
 iniconfig==2.3.0
 markdown-it-py==4.2.0
 mdurl==0.1.2
diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py
index 0fd8461f..de05f78e 100644
--- a/benchmarks/test_arrow_perf.py
+++ b/benchmarks/test_arrow_perf.py
@@ -1,14 +1,6 @@
-"""CodSpeed benchmark: Arrow read/write paths. Standalone, not in CI.
+"""Arrow read/write: Table + RecordBatchReader + dictionary sweep. See benchmarks/README.md.
 
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_arrow_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Data must be fully moved or nothing is measured: READ aggregates over real columns (sum/length, not count(*),
-which arrow answers from metadata); WRITE materializes the result (to_arrow_reader is lazy, so it is drained).
+READ aggregates over real columns (arrow answers count(*) from metadata); WRITE drains the lazy reader.
 """
 
 from __future__ import annotations
@@ -26,22 +18,18 @@
 
     import duckdb
 
-N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
-DICT_UNIQUE = [2, 1_000, 50_000]  # cardinality sweep: UNIQUE-value counts (not row counts) -> NOT scaled
+N = scaled(500_000)
+DICT_UNIQUE = [2, 1_000, 50_000]  # UNIQUE-value counts (cardinality sweep), not row counts -> NOT scaled
 WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
 WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
 
-# informational: every benchmark here is engine-parallel or library/streaming dominated -> reported, not gated.
-#   READ (sum over registered arrow) -> engine aggregate dominates; the near-zero-copy scan is a small fraction.
-#   WRITE to_arrow_table/to_arrow_reader/pl() -> PromoteMaterializedToArrow re-runs the query GIL-released
-#   (engine-parallel), and pl() also runs polars library code. Their counts would trip on engine/submodule
-#   bumps, not binding regressions. `con` fixture + threads=1 live in conftest.py.
+# informational: every bench here is engine-parallel or library/streaming dominated. READ = engine aggregate
+# dominates; WRITE (to_arrow/pl) re-runs the query GIL-released. Would trip on engine/submodule bumps, not binding.
 pytestmark = pytest.mark.informational
 
 
 @pytest.fixture(scope="module")
 def arrow_numeric() -> pa.Table:
-    """Return a two-column numeric arrow table."""
     return pa.table(
         {
             "a": pa.array(range(N), type=pa.int64()),
@@ -52,20 +40,17 @@ def arrow_numeric() -> pa.Table:
 
 @pytest.fixture(scope="module")
 def arrow_string() -> pa.Table:
-    """Return a single-column string arrow table."""
     return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())})
 
 
 @pytest.fixture(scope="module")
 def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]:
-    """Return the schema and record batches for the numeric table."""
     # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round
     return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)
 
 
 @pytest.fixture(scope="module")
 def arrow_dict_tables() -> dict[int, pa.Table]:
-    """Return dictionary-encoded arrow tables keyed by number of unique values (a cardinality sweep)."""
     # deterministic indices (i % U) so the instruction count is reproducible (no PRNG)
     tables = {}
     for u in DICT_UNIQUE:
@@ -75,38 +60,29 @@ def arrow_dict_tables() -> dict[int, pa.Table]:
     return tables
 
 
-# --------------------------------------------------------------------------- #
-# READ: arrow -> duckdb. The engine must scan every value (sum/length force it).
-# --------------------------------------------------------------------------- #
+# READ: arrow -> duckdb. sum/length force a full scan.
 
 
 def test_read_arrow_numeric(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table
 ) -> None:
-    """Benchmark scanning a numeric arrow table."""
     con.register("t_num", arrow_numeric)
-    con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())
 
 
 def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None:
-    """Benchmark scanning a string arrow table."""
     con.register("t_str", arrow_string)
-    con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())
 
 
-# ADDED: RecordBatchReader ingest -- the SAME PythonTableArrowArrayStreamFactory but STREAMING (distinct from
-# the materialized Table read above). A fresh reader is built per round (the engine drains it); sum() forces a
-# full scan of every value.
-
-
 def test_read_arrow_reader_numeric(
     benchmark: BenchmarkFixture,
     con: duckdb.DuckDBPyConnection,
     arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]],
 ) -> None:
-    """Benchmark scanning a streaming record-batch reader."""
+    # same factory as the Table read, but STREAMING: a fresh reader per round, drained by the engine
     schema, batches = arrow_numeric_batches
 
     def run() -> list:
@@ -118,43 +94,32 @@ def run() -> list:
     benchmark(run)
 
 
-# ADDED (COV-4): dictionary-encoded arrow ingest, cardinality sweep (unique in {2, 1k, high}). Mirrors core's
-# test_arrow_dictionaries_scan. The engine aggregate dominates (hence informational), but the per-value
-# dictionary DECODE in the arrow scan is the binding interest, and its cost slopes with the unique count.
-
-
 @pytest.mark.parametrize("unique", DICT_UNIQUE)
 def test_read_arrow_dictionary(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int
 ) -> None:
-    """Benchmark scanning a dictionary-encoded arrow column at a given cardinality."""
+    # per-value dictionary DECODE cost slopes with the unique count (mirrors core test_arrow_dictionaries_scan)
     con.register("t_dict", arrow_dict_tables[unique])
     con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall())
 
 
-# --------------------------------------------------------------------------- #
-# WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream.
-# --------------------------------------------------------------------------- #
+# WRITE: duckdb -> arrow, consumer fully materializes / drains the stream.
 
 
 def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a numeric result to an arrow table."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table())
 
 
 def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a string result to an arrow table."""
     benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table())
 
 
 def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark draining a lazy arrow record-batch reader."""
-
     def run() -> int:
         reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000)
         rows = 0
-        for batch in reader:  # drain the lazy stream so duckdb actually produces every batch
+        for batch in reader:  # drain the lazy stream so duckdb produces every batch
             rows += batch.num_rows
         return rows
 
@@ -162,10 +127,8 @@ def run() -> int:
 
 
 def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a numeric result to a polars frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).pl())
 
 
 def test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a string result to a polars frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).pl())
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
index 6e7af136..751c6cf5 100644
--- a/benchmarks/test_cardinality_perf.py
+++ b/benchmarks/test_cardinality_perf.py
@@ -1,16 +1,8 @@
-"""CodSpeed benchmark: the result-cardinality (rows-to-Python) sweep. Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_cardinality_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Sweeps `SELECT * FROM src LIMIT n` (no ORDER BY) over a pre-materialized 3-column source: a plain LIMIT
-early-stops the scan, so the per-row conversion dominates and the slope is monotone in n. A steeper slope on
-one build is a per-row conversion regression. n=100 is the overhead regime, n=100_000 is throughput.
-(An earlier ORDER BY version was dropped: the top-N sort swamped the signal.)
+"""Result-cardinality (rows-to-Python) sweep via LIMIT n, no ORDER BY. See benchmarks/README.md.
+
+`SELECT * FROM src LIMIT n` early-stops the scan, so per-row conversion dominates and the slope is monotone in n.
+A steeper slope on one build is a per-row conversion regression. n=100 is overhead, n=100_000 is throughput.
+(An ORDER BY version was dropped: the top-N sort swamped the signal.)
 """
 
 from __future__ import annotations
@@ -27,18 +19,15 @@
 
     from pytest_codspeed import BenchmarkFixture
 
-# env-gated (INFRA-4): scale the source rows AND the top-N of the sweep by the same factor, keeping the small-N
-# points fixed and SRC_ROWS >= max(LIMITS). Preserves the LIMIT-no-ORDER-BY early-stop pattern (Do-NOT-regress).
+# scale the source rows AND the top-N by the same factor, keeping small-N points fixed and SRC_ROWS >= max(LIMITS).
 SRC_ROWS = scaled(200_000)
 LIMITS = [100, 1_000, 10_000, scaled(100_000)]
 
 
 @pytest.fixture(scope="module")
 def con() -> Iterator[duckdb.DuckDBPyConnection]:
-    """Yield a connection over a once-materialized source table."""
-    # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be
-    # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows.
-    c = duckdb.connect(config={"threads": 1})  # pin engine parallelism (INFRA-6); module-scoped source table
+    # source materialized ONCE (module-scoped) and identical across the n sweep; per-test build would add noise
+    c = duckdb.connect(config={"threads": 1})
     c.execute(
         "CREATE TABLE src AS "
         f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)"
@@ -48,15 +37,12 @@ def con() -> Iterator[duckdb.DuckDBPyConnection]:
 
 
 def _query(n: int) -> str:
-    # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the
-    # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort).
     return f"SELECT a, b, s FROM src LIMIT {n}"
 
 
-@pytest.mark.gate  # fetchall materializes n rows to Python -> binding-dominated; small-n end is the noise-free gate
+@pytest.mark.gate  # fetchall materializes n rows -> binding-dominated; small-n end is the noise-free gate
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
-    """Benchmark fetchall over a LIMIT n sweep."""
     q = _query(n)
     con.execute(q).fetchall()  # warm
     benchmark(lambda: con.execute(q).fetchall())
@@ -65,7 +51,6 @@ def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect
 @pytest.mark.gate  # df() materializes n rows to numpy columns -> binding-dominated
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
-    """Benchmark df() over a LIMIT n sweep."""
     q = _query(n)
     con.sql(q).df()  # warm
     benchmark(lambda: con.sql(q).df())
@@ -74,7 +59,6 @@ def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n
 @pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated
 @pytest.mark.parametrize("n", LIMITS)
 def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
-    """Benchmark to_arrow_table() over a LIMIT n sweep."""
     q = _query(n)
     con.sql(q).to_arrow_table()  # warm
     benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py
index 8be28619..c55b0274 100644
--- a/benchmarks/test_concurrency_perf.py
+++ b/benchmarks/test_concurrency_perf.py
@@ -1,30 +1,16 @@
-"""CodSpeed benchmark: concurrency / GIL pressure (COV-1). informational / WALLTIME. Standalone, not gated.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_concurrency_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-This is the ONE dimension the rest of the suite (single-threaded) cannot see: Python objects threading through
-PARALLEL core execution. It varies `SET threads` and measures the binding under parallel scan / parallel UDF
-invocation. All benchmarks are `informational` and their PRIMARY signal is LOCAL WALLTIME:
-  * scan benches           -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup.
-  * native UDF             -> ~flat scaling = the GIL tax on per-row Python calls (the engine scan is parallel
-                              but the GIL serializes the calls).
-  * arrow (vectorized) UDF -> observed NEGATIVE scaling (slower with more threads): per-chunk convert + GIL
-                              contention. A regression here would deepen the negative slope.
-
-Under the CI `-m informational` step these run in `simulation` (Callgrind), which SERIALIZES threads -- so the
-wall-clock contention is NOT visible there; instead the deterministic instruction count captures the per-batch
-Produce GIL calls and the UDF dispatch overhead. Never gated either way.
-
-GOTCHA (verified locally, mirrors the suite's other "measure the right thing" traps): a SINGLE-BATCH arrow table
-does NOT parallelize (one batch = one serial scan unit; flat across threads). The arrow scan bench MUST use a
-MULTI-BATCH table (`from_batches` with a modest chunksize) or it silently measures a serial scan. A CPU-heavy
-aggregate is also required: a cheap sum is memory-bandwidth-bound and will not parallelize, so there is nothing
-to contend on.
+"""Concurrency / GIL pressure across thread counts. Walltime-only, never gated. See benchmarks/README.md.
+
+The ONE dimension the single-threaded rest of the suite cannot see: Python objects threading through PARALLEL
+core execution. Primary signal is LOCAL WALLTIME:
+  * scan benches  -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup.
+  * native UDF    -> ~flat scaling = the GIL tax on per-row Python calls.
+  * arrow UDF     -> observed NEGATIVE scaling (per-chunk convert + GIL contention).
+
+Under CI Callgrind threads are serialized, so wall-clock contention is invisible there; the deterministic count
+still captures per-batch Produce GIL calls + UDF dispatch. Never gated either way.
+
+GOTCHA: a SINGLE-BATCH arrow table does NOT parallelize (one batch = one serial scan unit). The arrow scan bench
+MUST use a MULTI-BATCH table AND a CPU-heavy aggregate (a cheap sum is bandwidth-bound and won't parallelize).
 """
 
 from __future__ import annotations
@@ -44,23 +30,21 @@
 import numpy as np  # noqa: E402  (after importorskip, matching the suite convention)
 import pandas as pd  # noqa: E402
 
-# informational: concurrency benchmarks are never gated (walltime-noisy; under Callgrind, thread-serialized).
 pytestmark = pytest.mark.informational
 
 N_SCAN = 1_000_000
-BATCH = 20_000  # -> 50 record batches; MULTI-BATCH is required for the arrow scan to parallelize (see GOTCHA)
+BATCH = 20_000  # -> 50 record batches; MULTI-BATCH required for the arrow scan to parallelize (see GOTCHA)
 N_UDF_NATIVE = 200_000  # native UDF = one Python call per row; keep modest (Callgrind instruments every call)
 N_UDF_ARROW = 1_000_000  # arrow UDF = one call per chunk (vectorized)
 THREADS = [1, 4, 8]
 
-# CPU-heavy aggregate so the parallel scan actually engages worker threads (a cheap sum is bandwidth-bound and
-# would not parallelize -> no contention to measure). The binding signal is the per-batch Produce GIL handoff.
+# CPU-heavy aggregate so the parallel scan engages worker threads. The binding signal is the per-batch Produce
+# GIL handoff.
 HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)"
 
 
 @pytest.fixture(scope="module")
 def arrow_multibatch() -> pa.Table:
-    """Return a MULTI-batch arrow table (single-batch would scan serially -- see module GOTCHA)."""
     a = pa.array(np.arange(N_SCAN), type=pa.int64())
     b = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64())
     return pa.Table.from_batches(pa.table({"a": a, "b": b}).to_batches(max_chunksize=BATCH))
@@ -68,19 +52,15 @@ def arrow_multibatch() -> pa.Table:
 
 @pytest.fixture(scope="module")
 def pandas_frame() -> pd.DataFrame:
-    """Return a numpy-backed pandas frame (its scan parallelizes across worker threads)."""
     return pd.DataFrame({"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5})
 
 
-# --------------------------------------------------------------------------- #
-# Parallel SCAN: Python objects (arrow batches / pandas chunks) pulled through the binding by engine worker
-# threads under a CPU-heavy aggregate. The scan Produce acquires/releases the GIL per batch across threads.
-# --------------------------------------------------------------------------- #
+# Parallel SCAN: arrow batches / pandas chunks pulled through the binding by engine worker threads; the scan
+# Produce acquires/releases the GIL per batch across threads.
 
 
 @pytest.mark.parametrize("threads", THREADS)
 def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None:
-    """Benchmark a parallel aggregate pulling arrow batches across threads."""
     con = duckdb.connect(config={"threads": threads})
     try:
         con.register("t", arrow_multibatch)
@@ -93,7 +73,6 @@ def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.T
 
 @pytest.mark.parametrize("threads", THREADS)
 def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None:
-    """Benchmark a parallel aggregate pulling pandas chunks across threads."""
     con = duckdb.connect(config={"threads": threads})
     try:
         con.register("t", pandas_frame)
@@ -104,15 +83,12 @@ def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.Data
         con.close()
 
 
-# --------------------------------------------------------------------------- #
 # Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF
-# from multiple worker threads. Native = per-row Python call under the GIL (GIL tax); arrow = per-chunk convert.
-# --------------------------------------------------------------------------- #
+# from multiple worker threads. Native = per-row call under the GIL (GIL tax); arrow = per-chunk convert.
 
 
 @pytest.mark.parametrize("threads", THREADS)
 def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
-    """Benchmark a native Python UDF invoked from parallel worker threads (GIL tax)."""
     con = duckdb.connect(config={"threads": threads})
     try:
         con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)")  # materialized -> parallel scan
@@ -125,7 +101,6 @@ def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
 
 @pytest.mark.parametrize("threads", THREADS)
 def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
-    """Benchmark a vectorized arrow UDF invoked from parallel worker threads."""
     con = duckdb.connect(config={"threads": threads})
     try:
         con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)")  # materialized -> parallel scan
diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py
index febd7ba4..faee4de4 100644
--- a/benchmarks/test_engine_control_perf.py
+++ b/benchmarks/test_engine_control_perf.py
@@ -1,17 +1,8 @@
-"""CodSpeed benchmark: pure-ENGINE control (no Python egress). Standalone, not in CI's binding gate.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_engine_control_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-These `SELECT sum(...) FROM range(N)` queries aggregate to a single scalar, so the fetchall of the result is
-negligible: they measure SQL compile + the engine aggregate with (almost) ZERO per-row Python egress. They are
-the "engine floor" reference for MEAS-1: comparing a produce/fetch/ingest benchmark against the matching-N floor
-here quantifies how much of that benchmark's cost is the binding vs the engine. They are `informational` (they
-measure the engine, not the binding, so they must never gate).
+"""Pure-engine floor (no Python egress): the binding-fraction reference. See benchmarks/README.md.
+
+`SELECT sum(...) FROM range(N)` aggregates to one scalar, so the fetch is negligible: these measure SQL compile +
+the engine aggregate with ~zero per-row egress. Comparing a produce/fetch bench against the matching-N floor here
+quantifies how much of its cost is binding vs engine. Informational (they measure the engine), never gated.
 """
 
 from __future__ import annotations
@@ -26,19 +17,14 @@
 
     import duckdb
 
-# informational: pure-engine reference, never gated. `con` fixture + threads=1 live in conftest.py.
 pytestmark = pytest.mark.informational
 
-# Matched to the N of the fetch/produce/ingest/udf benchmarks so the floors line up for MEAS-1 subtraction and,
-# at baseline regen, for the Option-B binding-fraction of the numeric-produce gates (see compare_baseline.py).
-# CRITICAL: these floors go through scaled() with the SAME base N as the benchmarks they floor, so under
-# BENCH_SCALE the floor and its benchmark stay at an identical N and the fraction stays valid. The 2048 small-N
-# floor is NOT scaled (it is the fixed-cost baseline for the *_gate probes).
-Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)"  # small-N gate floor (compile-dominated), NOT scaled
+# N matched to the benches these floor, and routed through scaled() with the SAME base N, so the floor and its
+# bench stay at an identical scaled N and the binding fraction stays valid. The 2048 small-N floor is NOT scaled.
+Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)"  # small-N gate floor (compile-dominated)
 Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)"  # types-matrix numeric-df floor
 Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)"  # fetch / native-UDF floor
-# produce/ingest floor
-Q_2C_500K = (
+Q_2C_500K = (  # produce/ingest floor
     f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))"
 )
 
@@ -49,20 +35,16 @@ def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: s
 
 
 def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Engine floor: compile + sum over range(2048), no egress."""
     _bench(benchmark, con, Q_1C_SMALL)
 
 
 def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Engine floor: compile + sum over range(100k), no egress."""
     _bench(benchmark, con, Q_1C_100K)
 
 
 def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Engine floor: compile + sum over range(200k), no egress."""
     _bench(benchmark, con, Q_1C_200K)
 
 
 def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Engine floor: compile + 2-col sum over range(500k), no egress."""
     _bench(benchmark, con, Q_2C_500K)
diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py
index 9820db6d..1aa5f4fe 100644
--- a/benchmarks/test_fetch_perf.py
+++ b/benchmarks/test_fetch_perf.py
@@ -1,15 +1,4 @@
-"""CodSpeed benchmark: row fetch paths (fetchall, fetchone iteration, expression construction). Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_fetch_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Only walltime works locally (no Valgrind on macOS arm64); the deterministic instruction-count mode needs Linux (CI).
-Walltime is noisy on sub-ms benchmarks.
-"""
+"""OUT-row fetch: fetchall, fetchone/fetchmany loops, wide/expensive scalar types. See benchmarks/README.md."""
 
 from __future__ import annotations
 
@@ -23,14 +12,12 @@
 
     import duckdb
 
-# gate: OUT-row fetch fully materializes every row to Python -> binding-dominated, GIL-held; the engine side is
-# a cheap range() scan. Deterministic under Callgrind -> instruction-count gate-able. (The small-N *_gate tests
-# are the compile+fetch fixed-cost variants; see MEAS-1.) The `con` fixture + threads=1 live in conftest.py.
+# gate: OUT-row fetch materializes every row to Python (binding-dominated); the range() scan is cheap.
 pytestmark = pytest.mark.gate
 
-# env-gated row counts (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. The 2048
-# small-N *_gate probes are intentionally NOT scaled (they are the compile+fetch fixed-cost baseline).
-N_ROW = scaled(200_000)  # per-row-object numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128)
+# scaled() shrinks N under BENCH_SCALE in the CI sweep; full N locally. The range(2048) *_gate probes are the
+# compile+fetch fixed-cost baseline and are deliberately NOT scaled.
+N_ROW = scaled(200_000)  # numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128)
 N_STR = scaled(100_000)  # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops
 N_NEST = scaled(50_000)  # heterogeneous scalar/list/struct row
 
@@ -41,32 +28,26 @@ def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection,
 
 
 def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a single BIGINT column."""
     _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a single INTEGER column."""
     _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a single DOUBLE column."""
     _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of two BIGINT columns."""
     _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a single VARCHAR column."""
     _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)")
 
 
 def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a mixed scalar/list/struct row."""
     query = (
         "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, "
         f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)"
@@ -75,7 +56,6 @@ def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect
 
 
 def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark iterating a result one row at a time with fetchone."""
     query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
 
     def run() -> None:
@@ -86,52 +66,40 @@ def run() -> None:
     benchmark(run)
 
 
-# --------------------------------------------------------------------------- #
-# small-N COMPILE+FETCH FIXED-COST variants: at range(2048) the measured region is dominated by SQL front-end
-# compilation + the engine, NOT fetch. MEAS-1 walltime split (vs the range(2048) engine floor in
-# test_engine_control_perf.py): ~40% fetch fixed-cost, ~60% compile+engine. They still catch a fixed-cost
-# regression, but they are compile+fetch fixed-cost gates, not pure-fetch gates. Plus expensive scalar OUT-row
-# types (timestamptz pytz-per-row, blob, null-heavy), a heterogeneous per-cell-dispatch row
-# (hugeint+uuid+decimal128+varchar, distinct from the homogeneous columns), and the batched fetchmany loop.
-# --------------------------------------------------------------------------- #
+# small-N *_gate variants: at range(2048) the measured region is ~60% SQL compile + engine, ~40% fetch, so these
+# catch a fixed-cost regression (not a pure per-row one). Plus expensive scalar types (timestamptz pytz-per-row,
+# blob, null-heavy), a heterogeneous per-cell-dispatch row, and the batched fetchmany loop.
 
 
 def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the small-N BIGINT compile+fetch fixed-cost (MEAS-1: ~60% compile+engine, ~40% fetch)."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)")
 
 
 def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the small-N two-BIGINT compile+fetch fixed-cost."""
     _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)")
 
 
 def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a half-NULL BIGINT column."""
     _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a TIMESTAMPTZ column."""
     _bench_fetchall(
         benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)"
     )
 
 
 def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a 128-bit DECIMAL column."""
     _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)")
 
 
 def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a BLOB column."""
     _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)")
 
 
 def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchall of a heterogeneous wide-type row."""
-    # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile
-    # from the homogeneous single-type columns above)
+    # heterogeneous row: per-cell type dispatch in the Fetchone loop (distinct branch/cache profile from the
+    # homogeneous single-type columns above)
     query = (
         "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, "
         f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)"
@@ -140,7 +108,6 @@ def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyCo
 
 
 def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark draining a result with batched fetchmany."""
     query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
 
     def run() -> None:
diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py
index c54ddba7..3478ea1c 100644
--- a/benchmarks/test_ingest_native_perf.py
+++ b/benchmarks/test_ingest_native_perf.py
@@ -1,15 +1,7 @@
-"""CodSpeed benchmark: native Python-object ingest (list/tuple/dict -> duckdb). Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_ingest_native_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
+"""Native Python-object ingest: values() list/tuple/dict, executemany. See benchmarks/README.md.
 
 Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one
 list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells.
-executemany writes to a real table (CREATE OR REPLACE each round so it doesn't grow across repeats).
 """
 
 from __future__ import annotations
@@ -24,49 +16,39 @@
 
     import duckdb
 
-# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep.
 EXECMANY_N = scaled(20_000)  # executemany re-binds + executes per row, keep moderate
 WIDE_N = scaled(10_000)  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
 
-# gate: native ingest eagerly transforms every cell (TransformPythonValue) / re-binds per row (executemany);
-# the engine side (a trivial INSERT or a 1-row-wide fetchall drain) is negligible -> binding-dominated, GIL-held,
-# deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py.
+# gate: native ingest eagerly transforms every cell / re-binds per row; the engine side is negligible.
 pytestmark = pytest.mark.gate
 
 
 @pytest.fixture(scope="module")
 def rows_3col() -> list[tuple[int, float, str]]:
-    """Return parameter rows for a 3-column executemany."""
     return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]
 
 
 @pytest.fixture(scope="module")
 def scalars_wide() -> list[int]:
-    """Return a wide row of scalar ints for values()."""
     return list(range(WIDE_N))
 
 
 @pytest.fixture(scope="module")
 def tuples_wide() -> list[tuple[int, int, int]]:
-    """Return a wide row of tuples for values()."""
     return [(i, i + 1, i + 2) for i in range(WIDE_N)]
 
 
 @pytest.fixture(scope="module")
 def dicts_wide() -> list[dict[str, int | str]]:
-    """Return a wide row of dicts for values()."""
     return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]
 
 
-# --------------------------------------------------------------------------- #
-# executemany: bind + execute one parameter set per row, into a real table.
-# --------------------------------------------------------------------------- #
+# executemany: bind + execute one parameter set per row, into a real table (CREATE OR REPLACE so it doesn't grow).
 
 
 def test_ingest_executemany_3col(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]]
 ) -> None:
-    """Benchmark executemany INSERT of 3-column rows."""
     con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
     con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm
 
@@ -77,15 +59,12 @@ def run() -> None:
     benchmark(run)
 
 
-# --------------------------------------------------------------------------- #
 # values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip.
-# --------------------------------------------------------------------------- #
 
 
 def test_ingest_values_scalars(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int]
 ) -> None:
-    """Benchmark values() over a wide row of scalars."""
     con.values(scalars_wide).fetchall()  # warm
     benchmark(lambda: con.values(scalars_wide).fetchall())
 
@@ -93,7 +72,6 @@ def test_ingest_values_scalars(
 def test_ingest_values_tuples(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]]
 ) -> None:
-    """Benchmark values() over a wide row of tuples."""
     # each tuple cell -> LIST value (TransformPythonValue recursion)
     con.values(tuples_wide).fetchall()  # warm
     benchmark(lambda: con.values(tuples_wide).fetchall())
@@ -102,7 +80,6 @@ def test_ingest_values_tuples(
 def test_ingest_values_dicts(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]]
 ) -> None:
-    """Benchmark values() over a wide row of dicts."""
     # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion)
     con.values(dicts_wide).fetchall()  # warm
     benchmark(lambda: con.values(dicts_wide).fetchall())
diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py
index 73b99d0d..61244d2c 100644
--- a/benchmarks/test_ingest_numpy_perf.py
+++ b/benchmarks/test_ingest_numpy_perf.py
@@ -1,16 +1,7 @@
-"""CodSpeed benchmark: numpy ingest paths (numpy / numpy-backed pandas -> duckdb). Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_ingest_numpy_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Covers the object-string scan (ASCII zero-copy vs transcode ladder), the NaN->NULL float loop, the masked
-scan, and analyzer bind. Gotchas: the object-string benchmark MUST mix ASCII + non-ASCII + a null or it misses
-the ladder; analyzer bind is the one place count(*) is correct (cost is at bind, not scan) while every other
-READ aggregates over real columns.
+"""numpy ingest: object-string scan, NaN-to-NULL, masked scan, analyzer bind. See benchmarks/README.md.
+
+Gotchas: the object-string bench MUST mix ASCII + non-ASCII + a null or it misses the transcode ladder (see
+README traps); analyzer bind is the one place count(*) is correct (cost is at bind, not scan).
 """
 
 from __future__ import annotations
@@ -28,43 +19,37 @@
 
     import duckdb
 
-# env-gated (INFRA-4): scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below.
+# scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below.
 N = scaled(500_000)
 ANALYZER_N = scaled(200_000)
 
-# Registered explicitly via con.register (MEAS-3) rather than resolved by replacement-scan frame inspection.
 NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}
 
-# Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only).
+# mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only)
 _MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None]
 _MIXED_STRINGS = [None if _MIXED[i % 5] is None else f"{_MIXED[i % 5]}{i}" for i in range(N)]
 
-# Mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind.
+# mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind
 _MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)]
 
-
-# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()`/`sum(length())` over a registered
-# frame) are engine-aggregate dominated -> informational. The analyzer BIND (count(*), no scan) is a pure
-# per-bind binding cost -> gate.
+# READ (sum over a registered frame) is engine-aggregate dominated -> informational. The analyzer BIND (count(*),
+# no scan) is a pure per-bind binding cost -> gate.
 
 
 @pytest.fixture(scope="module")
 def df_double_with_nan() -> pd.DataFrame:
-    """Return a numpy-backed double frame with real NaNs."""
     a = np.arange(N, dtype="float64") * 1.5
-    a[::10] = np.nan  # real NaNs -> NaN->NULL conversion loop
+    a[::10] = np.nan  # real NaNs -> NaN-to-NULL conversion loop
     return pd.DataFrame({"a": a})
 
 
 @pytest.fixture(scope="module")
 def df_object_string_mixed() -> pd.DataFrame:
-    """Return an object-string frame mixing ASCII, non-ASCII, and nulls."""
     return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)})
 
 
 @pytest.fixture(scope="module")
 def df_masked_int() -> pd.DataFrame:
-    """Return a nullable-Int64 frame that scans masked."""
     # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask
     arr = pd.array(np.arange(N), dtype="Int64")
     arr[::10] = pd.NA
@@ -73,20 +58,15 @@ def df_masked_int() -> pd.DataFrame:
 
 @pytest.fixture(scope="module")
 def df_object_mixed_types() -> pd.DataFrame:
-    """Return an object frame of mixed python types for analyzer bind."""
     return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)})
 
 
-# --------------------------------------------------------------------------- #
-# READ: numpy -> duckdb. Engine scans every value (sum/length force it).
-# --------------------------------------------------------------------------- #
+# READ: numpy -> duckdb. sum/length force a full scan.
 
 
 @pytest.mark.informational
 def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark scanning a registered numpy dict-of-arrays."""
-    # MEAS-3: register explicitly (not frame-inspection replacement scan) and warm the query before measuring.
-    con.register("npdict", NPDICT)
+    con.register("npdict", NPDICT)  # register explicitly, not via replacement-scan frame inspection
     con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall())
 
@@ -95,9 +75,8 @@ def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDB
 def test_read_numpy_double_with_nan(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame
 ) -> None:
-    """Benchmark scanning a numpy double column with NaNs."""
     con.register("t", df_double_with_nan)
-    con.execute("SELECT sum(a) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
@@ -105,9 +84,8 @@ def test_read_numpy_double_with_nan(
 def test_read_numpy_masked_int(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame
 ) -> None:
-    """Benchmark scanning a masked nullable-int column."""
     con.register("t", df_masked_int)
-    con.execute("SELECT sum(a) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())
 
 
@@ -115,23 +93,19 @@ def test_read_numpy_masked_int(
 def test_read_numpy_object_string_mixed(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame
 ) -> None:
-    """Benchmark scanning a mixed object-string column."""
     con.register("t", df_object_string_mixed)
-    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
-# --------------------------------------------------------------------------- #
-# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY -- the cost is at bind, not scan, so we
-# must NOT force a scan (that would drown the per-bind analyzer signal). Re-binds the object column each call.
-# --------------------------------------------------------------------------- #
+# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY: the cost is at bind, so forcing a scan would
+# drown the per-bind signal. Re-binds the object column each call.
 
 
-@pytest.mark.gate  # count(*) forces no scan -> the measured cost is the PandasAnalyzer per-bind sampling (binding)
+@pytest.mark.gate
 def test_bind_analyzer_object(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame
 ) -> None:
-    """Benchmark the analyzer bind of a mixed-type object column."""
     con.register("t", df_object_mixed_types)
     con.execute("SELECT count(*) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall())
diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py
index 168f1a3d..4edc78dc 100644
--- a/benchmarks/test_pandas_perf.py
+++ b/benchmarks/test_pandas_perf.py
@@ -1,15 +1,6 @@
-"""CodSpeed benchmark: pandas read/write, numpy-backed vs arrow-backed DataFrames. Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_pandas_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-The binding path depends on column backing: numpy-backed columns take the NumpyArray scan path, arrow-backed
-(pandas ArrowDtype) take the near-zero-copy arrow path. Full consume: READ aggregates over real columns (not
-count(*)), WRITE materializes the whole frame.
+"""pandas read/write, numpy-backed vs arrow-backed frames. See benchmarks/README.md.
+
+Column backing selects the path: numpy-backed -> NumpyArray scan; arrow-backed (ArrowDtype) -> near-zero-copy arrow.
 """
 
 from __future__ import annotations
@@ -28,32 +19,28 @@
 
     import duckdb
 
-N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+N = scaled(500_000)
 WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
 WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
 _STRINGS = [f"str_value_{i}" for i in range(N)]
 
-# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()` over a registered frame) are
-# engine-aggregate dominated -> informational. Only the NUMPY-backed df() WRITE is binding-dominated -> gate.
-# The arrow-backed WRITE goes through to_arrow_table().to_pandas() (pyarrow library code, MEAS-2) -> informational.
+# READ (sum over a registered frame) is engine-aggregate dominated -> informational. Only the NUMPY-backed df()
+# WRITE is binding-dominated -> gate; the arrow-backed WRITE goes through pyarrow's to_pandas -> informational.
 
 
 @pytest.fixture(scope="module")
 def df_numpy_numeric() -> pd.DataFrame:
-    """Return a numpy-backed numeric frame."""
     return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5})
 
 
 @pytest.fixture(scope="module")
 def df_numpy_string() -> pd.DataFrame:
-    """Return a numpy-backed object-string frame."""
-    # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path)
+    # explicit object dtype -> the reworked numpy-backed object-string / analyzer path
     return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)})
 
 
 @pytest.fixture(scope="module")
 def df_arrow_numeric() -> pd.DataFrame:
-    """Return an arrow-backed numeric frame."""
     return pd.DataFrame(
         {
             "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())),
@@ -64,22 +51,18 @@ def df_arrow_numeric() -> pd.DataFrame:
 
 @pytest.fixture(scope="module")
 def df_arrow_string() -> pd.DataFrame:
-    """Return an arrow-backed string frame."""
     return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))})
 
 
-# --------------------------------------------------------------------------- #
-# READ: pandas -> duckdb. Engine scans every value (sum/length force it).
-# --------------------------------------------------------------------------- #
+# READ: pandas -> duckdb. sum/length force a full scan.
 
 
 @pytest.mark.informational
 def test_read_pandas_numpy_numeric(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame
 ) -> None:
-    """Benchmark scanning a numpy-backed numeric frame."""
     con.register("t", df_numpy_numeric)
-    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
@@ -87,9 +70,8 @@ def test_read_pandas_numpy_numeric(
 def test_read_pandas_numpy_string(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame
 ) -> None:
-    """Benchmark scanning a numpy-backed string frame."""
     con.register("t", df_numpy_string)
-    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
@@ -97,9 +79,8 @@ def test_read_pandas_numpy_string(
 def test_read_pandas_arrow_numeric(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame
 ) -> None:
-    """Benchmark scanning an arrow-backed numeric frame."""
     con.register("t", df_arrow_numeric)
-    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
 
 
@@ -107,39 +88,28 @@ def test_read_pandas_arrow_numeric(
 def test_read_pandas_arrow_string(
     benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame
 ) -> None:
-    """Benchmark scanning an arrow-backed string frame."""
     con.register("t", df_arrow_string)
-    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm (MEAS-3)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
     benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
 
 
-# --------------------------------------------------------------------------- #
-# WRITE: duckdb -> pandas. df() is NUMPY-backed (the reworked production path);
-# the arrow-backed frame goes via duckdb-arrow + pyarrow.to_pandas(ArrowDtype).
-# Both eagerly materialize the whole DataFrame.
-# --------------------------------------------------------------------------- #
+# WRITE: duckdb -> pandas. df() is the reworked numpy-backed path; the arrow-backed frame goes via
+# duckdb-arrow + pyarrow.to_pandas(ArrowDtype). Both eagerly materialize the whole frame.
 
 
 @pytest.mark.gate
 def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a numeric result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).df())
 
 
 @pytest.mark.gate
 def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a string result to a numpy-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).df())
 
 
-# ADDED: the numpy-backed df() WRITE with REAL nulls -> the masked_array build + masked->pd.NA rewrite that the
-# cutover reworked (a no-null column takes the cheap std::move path and would measure the wrong thing), plus a
-# datetime column (TimestampConvert + ConvertDateTimeTypes).
-
-
 @pytest.mark.gate
 def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a null-heavy numeric result to a numpy-backed frame."""
+    # REAL nulls -> the masked_array build + masked-to-pd.NA rewrite the cutover reworked (see README traps)
     q = (
         "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
         f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
@@ -149,18 +119,15 @@ def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con:
 
 @pytest.mark.gate
 def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a timestamp result to a numpy-backed frame."""
     q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
     benchmark(lambda: con.sql(q).df())
 
 
-@pytest.mark.informational  # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2)
+@pytest.mark.informational  # to_pandas() half is pyarrow library code
 def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a numeric result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
 
 
-@pytest.mark.informational  # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2)
+@pytest.mark.informational  # to_pandas() half is pyarrow library code
 def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark materializing a string result to an arrow-backed frame."""
     benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
index eb54f91c..faa053ac 100644
--- a/benchmarks/test_produce_numpy_perf.py
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -1,15 +1,6 @@
-"""CodSpeed benchmark: columnar produce paths (df(), fetchnumpy(), fetch_df_chunk()). Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_produce_numpy_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Covers the with-NULLS branch (masked_array build), datetime, and wide-internal types (hugeint/uuid/decimal128).
-Gotcha: NULL benchmarks use real DuckDB nulls (CASE WHEN); a no-null column takes the cheap path and measures
-the wrong thing. Full consume: df()/fetchnumpy() materialize the columns; fetch_df_chunk is drained in a loop.
+"""Columnar produce: df(), fetchnumpy(), fetch_df_chunk(), per type, null vs no-null. See benchmarks/README.md.
+
+Covers the with-NULLS masked_array branch, datetime, and wide-internal types (hugeint/uuid/decimal128).
 """
 
 from __future__ import annotations
@@ -23,12 +14,12 @@
 from _scale import scaled
 
 import duckdb
-import numpy as np  # noqa: F401  (pinned identically A/B; imported so the env matches the other modules)
+import numpy as np  # noqa: F401  (pinned identically A/B so the env matches the other modules)
 
 if TYPE_CHECKING:
     from pytest_codspeed import BenchmarkFixture
 
-N = scaled(500_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+N = scaled(500_000)
 TYPE_N = scaled(200_000)  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
 
 Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
@@ -43,8 +34,7 @@
 Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)"
 
 
-# gate: df()/fetchnumpy() fully materialize numpy-backed columns -> binding-dominated (ArrayWrapper fill),
-# GIL-held, deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py.
+# gate: df()/fetchnumpy() fully materialize numpy-backed columns (ArrayWrapper fill, binding-dominated).
 def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
     con.sql(query).df()  # warm
     benchmark(lambda: con.sql(query).df())
@@ -55,80 +45,59 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu
     benchmark(lambda: con.sql(query).fetchnumpy())
 
 
-# --------------------------------------------------------------------------- #
-# df(): the production NUMPY-backed columnar path. no-null vs REAL-null vs string vs timestamp.
-# --------------------------------------------------------------------------- #
+# df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types.
 
 
 @pytest.mark.gate
 def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a numeric result."""
     _bench_df(benchmark, con, Q_NUM)
 
 
 @pytest.mark.gate
 def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a null-heavy numeric result."""
-    # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch)
-    _bench_df(benchmark, con, Q_NUM_NULLS)
+    _bench_df(benchmark, con, Q_NUM_NULLS)  # REAL nulls -> masked_array branch (see README traps)
 
 
 @pytest.mark.gate
 def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a string result."""
     _bench_df(benchmark, con, Q_STR)
 
 
 @pytest.mark.gate
 def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a timestamp result."""
     _bench_df(benchmark, con, Q_TS)
 
 
 @pytest.mark.gate
 def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a hugeint result."""
     _bench_df(benchmark, con, Q_HUGEINT)
 
 
 @pytest.mark.gate
 def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a uuid result."""
     _bench_df(benchmark, con, Q_UUID)
 
 
 @pytest.mark.gate
 def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark df() of a 128-bit decimal result."""
     _bench_df(benchmark, con, Q_DEC128)
 
 
-# --------------------------------------------------------------------------- #
-# fetchnumpy(): same FetchNumpyInternal without the DataFrame wrap.
-# --------------------------------------------------------------------------- #
+# fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap.
 
 
 @pytest.mark.gate
 def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchnumpy() of a numeric result."""
     _bench_numpy(benchmark, con, Q_NUM)
 
 
 @pytest.mark.gate
 def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark fetchnumpy() of a null-heavy numeric result."""
     _bench_numpy(benchmark, con, Q_NUM_NULLS)
 
 
-# --------------------------------------------------------------------------- #
-# fetch_df_chunk(): per-chunk DataFrame production, drained in a loop.
-# --------------------------------------------------------------------------- #
-
-
-@pytest.mark.informational  # per-chunk streaming drain (GIL-per-chunk) -> walltime-informational, not gated
+@pytest.mark.informational  # per-chunk streaming drain (GIL-per-chunk), not gated
 def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark draining a result with fetch_df_chunk()."""
-
     def run() -> int:
         rel = con.sql(Q_NUM)
         rows = 0
@@ -143,39 +112,23 @@ def run() -> int:
     benchmark(run)
 
 
-# --------------------------------------------------------------------------- #
-# torch(): FetchNumpyInternal + per-column from_numpy. SKIPPED cleanly if torch is absent (identical A/B).
-# --------------------------------------------------------------------------- #
-
-
-@pytest.mark.informational  # torch is local-only (importorskip -> skipped in CI); torch lib work dilutes it
+@pytest.mark.informational  # torch is local-only (importorskip); torch lib work dilutes it
 def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark torch() of a numeric result (skipped if torch is absent)."""
     pytest.importorskip("torch")
     q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)"
     con.sql(q).torch()  # warm
     benchmark(lambda: con.sql(q).torch())
 
 
-# --------------------------------------------------------------------------- #
-# MEMORY GUARD (secondary signal, not a codspeed benchmark). codspeed walltime tracks neither memory nor
-# allocations, and conversion regressions are often memory-shaped (the recorded fetchall list->tuple edge-copy;
-# the df() masked_array branch). We use tracemalloc to capture the PEAK Python-tracked allocation of ONE
-# df()-with-nulls call. Correctness notes:
-#   * reset_peak() is called AFTER the warm (and after freeing the warm result) so the warm does not establish
-#     a high-water mark that swallows the measured call -- the prior getrusage(ru_maxrss) version was broken
-#     precisely because ru_maxrss is monotonic and the warm pre-set the peak, making the delta ~0.
-#   * tracemalloc reports BYTES on every platform (no macOS-bytes / Linux-KiB skew that the getrusage version
-#     had), so the ceiling is portable to the Linux CI target.
-# CAVEAT: tracemalloc only sees Python-level allocations; the raw numpy column buffers are allocated in C and
-# are NOT visible here. So this catches a gross PYTHON-object-shaped blowup (the masked->pd.NA rewrite / a
-# per-row object materialization regression) but is not a total-RSS gate -- the authoritative CI gate for the
-# C-buffer payload is codspeed memory mode (--codspeed-mode=memory).
-# --------------------------------------------------------------------------- #
+# Memory guard (secondary signal, not a codspeed benchmark; codspeed walltime tracks neither memory nor allocs).
+# tracemalloc captures the PEAK Python-tracked allocation of ONE df()-with-nulls call. reset_peak() runs AFTER
+# the warm so the warm does not set a high-water mark that swallows the measured call. tracemalloc reports bytes
+# on every platform (portable to Linux CI). CAVEAT: it only sees Python-level allocs, not the C numpy buffers, so
+# it catches a gross Python-object blowup (masked-to-pd.NA gone wrong) but is not a total-RSS gate; that is
+# codspeed memory mode's job (deferred, see PLAN.md).
 
 
 def test_mem_df_with_nulls() -> None:
-    """Guard the Python-tracked peak allocation of a null-heavy df() call."""
     con = duckdb.connect(config={"threads": 1})
     try:
         tracemalloc.start()
@@ -190,7 +143,6 @@ def test_mem_df_with_nulls() -> None:
     finally:
         con.close()
     print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr)
-    # Python-tracked allocations for a 500k x 2-col masked df are a few MB; a gross conversion-memory blowup
-    # (e.g. a per-row Python object list, the masked->pd.NA rewrite gone wrong) is tens+ MB. 100 MB ceiling
-    # catches that without flaking, and is bytes on all platforms.
+    # a 500k x 2-col masked df is a few MB of Python-tracked allocs; a gross blowup is tens+ MB. 100 MB ceiling
+    # catches that without flaking.
     assert peak < 100_000_000
diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py
index 5b386da5..bd494c2e 100644
--- a/benchmarks/test_relational_construction_perf.py
+++ b/benchmarks/test_relational_construction_perf.py
@@ -1,17 +1,8 @@
-"""CodSpeed benchmark: relational-API expression construction. Standalone, not in CI's binding gate.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_relational_construction_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-SCOPE: this is relational-API *construction* (ColumnExpression / ConstantExpression / operator overloads),
-NOT the binding-pressure surface the rest of the suite targets. It was moved here out of test_fetch_perf.py
-(MEAS-5) because it is out of scope for the binding-pressure gate. It is KEPT because it carries a real signal
-(a measured ~35% expression-construction delta at the cutover), so it stays visible -- but it is marked
-`informational`, so it runs and reports and is NEVER part of the gate.
+"""Relational-API expression construction. Informational, out of the binding gate. See benchmarks/README.md.
+
+This is expression *construction* (ColumnExpression / ConstantExpression / operator overloads), not the
+binding-pressure surface the rest of the suite targets. Kept because it carries a real signal (a measured ~35%
+construction delta at the cutover), but never part of the gate.
 """
 
 from __future__ import annotations
@@ -25,13 +16,10 @@
 if TYPE_CHECKING:
     from pytest_codspeed import BenchmarkFixture
 
-# informational: relational-API construction, deliberately excluded from the binding-pressure gate (MEAS-5).
 pytestmark = pytest.mark.informational
 
 
 def test_expr_many(benchmark: BenchmarkFixture) -> None:
-    """Benchmark building many column/constant expressions."""
-
     def run() -> int:
         out = []
         for i in range(2000):
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
index f0323fea..2d3ae95d 100644
--- a/benchmarks/test_types_roundtrip_perf.py
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -1,16 +1,7 @@
-"""CodSpeed benchmark: the type x direction produce matrix (fetchall / df / to_arrow per type). Standalone, not in CI.
-
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_types_roundtrip_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-One logical type per column across three directions, so a regression localizes to (type, direction). Includes
-the wide types the narrow-numeric benchmarks miss: hugeint, uuid, decimal128, long varchar. Note: to_arrow on a
-materialized result re-runs the query with the GIL released, so the arrow column is engine-parallel and
-walltime-noisy: informational, not a hard gate.
+"""Type x direction produce matrix: fetchall / df / to_arrow per logical type. See benchmarks/README.md.
+
+One logical type per column across three directions, so a regression localizes to (type, direction). Includes the
+wide types the narrow-numeric benches miss: hugeint, uuid, decimal128, long varchar.
 """
 
 from __future__ import annotations
@@ -25,7 +16,7 @@
 
     import duckdb
 
-N = scaled(100_000)  # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4)
+N = scaled(100_000)
 
 # one logical type per column; long-varchar is intentionally > 64 chars
 TYPE_EXPR = {
@@ -46,34 +37,29 @@
 TYPES = list(TYPE_EXPR)
 
 
-# `con` fixture + threads=1 live in conftest.py.
 def _query(type_name: str) -> str:
     return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
 
 
-@pytest.mark.gate  # OUT-row fetchall -> binding-dominated per-type dispatch
+@pytest.mark.gate  # OUT-row: binding-dominated per-type dispatch
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
-    """Benchmark fetchall of one logical type per column."""
     q = _query(type_name)
     con.execute(q).fetchall()  # warm
     benchmark(lambda: con.execute(q).fetchall())
 
 
-@pytest.mark.gate  # OUT-col df() -> binding-dominated ArrayWrapper fill per type
+@pytest.mark.gate  # OUT-col: binding-dominated ArrayWrapper fill per type
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
-    """Benchmark df() of one logical type per column."""
     q = _query(type_name)
     con.sql(q).df()  # warm
     benchmark(lambda: con.sql(q).df())
 
 
-@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel, noisy) -> not gated
 @pytest.mark.parametrize("type_name", TYPES)
 def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
-    """Benchmark to_arrow_table() of one logical type per column (informational only)."""
-    # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy)
     q = _query(type_name)
     con.sql(q).to_arrow_table()  # warm
     benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
index a62be815..0f381ca7 100644
--- a/benchmarks/test_udf_perf.py
+++ b/benchmarks/test_udf_perf.py
@@ -1,14 +1,6 @@
-"""CodSpeed benchmark: Python UDF paths (native scalar + vectorized arrow). Standalone, not in CI.
+"""Python UDFs: native scalar (one call per row) and vectorized arrow (one call per chunk). See benchmarks/README.md.
 
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding):
-  cd /Users/evert/projects/duckdb-python/wt-codspeed
-  for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \
-    $P -m pytest benchmarks/test_udf_perf.py \
-    --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \
-  done
-
-Native scalar = one Python call per row (arg build + PyObject_CallObject + result transform); arrow = one call
-per chunk. Full consume: each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row.
+Each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row.
 """
 
 from __future__ import annotations
@@ -28,46 +20,39 @@
 pa = pytest.importorskip("pyarrow")
 pc = pytest.importorskip("pyarrow.compute")
 
-# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep.
 NATIVE_N = scaled(200_000)  # native = one Python call per row, keep moderate
 ARROW_N = scaled(1_000_000)  # arrow = one Python call per chunk (vectorized), can be large
 
 
-# `con` fixture + threads=1 live in conftest.py.
 def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
-    con.execute(query).fetchall()  # warm the engine + import caches before measuring
+    con.execute(query).fetchall()  # warm the engine + import caches
     benchmark(lambda: con.execute(query).fetchall())
 
 
-# --------------------------------------------------------------------------- #
-# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result).
-# --------------------------------------------------------------------------- #
+# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). The Python
+# call dominates; the sum() consume is negligible -> gate.
 
 
-@pytest.mark.gate  # native scalar UDF: one Python call per row dominates; the sum() consume is negligible
+@pytest.mark.gate
 def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a 1-arg native int scalar UDF."""
     con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
 @pytest.mark.gate
 def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a 2-arg native int scalar UDF."""
     con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
     _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
 
 
 @pytest.mark.gate
 def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a 1-arg native double scalar UDF."""
     con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
     _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
 
 
 @pytest.mark.gate
 def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a native string scalar UDF."""
     con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
     _bench(
         benchmark,
@@ -78,9 +63,8 @@ def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConn
 
 @pytest.mark.gate
 def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the validity short-circuit for NULL inputs to a native UDF."""
-    # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the
-    # validity short-circuit, not the Python call, so the UDF only ever sees non-NULL rows.
+    # DEFAULT null handling short-circuits NULL inputs (SetNull) WITHOUT calling the UDF: measures the validity
+    # short-circuit, so the UDF only ever sees non-NULL rows.
     con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
     _bench(
         benchmark,
@@ -90,30 +74,26 @@ def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBP
     )
 
 
-# --------------------------------------------------------------------------- #
-# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast.
-# --------------------------------------------------------------------------- #
+# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. pyarrow lib
+# work + per-chunk conversion + engine work over all ARROW_N (1M) rows -> informational.
 
 
-@pytest.mark.informational  # vectorized arrow UDF: pyarrow.compute lib work + per-chunk conversion + 1M engine
+@pytest.mark.informational
 def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a vectorized arrow int UDF."""
     con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
 
 
 @pytest.mark.informational
 def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark a vectorized arrow double UDF."""
     con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
     _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
 
 
 @pytest.mark.informational
 def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
-    """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF."""
-    # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call
-    # and reconstructs the result vector afterwards -- this is the selvec compaction/reconstruction cost.
+    # DEFAULT null handling on the vectorized path compacts the validity (selvec) before the call and reconstructs
+    # the result vector after: this measures the selvec compaction/reconstruction cost.
     con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
     _bench(
         benchmark,
diff --git a/pyproject.toml b/pyproject.toml
index 90218094..12cad096 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -281,10 +281,8 @@ test = [ # dependencies used for running tests
     "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
     "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
 ]
-bench = [ # minimal, pinned deps for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). Deliberately
-          # NOT the heavy `test` group (no torch/tensorflow/pyspark/adbc). Pinned via uv.lock and kept in lockstep
-          # with any baseline compared against, so the only cross-run delta is the binding. Constraints mirror the
-          # `test` group so the lockfile resolves identically. torch/tf produce paths are local-only (importorskip).
+bench = [ # Pinned deps for the benchmark suite (see benchmarks/README.md). Minimal, not the heavy `test` group.
+          # Constraints mirror `test` so the lockfile resolves identically; torch/tf are local-only (importorskip).
     "pytest",
     "pytest_codspeed",
     "polars>=1.33.0",
@@ -457,6 +455,10 @@ strict = true
     # No need for type hinting in tests
     'ANN001', 'ANN201', 'ANN202'
 ]
+"benchmarks/**.py" = [
+    # benchmarks are test-like: docstrings optional (shared context lives in benchmarks/README.md)
+    'D100', 'D101', 'D102', 'D103', 'D104', 'D105', 'D107',
+]
 "tests/fast/spark/**.py" = [
     "E402"
 ]
diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py
index 22de87b2..1ffd596c 100644
--- a/tests/fast/test_binding_pressure_leak.py
+++ b/tests/fast/test_binding_pressure_leak.py
@@ -1,12 +1,8 @@
-"""Sustained-iteration leak guards for the binding object-pinning paths (COV-3).
+"""Sustained-iteration leak guards for the binding object-pinning paths.
 
-Sibling of test_relation_dependency_leak.py. CodSpeed measures steady-state PER-CALL cost and structurally cannot
-see a per-call refcount imbalance in the object-pinning graph (ExternalDependency / registered_objects / a UDF's
-retained Python callable) until it OOMs. This is a plain assertion test (NOT a codspeed benchmark, no marker): it
-runs each pinning path N times and asserts RSS and Python-object growth stay flat.
-
-Covers the paths the existing leak test does not: register/unregister, native + arrow UDF create/run/remove, and
-executemany. (from_arrow/from_df/replacement-scan pinning is already covered by test_relation_dependency_leak.py.)
+CodSpeed measures per-call cost and can't see a refcount imbalance in the object-pinning graph until it OOMs, so
+this plain assertion test runs each pinning path N times and asserts RSS and object growth stay flat. Covers what
+test_relation_dependency_leak.py does not: register/unregister, native + arrow UDF create/run/remove, executemany.
 """
 
 import gc

From 527755617ac4a843679b0101ae5c949442cb032f Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Thu, 2 Jul 2026 09:13:29 +0200
Subject: [PATCH 6/7] fix bugs, add baseline

---
 .github/workflows/codspeed.yml          |   9 +-
 benchmarks/baseline.json                | 972 ++++++++++++++++++++++++
 benchmarks/test_produce_numpy_perf.py   |   4 +-
 benchmarks/test_types_roundtrip_perf.py |  11 +-
 4 files changed, 989 insertions(+), 7 deletions(-)
 create mode 100644 benchmarks/baseline.json

diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
index b80323fc..2e14ebf5 100644
--- a/.github/workflows/codspeed.yml
+++ b/.github/workflows/codspeed.yml
@@ -80,10 +80,13 @@ jobs:
         run: |
           # step 1: build deps only (needed for --no-build-isolation), no project
           uv sync --only-group build --no-install-project -p 3.13
-          # step 2: build+install the project (release) + build group, without the heavy default `dev` group
-          uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13
-          # step 3: the frozen bench pins (exact ==), so the only cross-run delta is the binding
+          # step 2: install the frozen bench pins (exact ==), so the only cross-run delta is the binding. MUST precede
+          # the build: numpy>=2.0 is a [build-system].requires (numpy C API headers), which --no-build-isolation does
+          # not auto-install and which is not in the `build` group, so CMake's find_package(... NumPy) would fail.
           uv pip install -r benchmarks/requirements-bench.txt
+          # step 3: build+install the project (release), no default `dev` group (torch/tensorflow/pyspark). uv pip
+          # install is additive; uv sync here would prune numpy back out before the build and re-break CMake configure.
+          uv pip install --no-build-isolation --no-deps --reinstall -C cmake.build-type=Release .
 
       - name: Collect gate node-ids
         # the gate/informational marker split; regen uses it to classify each benchmark
diff --git a/benchmarks/baseline.json b/benchmarks/baseline.json
new file mode 100644
index 00000000..fe809300
--- /dev/null
+++ b/benchmarks/baseline.json
@@ -0,0 +1,972 @@
+{
+  "meta": {
+    "schema_version": 1,
+    "generated_at_utc": "2026-07-02T06:26:46+00:00",
+    "git_commit": "090e02142b1bca4163c526ad75a4dcc84a5ae374",
+    "duckdb_submodule_sha": "d9a775e4c03b23ecb3784f879196aa81adf0ac1c",
+    "requirements_bench_sha256": "2bdfd6a766947a61559afb2799c54f0ea173b9325f55082ad809bf7b97b2c659",
+    "measurement": {
+      "tool": "valgrind callgrind",
+      "event": "Ir",
+      "pythonhashseed": "0"
+    },
+    "bench_scale": "10",
+    "gate_default_threshold_pct": 5.0,
+    "binding_fraction_cutoff": 0.25,
+    "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above."
+  },
+  "benchmarks": {
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[1000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 13968509,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[2]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 13117509,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[50000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 17445483,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 7507078,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_reader_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 8566385,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 16952462,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_reader_consumed": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29404937,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_table_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29199115,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_table_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 25884569,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_polars_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29363771,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_polars_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29278882,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[10000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 49732326,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[1000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 32634030,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[100]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 31130130,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[20000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 68677642,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[10000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 61656223,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[1000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 32870219,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[100]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30241645,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[20000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 93837059,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[10000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 41073162,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[1000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31192384,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[100]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 30319144,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[20000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 51996785,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_100k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3255412,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_200k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3253716,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_small": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 2855767,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_2col_500k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31312283,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_2int": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30527833,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_2int_gate": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 5144687,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_blob": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 46799205,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_decimal128": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 234831861,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_double": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28100940,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_int": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18885980,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_int_gate": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 3207318,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_mixed": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 298310717,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_mixed_wide": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 629847376,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_null_heavy": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18497920,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_smallint": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18158437,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_str": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 36630015,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_timestamptz": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 442013591,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchmany_batched": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 44376635,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchone_iter": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 56082286,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_executemany_3col": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 20508999651,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_dicts": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6300053057,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_scalars": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 4364660696,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_tuples": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 5224666337,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_bind_analyzer_object": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21109327,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_dict_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5698722,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_double_with_nan": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 4441652,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_masked_int": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 4427922,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_object_string_mixed": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 71135312,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5978439,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 16958452,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6253482,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31577228,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31316827,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 27977539,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 29474196,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40398312,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 69326603,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_timestamp": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21747493,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_decimal128": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12498891,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_hugeint": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 7060301,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 29464799,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40357060,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 69304377,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_timestamp": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21738267,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_uuid": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 215063593,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetch_df_chunk_loop": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 43497043,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 28165468,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 35144943,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_relational_construction_perf.py::test_expr_many": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 64025731,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[bool]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3639613,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[date]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5143666,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal128]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11654375,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal64]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6088232,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[double]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 12398027,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[hugeint]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6319959,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[int64]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 2512782,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[list]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11014392,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[struct]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5119483,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[timestamp]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 10337048,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[uuid]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11291045,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_long]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 20944198,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_short]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11322686,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 3638394,
+      "binding_fraction": 0.1053,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6410855,
+      "binding_fraction": 0.4922,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal128]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12496882,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6410024,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12732237,
+      "binding_fraction": 0.7443,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[hugeint]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 7054469,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 2718974,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[list]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 91324470,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[struct]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 110991217,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[timestamp]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 10647333,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[uuid]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 215166204,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_long]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40038336,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_short]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28326808,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[bool]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 17981967,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[date]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 23701642,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal128]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 234148728,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21656881,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[double]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28070587,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[hugeint]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 159982348,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[int64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18836658,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[list]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 150499447,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[struct]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 119062526,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[timestamp]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30750748,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[uuid]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 226484384,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_long]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 49637213,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_short]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 33743613,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_double": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 102838074,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_int": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 56453572,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_null_inputs": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 72729269,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_double_1arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40772497,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_int_1arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 35374345,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_int_2arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 48207658,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_null_inputs": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 26901535,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 93255939,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    }
+  }
+}
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
index faa053ac..f7a103da 100644
--- a/benchmarks/test_produce_numpy_perf.py
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -48,7 +48,7 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu
 # df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types.
 
 
-@pytest.mark.gate
+@pytest.mark.informational
 def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     _bench_df(benchmark, con, Q_NUM)
 
@@ -86,7 +86,7 @@ def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti
 # fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap.
 
 
-@pytest.mark.gate
+@pytest.mark.informational
 def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
     _bench_numpy(benchmark, con, Q_NUM)
 
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
index 2d3ae95d..9cc8d6b3 100644
--- a/benchmarks/test_types_roundtrip_perf.py
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -36,6 +36,14 @@
 }
 TYPES = list(TYPE_EXPR)
 
+# OUT-col bool/int64 are engine-diluted below the Option-B cutoff (binding_fraction < 0.25, see baseline.json): the
+# numpy column fill is trivial next to the engine scan, so they are informational while the other types stay gate.
+# OUT-row is unaffected (fetchall builds a Python object per cell, binding-dominated for every type).
+_OUT_COL_DILUTED = {"bool", "int64"}
+_OUT_COL_PARAMS = [
+    pytest.param(t, marks=pytest.mark.informational if t in _OUT_COL_DILUTED else pytest.mark.gate) for t in TYPES
+]
+
 
 def _query(type_name: str) -> str:
     return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
@@ -49,8 +57,7 @@ def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConne
     benchmark(lambda: con.execute(q).fetchall())
 
 
-@pytest.mark.gate  # OUT-col: binding-dominated ArrayWrapper fill per type
-@pytest.mark.parametrize("type_name", TYPES)
+@pytest.mark.parametrize("type_name", _OUT_COL_PARAMS)  # OUT-col: ArrayWrapper fill; gate per type except diluted ones
 def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
     q = _query(type_name)
     con.sql(q).df()  # warm

From 79b5e6d8763413d298d8596672501937a14337f1 Mon Sep 17 00:00:00 2001
From: Evert Lammerts <evert.lammerts@gmail.com>
Date: Thu, 2 Jul 2026 09:23:24 +0200
Subject: [PATCH 7/7] remove cp311 from seeds

---
 .github/workflows/packaging_wheels.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml
index 7a3bb74a..96da6227 100644
--- a/.github/workflows/packaging_wheels.yml
+++ b/.github/workflows/packaging_wheels.yml
@@ -30,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python: [ cp311, cp314 ]
+        python: [ cp314 ]
         platform:
           - { os: windows-2022,     arch: amd64,      cibw_system: win }
           - { os: windows-11-arm,   arch: ARM64,      cibw_system: win }