diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
new file mode 100644
index 00000000..2e14ebf5
--- /dev/null
+++ b/.github/workflows/codspeed.yml
@@ -0,0 +1,129 @@
+# Instruction-count (Callgrind) perf-regression gate against a COMMITTED baseline. No CodSpeed account/token/runner:
+# compare_baseline.py parses raw callgrind dumps and diffs each benchmark against benchmarks/baseline.json. Counts
+# are near-deterministic with PYTHONHASHSEED pinned (~0.1% noise), so the 5% gate threshold sits far above it.
+# Details + rationale: benchmarks/README.md and benchmarks/PLAN.md.
+#
+# Triggers: nightly schedule + manual workflow_dispatch (no pull_request/push). A dispatch on a feature branch
+# compares that branch's counts vs the baseline.json committed on it, answering "did my branch regress vs main".
+#
+# Modes (workflow_dispatch input `regen`):
+#   regen=false (default) -> COMPARE + report. Report-only for now (never fails); flip to --enforce once trusted.
+#   regen=true            -> write a fresh baseline.json + upload as an artifact to commit deliberately. Bump
+#                            requirements-bench.txt FIRST (separate commit) if the pins should change.
+#
+# The concurrency module is excluded from the sweep (Callgrind serializes threads, so its signal is meaningless).
+# Memory mode (a second sweep for produce peak-RSS) is deferred (see PLAN.md).
+
+name: Benchmarks
+
+on:
+  schedule:
+    - cron: "0 3 * * *" # nightly at 03:00 UTC
+  workflow_dispatch:
+    inputs:
+      regen:
+        description: "Regenerate benchmarks/baseline.json (upload as artifact) instead of comparing"
+        type: boolean
+        default: false
+
+concurrency:
+  group: codspeed-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  benchmarks:
+    runs-on: ubuntu-latest
+    timeout-minutes: 90 # ~25 min sweep at BENCH_SCALE=10 (12-core Linux) + ~10 min cold build; margin for CI
+    permissions:
+      contents: read
+    env:
+      PYTHONHASHSEED: "0" # stable instruction counts for dict/struct paths
+      CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks
+      # shrink the O(rows) benches so the sweep fits under timeout-minutes. Local runs leave this unset -> full N.
+      # Recorded in baseline.json meta.bench_scale; a baseline only compares to a run at the SAME scale.
+      BENCH_SCALE: "10"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive # the DuckDB engine submodule is needed to build
+          fetch-depth: 0 # setuptools_scm needs history for version detection
+
+      - name: Resolve DuckDB submodule SHA
+        id: duckdb_sha
+        # used for the sccache key AND passed to compare_baseline.py for the engine-bump guard
+        run: echo "sha=$(git rev-parse HEAD:external/duckdb)" >> "$GITHUB_OUTPUT"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install valgrind
+        run: sudo apt-get update && sudo apt-get install -y valgrind
+
+      - name: Cache sccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/sccache
+          key: sccache-codspeed-${{ steps.duckdb_sha.outputs.sha }}
+          restore-keys: sccache-codspeed-
+
+      - name: Install sccache
+        run: |
+          curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \
+            | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache
+
+      - name: Build the extension (release) + pinned benchmark deps
+        env:
+          CMAKE_C_COMPILER_LAUNCHER: sccache
+          CMAKE_CXX_COMPILER_LAUNCHER: sccache
+        run: |
+          # step 1: build deps only (needed for --no-build-isolation), no project
+          uv sync --only-group build --no-install-project -p 3.13
+          # step 2: the frozen bench pins (exact ==), so the only cross-run delta is the binding. MUST precede the
+          # build: numpy>=2.0 is a [build-system].requires (numpy C API headers), which --no-build-isolation does
+          # not auto-install and which is not in the `build` group, so CMake's find_package(... NumPy) fails first.
+          uv pip install -r benchmarks/requirements-bench.txt
+          # step 3: build+install the project (release), no default `dev` group (torch/tensorflow/pyspark). uv pip
+          # install is additive; uv sync here would prune numpy back out before the build and re-break the config.
+          uv pip install --no-build-isolation --no-deps --reinstall -C cmake.build-type=Release .
+
+      - name: Collect gate node-ids # the gate/informational marker split; regen uses it to classify each benchmark
+        run: |
+          uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \
+            | grep '::' > gate_list.txt || true
+
+      - name: Run benchmarks under Callgrind (per-benchmark instruction counts)
+        # ONE sweep over gate+informational EXCEPT the concurrency module (thread-serialized, expensive). Each
+        # benchmark emits a callgrind dump keyed by its uri. --trace-children=yes: also instrument uv's pytest child.
+        run: |
+          mkdir -p profiles
+          CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --trace-children=yes \
+            --instr-atstart=no --callgrind-out-file="$PWD/profiles/cg.%p.%n" \
+            uv run --no-sync pytest benchmarks/ \
+              --ignore=benchmarks/test_concurrency_perf.py \
+              -m "gate or informational" --codspeed -o addopts= -p no:cacheprovider
+
+      - name: Compare against committed baseline (report-only)
+        if: ${{ !inputs.regen }}
+        # report-only: prints the delta table, never fails the job. Add --enforce once trusted.
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py compare \
+            --profiles profiles --baseline benchmarks/baseline.json \
+            --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
+
+      - name: Regenerate baseline (upload artifact to commit deliberately)
+        if: ${{ inputs.regen }}
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py regen \
+            --profiles profiles --out benchmarks/baseline.json --gate-list gate_list.txt \
+            --git-commit "${{ github.sha }}" --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
+
+      - name: Upload regenerated baseline
+        if: ${{ inputs.regen }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-update
+          path: benchmarks/baseline.json
diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml
index 7a3bb74a..96da6227 100644
--- a/.github/workflows/packaging_wheels.yml
+++ b/.github/workflows/packaging_wheels.yml
@@ -30,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python: [ cp311, cp314 ]
+        python: [ cp314 ]
         platform:
           - { os: windows-2022,     arch: amd64,      cibw_system: win }
           - { os: windows-11-arm,   arch: ARM64,      cibw_system: win }
diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
new file mode 100644
index 00000000..835aef7d
--- /dev/null
+++ b/benchmarks/PLAN.md
@@ -0,0 +1,90 @@
+# Benchmark suite plan
+
+Design rationale for the binding micro-benchmarks. The suite is implemented in `benchmarks/`; CI lives in
+`../.github/workflows/codspeed.yml`; conventions, markers, and the two data-pattern traps are in
+[README.md](README.md).
+
+Priority: **P0** = known-regression or cutover-reworked path (narrow-numeric common case); **P1** = high-traffic
+conversion or per-element Python work; **P2** = correctness-relevant, lower-traffic or engine-dominated.
+
+## Scenarios
+
+PRODUCE (duckdb to Python) is the highest regression risk: `Fetchone` builds a `TupleBuilder` per row and calls
+`FromValue` per cell (O(rows x cols), the shape of the historical ~15% fetchall regression).
+
+- **OUT-row** (`test_fetch_perf`, `test_types_roundtrip_perf`): fetchall / fetchone / fetchmany per type. P0
+  narrow numeric; P1 varchar, list, struct, and the expensive per-row types (decimal `Decimal()`, timestamptz
+  pytz, hugeint string round-trip, uuid). Small-N `*_gate` probes isolate the compile+fetch fixed cost.
+- **OUT-col** (`test_produce_numpy_perf`): df() / fetchnumpy() reworked columnar path. P0 numeric no-null vs
+  REAL-null (the masked_array branch); plus string, timestamp, and wide-internal (hugeint/uuid/decimal128).
+- **OUT-arrow / polars** (`test_arrow_perf`): to_arrow_table / reader / pl(). Informational (engine-parallel,
+  GIL-released).
+- **Cardinality** (`test_cardinality_perf`): a LIMIT-n sweep giving a clean per-row conversion slope.
+
+INGEST (Python to duckdb):
+
+- **numpy / pandas** (`test_ingest_numpy_perf`, `test_pandas_perf`): numpy-backed scan (NaN-to-NULL, masked),
+  object-string transcode ladder, arrow-backed zero-copy, and the per-bind PandasAnalyzer.
+- **arrow** (`test_arrow_perf`): Table + RecordBatchReader + dictionary sweep.
+- **native** (`test_ingest_native_perf`): values() list/tuple/dict per-cell TransformPythonValue, executemany.
+
+UDF (`test_udf_perf`, zero coverage before this suite): native scalar per-row (P0, the biggest untested per-call
+path) and vectorized arrow per-chunk.
+
+## Type x direction matrix
+
+Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper),
+OUT-arrow.
+
+| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow |
+|------|-----------|----------|---------|---------|-----------|
+| int32/int64 | P1 | **P0** | **P0** | **P0** | P1 |
+| double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 |
+| varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 |
+| bool | P2 | P1 | P2 | P1 | P2 |
+| decimal64/128 | P2 | n/a | **P1** (Python Decimal) | P1 | P2 |
+| date | P2 | P1 | P1 | P1 | P2 |
+| timestamp(tz) | P1 | P1 | **P1** (pytz/row) | P1 | P1 |
+| LIST/STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 |
+| hugeint/uuid | P2 | P2 | **P1** (round-trip) | P1 | P2 |
+| blob/map | P2 | P2 | P2 | P2 | P2 |
+| NULL-heavy | n/a | **P1** | P2 | **P0** (masked_array) | P1 |
+
+## Mechanics
+
+- **Walltime vs instruction-count.** Local A/B is walltime only (no Valgrind on macOS arm64). CI is
+  instruction-count via Callgrind run directly in the workflow, with no CodSpeed service (near-deterministic,
+  PYTHONHASHSEED pinned), diffed against a committed baseline. Report-only until trusted.
+- **Marker split + auto-move.** Every benchmark is `gate` or `informational` (see README). At baseline regen,
+  each numeric-produce gate's binding fraction `= 1 - floor_Ir / bench_Ir` is computed against its engine floor
+  (`test_engine_control_perf`); a gate below the ~25% cutoff is auto-moved to informational (a threshold on an
+  engine-diluted total is not meaningful). OUT-row fetch and UDFs are ~all binding; numeric produce is a bulk
+  memcpy of ~engine magnitude (auto-move candidate).
+- **Guards.** compare_baseline.py warns and stops enforcing when BENCH_SCALE, the pin file, or the DuckDB
+  submodule SHA differ from the baseline's (any of those makes the counts non-comparable).
+- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain RSS + object-count test for the
+  object-pinning paths, since a per-call refcount imbalance is invisible to a steady-state benchmark.
+- **Memory mode** (a second Callgrind sweep for O(rows) produce peak-RSS) is designed but deferred; the
+  `test_mem_df_with_nulls` tracemalloc guard is the local stand-in.
+
+## Cross-check vs iqmo-org/bareduckdb
+
+Their suite is a SQL-file-driven A/B comparing two clients (production `duckdb` vs the C-API prototype), arrow-in
+/ arrow-out only, no fetchall/df/numpy/native/UDF coverage. So our binding suite is far broader; their genuine
+deltas concentrate in PRODUCE/types. Actionable additions they suggest:
+
+- **hugeint / uuid in the produce matrix** (they select both): OUT-row does a per-value string round-trip, distinct
+  from narrow int. Now in `test_produce_numpy_perf` / `test_fetch_perf`.
+- **int128-internal decimal** (`DECIMAL(28,x)`) alongside the int64-internal one: hits a wider cast path. Added.
+- **heterogeneous mixed-type row**: exercises per-cell type dispatch in the Fetchone loop, unlike homogeneous
+  columns. Added as `test_fetchall_mixed_wide`.
+- **long varchar (>64 char)** alongside the short string: shifts string copy / transcode toward copy-bound. Added
+  as `varchar_long` in the matrix.
+- **result-cardinality (top-N) sweep**: holds engine work ~constant while sweeping rows-to-Python. Adopted as
+  `test_cardinality_perf` (plain LIMIT, no ORDER BY; the sort swamped the signal).
+- **peak-memory guard** on the O(rows) produce paths: a conversion regression is often memory-shaped. Partially
+  covered by the tracemalloc guard; full coverage waits on memory mode.
+
+Out of scope (theirs, not adopted): pure-engine filter/group/window workloads; 100M+ row scale (IO/engine
+dominated); the free-threading category (unsupported by this client). Do NOT adopt their no-warmup single-run
+methodology (charges import-cache population into the measurement).
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..ca8f8355
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,34 @@
+# Benchmark suite
+
+CodSpeed micro-benchmarks for the binding hot paths (produce, ingest, UDF).
+Design rationale: [PLAN.md](PLAN.md). CI: [../.github/workflows/codspeed.yml](../.github/workflows/codspeed.yml).
+
+## Markers
+
+Every benchmark carries exactly one (registered in `conftest.py`):
+
+- **gate**: binding-dominated, GIL-held, deterministic under Callgrind. A threshold breach is a binding regression.
+- **informational**: engine/library/streaming-diluted. Reported, never gated (would false-positive on engine bumps).
+
+## Local A/B (walltime)
+
+Only walltime runs locally (no Valgrind on macOS arm64; instruction-count gating is Linux/CI-only, and walltime is
+noisy on sub-ms benches). Pin the data libs identically across both builds so the delta is pure binding:
+
+```bash
+for P in ../main/.venv-release/bin/python .venv-release/bin/python; do
+  $P -m pytest benchmarks/<module>.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+done
+```
+
+## Conventions
+
+- READ aggregates real columns (`sum`/`length`), never `count(*)` (answered from metadata).
+- WRITE fully materializes the result or drains the lazy reader.
+- Warm once before measuring.
+- `con` fixture pins `threads=1` (see `conftest.py`).
+
+Two traps (a benchmark that skips these silently measures the wrong thing):
+
+- OUT-col null benches need REAL nulls (`CASE WHEN ... THEN NULL`), else the cheap `std::move` path is taken.
+- IN-numpy string benches need mixed ASCII + non-ASCII + a null sentinel, else the transcode/null ladder is skipped.
diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py
new file mode 100644
index 00000000..a4049aa9
--- /dev/null
+++ b/benchmarks/_scale.py
@@ -0,0 +1,35 @@
+"""Env-gated row-count scaling for the benchmark suite.
+
+Callgrind is 20-50x slower, so the O(rows) benches at full N make the CI sweep too slow. `scaled(n)` shrinks row counts
+ONLY when `BENCH_SCALE=<divisor>` is set (which the CI sweep sets); unset -> full N, so local walltime A/B is
+unchanged. A gate bench and the engine floor it is compared against share a base N, so routing BOTH through
+`scaled()` keeps them at an identical scaled N and the binding fraction stays valid. Scaling reduces row counts
+only; it must never change the data patterns the benches depend on (real nulls, mixed ASCII, LIMIT-no-ORDER-BY).
+A floor keeps a scaled bench row-dominated so per-element work still dominates; the small-N `*_gate` probes are
+already fast and are NOT scaled.
+"""
+
+from __future__ import annotations
+
+import os
+
+FLOOR = 20_000  # a scaled bench never drops below this (stays row-dominated, ~10x the range(2048) probes)
+
+
+def bench_scale() -> int:
+    """Return the `BENCH_SCALE` divisor clamped to >=1; 1 (no scaling) if unset, empty, or not an integer."""
+    v = os.environ.get("BENCH_SCALE")
+    if not v:
+        return 1
+    try:
+        return max(int(v), 1)  # zero / negative divisors clamp to 1 (no scaling)
+    except ValueError:
+        return 1
+
+
+def scaled(n: int) -> int:
+    """Return `n` at full scale, or `max(n // BENCH_SCALE, min(n, FLOOR))` when scaling is enabled."""
+    d = bench_scale()
+    if d <= 1:  # divisor of 1 means scaling is disabled
+        return n
+    return max(n // d, min(n, FLOOR))  # clamp up to FLOOR, but an n already below FLOOR is returned unchanged
diff --git a/benchmarks/baseline.json b/benchmarks/baseline.json
new file mode 100644
index 00000000..fe809300
--- /dev/null
+++ b/benchmarks/baseline.json
@@ -0,0 +1,972 @@
+{
+  "meta": {
+    "schema_version": 1,
+    "generated_at_utc": "2026-07-02T06:26:46+00:00",
+    "git_commit": "090e02142b1bca4163c526ad75a4dcc84a5ae374",
+    "duckdb_submodule_sha": "d9a775e4c03b23ecb3784f879196aa81adf0ac1c",
+    "requirements_bench_sha256": "2bdfd6a766947a61559afb2799c54f0ea173b9325f55082ad809bf7b97b2c659",
+    "measurement": {
+      "tool": "valgrind callgrind",
+      "event": "Ir",
+      "pythonhashseed": "0"
+    },
+    "bench_scale": "10",
+    "gate_default_threshold_pct": 5.0,
+    "binding_fraction_cutoff": 0.25,
+    "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above."
+  },
+  "benchmarks": {
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[1000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 13968509,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[2]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 13117509,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[50000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 17445483,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 7507078,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_reader_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 8566385,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_read_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 16952462,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_reader_consumed": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29404937,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_table_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29199115,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_arrow_table_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 25884569,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_polars_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29363771,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_arrow_perf.py::test_write_polars_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 29278882,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[10000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 49732326,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[1000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 32634030,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[100]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 31130130,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_df[20000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 68677642,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[10000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 61656223,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[1000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 32870219,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[100]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30241645,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_fetchall[20000]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 93837059,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[10000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 41073162,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[1000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31192384,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[100]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 30319144,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[20000]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 51996785,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_100k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3255412,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_200k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3253716,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_small": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 2855767,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_engine_control_perf.py::test_engine_sum_2col_500k": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31312283,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_2int": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30527833,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_2int_gate": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 5144687,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_blob": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 46799205,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_decimal128": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 234831861,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_double": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28100940,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_int": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18885980,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_int_gate": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 3207318,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_mixed": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 298310717,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_mixed_wide": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 629847376,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_null_heavy": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18497920,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_smallint": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18158437,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_str": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 36630015,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchall_timestamptz": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 442013591,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchmany_batched": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 44376635,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_fetch_perf.py::test_fetchone_iter": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 56082286,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_executemany_3col": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 20508999651,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_dicts": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6300053057,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_scalars": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 4364660696,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_native_perf.py::test_ingest_values_tuples": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 5224666337,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_bind_analyzer_object": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21109327,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_dict_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5698722,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_double_with_nan": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 4441652,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_masked_int": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 4427922,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_object_string_mixed": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 71135312,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5978439,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 16958452,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6253482,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31577228,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_numeric": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 31316827,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_string": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 27977539,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 29474196,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40398312,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 69326603,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_timestamp": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21747493,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_decimal128": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12498891,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_hugeint": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 7060301,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 29464799,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40357060,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 69304377,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_timestamp": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21738267,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_df_uuid": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 215063593,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetch_df_chunk_loop": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 43497043,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 28165468,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric_with_nulls": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 35144943,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_relational_construction_perf.py::test_expr_many": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 64025731,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[bool]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 3639613,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[date]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5143666,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal128]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11654375,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal64]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6088232,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[double]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 12398027,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[hugeint]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 6319959,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[int64]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 2512782,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[list]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11014392,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[struct]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 5119483,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[timestamp]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 10337048,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[uuid]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11291045,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_long]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 20944198,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_short]": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 11322686,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 3638394,
+      "binding_fraction": 0.1053,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6410855,
+      "binding_fraction": 0.4922,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal128]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12496882,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 6410024,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 12732237,
+      "binding_fraction": 0.7443,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[hugeint]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 7054469,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": {
+      "marker": "informational",
+      "source_marker": "gate",
+      "auto_moved": true,
+      "instructions": 2718974,
+      "binding_fraction": 0.0,
+      "threshold_pct": null
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[list]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 91324470,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[struct]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 110991217,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[timestamp]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 10647333,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[uuid]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 215166204,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_long]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40038336,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_short]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28326808,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[bool]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 17981967,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[date]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 23701642,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal128]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 234148728,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 21656881,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[double]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 28070587,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[hugeint]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 159982348,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[int64]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 18836658,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[list]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 150499447,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[struct]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 119062526,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[timestamp]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 30750748,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[uuid]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 226484384,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_long]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 49637213,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_short]": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 33743613,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_double": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 102838074,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_int": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 56453572,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_arrow_null_inputs": {
+      "marker": "informational",
+      "source_marker": "informational",
+      "auto_moved": false,
+      "instructions": 72729269,
+      "binding_fraction": null,
+      "threshold_pct": null
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_double_1arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 40772497,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_int_1arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 35374345,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_int_2arg": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 48207658,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_null_inputs": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 26901535,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    },
+    "benchmarks/test_udf_perf.py::test_udf_native_string": {
+      "marker": "gate",
+      "source_marker": "gate",
+      "auto_moved": false,
+      "instructions": 93255939,
+      "binding_fraction": null,
+      "threshold_pct": 5.0
+    }
+  }
+}
diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py
new file mode 100644
index 00000000..ab9773f9
--- /dev/null
+++ b/benchmarks/compare_baseline.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python3
+"""Committed-baseline instruction-count comparison for the benchmark suite. See benchmarks/README.md.
+
+pytest-codspeed's hooks call `callgrind_dump_stats_at(<uri>)` per benchmark, so callgrind writes ONE dump each,
+headed by `desc: Trigger: Client Request: <uri>` with the count on `totals:` (`events: Ir`). This parses those
+raw dumps directly (no CodSpeed account/token/runner). Run-to-run noise is ~0.1%, so the 5% gate threshold sits
+far above it (PYTHONHASHSEED pinned in CI).
+
+Two modes (CI-only; no valgrind on macOS arm64):
+  regen:   write baseline.json from a fresh run: counts + provenance + binding fractions + auto-move.
+  compare: diff a fresh run against baseline.json. Gate benches over threshold are regressions; informational
+           are reported only. Report-only by default; `--enforce` exits non-zero on a gate regression.
+
+baseline.json and benchmarks/requirements-bench.txt are regenerated together so counts match the frozen pins.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+SCHEMA_VERSION = 1  # stored in baseline.json as meta.schema_version
+GATE_DEFAULT_THRESHOLD_PCT = 5.0  # threshold_pct written for every gate bench; compare()'s fallback threshold
+BINDING_FRACTION_CUTOFF = 0.25  # a gate whose isolable binding fraction is below this is auto-moved to
+#                                 informational (a threshold on its engine-diluted total is not meaningful).
+
+# Floor map: the engine-control bench that is the "engine floor" of a numeric-produce gate.
+# binding_fraction = 1 - floor_Ir / bench_Ir. ONLY numeric-produce benches are listed (their per-element binding
+# is a bulk memcpy of ~engine magnitude); every other gate is high-binding and needs no fraction. Add a mapping
+# (and, if needed, a floor) to evaluate more benches.
+_E = "benchmarks/test_engine_control_perf.py"  # key prefix shared by all floor benches below
+FLOOR_MAP = {
+    "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k",
+    "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": f"{_E}::test_engine_sum_2col_500k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": f"{_E}::test_engine_sum_1col_100k",
+    "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": f"{_E}::test_engine_sum_1col_100k",
+}
+
+_TRIGGER_RE = re.compile(r"^desc:\s*Trigger:\s*Client Request:\s*(?P<uri>.+?)\s*$")  # dump line naming the bench
+_TOTALS_RE = re.compile(r"^totals:\s*(?P<ir>\d+)\s*$")  # matches only a single-number totals line
+
+
+# --------------------------------------------------------------------------- #
+# callgrind parsing
+# --------------------------------------------------------------------------- #
+
+
+def _normalize_uri(raw: str) -> str:
+    """Return a repo-relative `benchmarks/...::test` key; input without `::` comes back stripped but unchanged."""
+    raw = raw.strip()
+    if "::" not in raw:
+        return raw
+    path, _, rest = raw.partition("::")
+    idx = path.rfind("/benchmarks/")  # last whole component: skips `microbenchmarks/` and a parent `benchmarks/`
+    if idx >= 0:
+        path = path[idx + 1 :]
+    return f"{path}::{rest}"
+
+
+def parse_profiles(profile_dir: Path) -> dict[str, int]:
+    """Map each benchmark uri to its instruction count, read from the callgrind dumps under `profile_dir`.
+
+    Every file below the directory is scanned (recursively). A dump is used only when its trigger line names a
+    pytest node-id (contains `::`) and it has a `totals:` line, so metadata/termination dumps drop out. The last
+    trigger/totals line in a file wins; a uri seen in several dumps keeps its largest count.
+    """
+    totals_by_uri: dict[str, int] = {}
+    dumps = sorted(profile_dir.rglob("*")) if profile_dir.exists() else []
+    for dump in (entry for entry in dumps if entry.is_file()):
+        try:
+            dump_lines = dump.read_text(errors="replace").splitlines()
+        except (OSError, UnicodeError):
+            continue  # unreadable dump: skip it rather than abort the whole scan
+        bench: str | None = None
+        total: int | None = None
+        for dump_line in dump_lines:
+            trigger = _TRIGGER_RE.match(dump_line)
+            if trigger is not None:
+                bench = _normalize_uri(trigger.group("uri"))
+                continue
+            totals = _TOTALS_RE.match(dump_line)
+            if totals is not None:
+                total = int(totals.group("ir"))
+        # Count the dump only when both a benchmark trigger and a total were found.
+        if bench and "::" in bench and total is not None:
+            totals_by_uri[bench] = max(totals_by_uri.get(bench, 0), total)
+    return totals_by_uri
+
+
+# --------------------------------------------------------------------------- #
+# helpers
+# --------------------------------------------------------------------------- #
+
+
+def _sha256(path: Path) -> str:
+    return "" if not path.exists() else hashlib.sha256(path.read_bytes()).hexdigest()  # "" = no such path
+
+
+def _load_gate_set(gate_list: Path | None) -> set[str]:
+    """Read the gate benchmark uris from a `pytest -m gate --collect-only -q` node-id listing.
+
+    An unset or missing `gate_list` gives an empty set; lines without `::` are not node-ids and are ignored.
+    """
+    if not (gate_list and gate_list.exists()):
+        return set()
+    # Node-ids are the lines containing '::' (the workflow pre-filters the collect-only output to those).
+    stripped = (entry.strip() for entry in gate_list.read_text().splitlines())
+    return {_normalize_uri(node_id) for node_id in stripped if "::" in node_id}
+
+
+def _pct(base: int, new: int) -> float:
+    return (new - base) / base * 100.0 if base != 0 else 0.0  # zero baseline: report 0% instead of dividing
+
+
+# --------------------------------------------------------------------------- #
+# regen
+# --------------------------------------------------------------------------- #
+
+
+def regen(args: argparse.Namespace) -> int:
+    """Write the baseline (args.out) from a valgrind run; returns 0, or 2 without writing when no dump parses."""
+    counts = parse_profiles(Path(args.profiles))
+    if not counts:
+        print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr)
+        return 2
+    gate_set = _load_gate_set(Path(args.gate_list) if args.gate_list else None)
+
+    benches: dict[str, dict] = {}  # uri -> baseline entry (markers, count, binding fraction, threshold)
+    auto_moved: list[str] = []  # gates demoted to informational by the cutoff below
+    for uri, ir in sorted(counts.items()):
+        source_marker = "gate" if uri in gate_set else "informational"  # marker per the gate list
+        marker = source_marker
+        binding_fraction = None
+        floor_uri = FLOOR_MAP.get(uri)
+        if source_marker == "gate" and floor_uri and floor_uri in counts and ir > 0:
+            binding_fraction = round(max(0.0, 1.0 - counts[floor_uri] / ir), 4)  # clamped at 0, 4 decimals
+            if binding_fraction < args.cutoff:
+                marker = "informational"  # Option-B auto-move: engine-diluted, threshold not meaningful
+                auto_moved.append(uri)
+        benches[uri] = {
+            "marker": marker,
+            "source_marker": source_marker,
+            "auto_moved": marker != source_marker,
+            "instructions": ir,
+            "binding_fraction": binding_fraction,
+            "threshold_pct": GATE_DEFAULT_THRESHOLD_PCT if marker == "gate" else None,
+        }
+
+    baseline = {
+        "meta": {
+            "schema_version": SCHEMA_VERSION,
+            "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+            "git_commit": args.git_commit,
+            "duckdb_submodule_sha": args.submodule_sha,
+            "requirements_bench_sha256": _sha256(Path(args.pins)) if args.pins else "",
+            "measurement": {"tool": "valgrind callgrind", "event": "Ir", "pythonhashseed": "0"},
+            "bench_scale": os.environ.get("BENCH_SCALE", ""),  # counts are only comparable at the same scale
+            "gate_default_threshold_pct": GATE_DEFAULT_THRESHOLD_PCT,
+            "binding_fraction_cutoff": args.cutoff,
+            "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above.",
+        },
+        "benchmarks": benches,
+    }
+    Path(args.out).write_text(json.dumps(baseline, indent=2) + "\n")
+    n_gate = sum(1 for b in benches.values() if b["marker"] == "gate")
+    n_info = len(benches) - n_gate
+    print(f"Wrote {args.out}: {len(benches)} benchmarks ({n_gate} gate, {n_info} informational).")
+    if auto_moved:
+        print(f"Option-B auto-moved {len(auto_moved)} engine-diluted gate(s) to informational:")
+        for uri in auto_moved:
+            print(f"  {uri}  (binding_fraction={benches[uri]['binding_fraction']})")
+        print("Recommend updating these benches' @pytest.mark.gate -> informational so code matches the baseline.")
+    return 0
+
+
+# --------------------------------------------------------------------------- #
+# compare
+# --------------------------------------------------------------------------- #
+
+
+def compare(args: argparse.Namespace) -> int:
+    """Report a fresh run vs baseline.json; return 2 if no dumps parse, 1 on an enforced gate regression, else 0."""
+    new_counts = parse_profiles(Path(args.profiles))
+    if not new_counts:
+        print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr)
+        return 2
+    baseline_path = Path(args.baseline)
+    if not baseline_path.exists():
+        # Bootstrap state: no committed baseline yet. Report the run and instruct to regenerate; never fail.
+        print(f"No baseline at {baseline_path} yet -- run the workflow with regen=true to create it.")
+        print(f"This run produced {len(new_counts)} benchmark instruction counts.")
+        return 0
+    baseline = json.loads(baseline_path.read_text())
+    meta = baseline.get("meta", {})
+    base_benches = baseline.get("benchmarks", {})
+
+    # scale guard: a baseline built at BENCH_SCALE=X is only comparable to a run at the same scale.
+    run_scale = os.environ.get("BENCH_SCALE", "")
+    base_scale = meta.get("bench_scale", "")
+    if run_scale != base_scale:
+        print(
+            f"WARNING: BENCH_SCALE differs (run={run_scale!r}, baseline={base_scale!r}) -> instruction counts are "
+            "not comparable. Regenerate the baseline at this scale."
+        )
+
+    # pin-drift guard: the baseline's counts only compare cleanly against the pinned data libs it was built with.
+    if args.pins:
+        cur = _sha256(Path(args.pins))
+        base_pins = meta.get("requirements_bench_sha256", "")
+        if cur and base_pins and cur != base_pins:
+            print(
+                "WARNING: benchmarks/requirements-bench.txt differs from the baseline's pins -> data-lib deltas "
+                "may not be pure binding. Regenerate the baseline with the current pins."
+            )
+
+    # engine-bump guard: engine-inclusive counts shift when the DuckDB submodule changes. If the SHA differs from
+    # the baseline's, don't treat gate deltas as hard failures (they may reflect the bump); warn to regenerate.
+    engine_changed = bool(
+        args.submodule_sha and meta.get("duckdb_submodule_sha") and args.submodule_sha != meta["duckdb_submodule_sha"]
+    )
+
+    regressions: list[str] = []
+    rows: list[tuple[str, str, str]] = []  # (status, uri, detail)
+    for uri, ir in sorted(new_counts.items()):
+        b = base_benches.get(uri)
+        if b is None:
+            rows.append(("NEW", uri, f"{ir} Ir (no baseline)"))
+            continue
+        base_ir = b["instructions"]
+        delta = _pct(base_ir, ir)
+        marker = b.get("marker", "informational")
+        thr = GATE_DEFAULT_THRESHOLD_PCT if b.get("threshold_pct") is None else b["threshold_pct"]  # 0.0 is valid
+        detail = f"{base_ir} -> {ir} Ir  ({delta:+.2f}%, thr {thr:.1f}%, {marker})"
+        if marker == "gate" and delta > thr:
+            if engine_changed:
+                rows.append(("ENGINE?", uri, detail + "  [submodule changed -> not enforced]"))
+            else:
+                rows.append(("REGRESSION", uri, detail))
+                regressions.append(uri)
+        else:
+            rows.append(("ok" if marker == "gate" else "info", uri, detail))
+    rows.extend(
+        ("MISSING", uri, "in baseline, absent from run (rename/removal?)")
+        for uri in sorted(set(base_benches) - set(new_counts))
+    )
+
+    _print_report(meta, rows, engine_changed=engine_changed, enforce=args.enforce)
+
+    if not args.enforce:
+        return 0
+    if engine_changed:
+        print("\nNOT ENFORCING: DuckDB submodule differs from the baseline; regenerate the baseline.")
+        return 0
+    return 1 if regressions else 0
+
+
+def _print_report(meta: dict, rows: list[tuple[str, str, str]], *, engine_changed: bool, enforce: bool) -> None:
+    """Print the banner, the rows (most severe status first, then by uri) and a one-line summary."""
+    mode = "ENFORCING" if enforce else "REPORT-ONLY (not failing the job)"
+    print("=" * 100)
+    print(f"CodSpeed instruction-count baseline comparison  [{mode}]")
+    commit = str(meta.get("git_commit") or "?")[:12]  # str() + `or`: a null/empty/absent field prints as "?"
+    submodule = str(meta.get("duckdb_submodule_sha") or "?")[:12]
+    print(f"baseline: commit {commit}  submodule {submodule}  generated {meta.get('generated_at_utc', '?')}")
+    if engine_changed:
+        print(
+            "WARNING: DuckDB submodule SHA differs from the baseline -> engine-inclusive deltas may reflect the "
+            "engine bump, not the binding. Regenerate the baseline for this engine."
+        )
+    print("=" * 100)
+    order = {"REGRESSION": 0, "ENGINE?": 1, "MISSING": 2, "NEW": 3, "ok": 4, "info": 5}
+    for status, uri, detail in sorted(rows, key=lambda r: (order.get(r[0], 9), r[1])):
+        print(f"  [{status:>10}] {uri}\n               {detail}")
+    n_reg = sum(1 for s, _, _ in rows if s == "REGRESSION")
+    print("-" * 100)
+    print(f"Summary: {len(rows)} benchmarks, {n_reg} gate regression(s)" + ("" if enforce else "  (report-only)"))
+
+
+# --------------------------------------------------------------------------- #
+# cli
+# --------------------------------------------------------------------------- #
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse `argv` (the process arguments when None) and return the chosen subcommand's status."""
+    p = argparse.ArgumentParser(description=__doc__)
+    sub = p.add_subparsers(dest="cmd", required=True)
+
+    r = sub.add_parser("regen", help="write baseline.json from a valgrind run")
+    r.add_argument("--profiles", required=True, help="CODSPEED_PROFILE_FOLDER with callgrind dumps")
+    r.add_argument("--out", default="benchmarks/baseline.json")
+    r.add_argument("--gate-list", help="file of gate node-ids (pytest -m gate --collect-only -q)")
+    r.add_argument("--git-commit", default="")
+    r.add_argument("--submodule-sha", default="")
+    r.add_argument("--pins", default="benchmarks/requirements-bench.txt")
+    r.add_argument("--cutoff", type=float, default=BINDING_FRACTION_CUTOFF)
+    r.set_defaults(func=regen)  # args.func is the handler called at the end of main()
+
+    c = sub.add_parser("compare", help="compare a valgrind run against baseline.json")
+    c.add_argument("--profiles", required=True)
+    c.add_argument("--baseline", default="benchmarks/baseline.json")
+    c.add_argument("--submodule-sha", default="")
+    c.add_argument(
+        "--pins", default="benchmarks/requirements-bench.txt", help="warn if pins differ from the baseline's"
+    )
+    c.add_argument("--enforce", action="store_true", help="exit non-zero on a gate regression (default: report-only)")
+    c.set_defaults(func=compare)
+
+    args = p.parse_args(argv)
+    return args.func(args)
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the process exit status
+    raise SystemExit(main())
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
new file mode 100644
index 00000000..07cba4e4
--- /dev/null
+++ b/benchmarks/conftest.py
@@ -0,0 +1,42 @@
+"""Shared fixtures + marker registration for the benchmark suite. See benchmarks/README.md.
+
+Markers are registered here (not via pyproject `markers=`) because pyproject sets `filterwarnings = ["error"]`,
+so an unregistered mark would raise as a collection error. Every benchmark must carry EXACTLY ONE of `gate` /
+`informational` so the two CI steps (`-m gate`, `-m informational`) cover the suite with no overlap.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import bench_scale, scaled  # noqa: F401  (re-exported as the shared home; used by the modules)
+
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    """Register the gate/informational markers (required under filterwarnings=error)."""
+    config.addinivalue_line(
+        "markers",
+        "gate: binding-dominated, instruction-count gate-able under Callgrind (deterministic).",
+    )
+    config.addinivalue_line(
+        "markers",
+        "informational: engine/library-diluted or streaming; reported, never gated.",
+    )
+
+
+@pytest.fixture
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield a fresh single-threaded connection, closed on teardown.
+
+    `threads=1` pins engine parallelism so counts/walltime don't shift with the runner core count. The
+    concurrency module overrides this deliberately.
+    """
+    c = duckdb.connect(config={"threads": 1})
+    yield c
+    c.close()
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
new file mode 100644
index 00000000..8a9f49e6
--- /dev/null
+++ b/benchmarks/requirements-bench.txt
@@ -0,0 +1,23 @@
+# Frozen pins for the benchmark suite: freezing the data libs means the only cross-run delta is the binding.
+# Regenerate DELIBERATELY, together with the baseline. Source of truth: the `[dependency-groups] bench` list in
+# pyproject.toml (torch/tensorflow deliberately absent, local-only via importorskip). Regenerate with:
+#   uv pip compile pyproject.toml --group bench \
+#     --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \
+#     --no-annotate --no-header -o benchmarks/requirements-bench.txt
+iniconfig==2.3.0
+markdown-it-py==4.2.0
+mdurl==0.1.2
+numpy==2.5.0
+packaging==26.2
+pandas==3.0.3
+pluggy==1.6.0
+polars==1.42.1
+polars-runtime-32==1.42.1
+pyarrow==24.0.0
+pygments==2.20.0
+pytest==9.1.1
+pytest-codspeed==5.0.3
+python-dateutil==2.9.0.post0
+pytz==2026.2
+rich==15.0.0
+six==1.17.0
diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py
new file mode 100644
index 00000000..de05f78e
--- /dev/null
+++ b/benchmarks/test_arrow_perf.py
@@ -0,0 +1,134 @@
+"""Arrow read/write: Table + RecordBatchReader + dictionary sweep. See benchmarks/README.md.
+
+READ aggregates over real columns (arrow answers count(*) from metadata); WRITE drains the lazy reader.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+import pytest
+from _scale import scaled
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+N = scaled(500_000)  # row count shared by every fixture and WRITE query in this module
+DICT_UNIQUE = [2, 1_000, 50_000]  # UNIQUE-value counts (cardinality sweep), not row counts -> NOT scaled
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"  # BIGINT + DOUBLE columns, N rows
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"  # one string column, N rows
+
+# informational: every bench here is engine-parallel or library/streaming dominated. READ = engine aggregate
+# dominates; WRITE (to_arrow/pl) re-runs the query GIL-released. Would trip on engine/submodule bumps, not binding.
+pytestmark = pytest.mark.informational
+
+
+@pytest.fixture(scope="module")
+def arrow_numeric() -> pa.Table:
+    return pa.table(  # built once per module; N rows
+        {
+            "a": pa.array(range(N), type=pa.int64()),  # 0..N-1 as int64
+            "b": pa.array([i * 1.5 for i in range(N)], type=pa.float64()),  # i * 1.5 as float64
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def arrow_string() -> pa.Table:
+    return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())})  # N distinct strings
+
+
+@pytest.fixture(scope="module")
+def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]:
+    # (schema, batches) of arrow_numeric; batches are immutable/re-readable, so each round can build a fresh reader
+    return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)  # batches of <= 50_000 rows
+
+
+@pytest.fixture(scope="module")
+def arrow_dict_tables() -> dict[int, pa.Table]:
+    # deterministic indices (i % U) so the instruction count is reproducible (no PRNG)
+    tables = {}
+    for u in DICT_UNIQUE:
+        uniques = pa.array([f"category_value_{i}" for i in range(u)], type=pa.string())
+        idx = pa.array(np.arange(N, dtype="int32") % u, type=pa.int32())
+        tables[u] = pa.table({"c": pa.DictionaryArray.from_arrays(idx, uniques)})
+    return tables
+
+
+# READ: arrow -> duckdb. sum/length force a full scan.
+
+
+def test_read_arrow_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table
+) -> None:
+    con.register("t_num", arrow_numeric)  # expose the Table to SQL under the name t_num
+    con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())  # measured region
+
+
+def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None:
+    con.register("t_str", arrow_string)  # expose the Table to SQL under the name t_str
+    con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())  # measured region
+
+
+def test_read_arrow_reader_numeric(
+    benchmark: BenchmarkFixture,
+    con: duckdb.DuckDBPyConnection,
+    arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]],
+) -> None:
+    # same factory as the Table read, but STREAMING: a fresh reader per round, drained by the engine
+    schema, batches = arrow_numeric_batches
+
+    def run() -> list:
+        reader = pa.RecordBatchReader.from_batches(schema, iter(batches))  # one-shot stream over the batches
+        con.register("t_rdr", reader)  # (re)bind t_rdr to this round's reader
+        return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall()
+
+    run()  # warm
+    benchmark(run)  # measured: reader construction + register + aggregate
+
+
+@pytest.mark.parametrize("unique", DICT_UNIQUE)
+def test_read_arrow_dictionary(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int
+) -> None:
+    # per-value dictionary DECODE cost slopes with the unique count (mirrors core test_arrow_dictionaries_scan)
+    con.register("t_dict", arrow_dict_tables[unique])  # the table built for this cardinality
+    con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall())  # measured region
+
+
+# WRITE: duckdb -> arrow, consumer fully materializes / drains the stream.
+
+
+def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table())  # measured: numeric query -> arrow table
+
+
+def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table())  # measured: string query -> arrow table
+
+
+def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    def run() -> int:
+        reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000)  # 100_000: presumably the batch size in rows
+        rows = 0
+        for batch in reader:  # drain the lazy stream so duckdb produces every batch
+            rows += batch.num_rows
+        return rows  # total rows drained
+
+    benchmark(run)  # measured: query + full drain of the reader
+
+
+def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_NUM).pl())  # measured: numeric query -> result via .pl()
+
+
+def test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_STR).pl())  # measured: string query -> result via .pl()
diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py
new file mode 100644
index 00000000..751c6cf5
--- /dev/null
+++ b/benchmarks/test_cardinality_perf.py
@@ -0,0 +1,64 @@
+"""Result-cardinality (rows-to-Python) sweep via LIMIT n, no ORDER BY. See benchmarks/README.md.
+
+`SELECT * FROM src LIMIT n` early-stops the scan, so per-row conversion dominates and the slope is monotone in n.
+A steeper slope on one build is a per-row conversion regression. n=100 is overhead, n=100_000 is throughput.
+(An ORDER BY version was dropped: the top-N sort swamped the signal.)
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+import duckdb
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from pytest_codspeed import BenchmarkFixture
+
+# scale the source rows AND the top-N by the same factor, keeping small-N points fixed and SRC_ROWS >= max(LIMITS).
+SRC_ROWS = scaled(200_000)  # rows in the `src` table created by the module-scoped `con` fixture
+LIMITS = [100, 1_000, 10_000, scaled(100_000)]  # NOTE(review): can collide with 10_000 once scaled down - confirm
+
+
+@pytest.fixture(scope="module")
+def con() -> Iterator[duckdb.DuckDBPyConnection]:
+    # source materialized ONCE (module-scoped) and identical across the n sweep; per-test build would add noise
+    c = duckdb.connect(config={"threads": 1})
+    c.execute(
+        "CREATE TABLE src AS "
+        f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)"
+    )
+    yield c
+    c.close()
+
+
+def _query(n: int) -> str:
+    return f"SELECT a, b, s FROM src LIMIT {n}"  # no ORDER BY: the LIMIT early-stops the scan
+
+
+@pytest.mark.gate  # fetchall materializes n rows -> binding-dominated; small-n end is the noise-free gate
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    q = _query(n)  # built outside the lambda, so string formatting is not measured
+    con.execute(q).fetchall()  # warm
+    benchmark(lambda: con.execute(q).fetchall())  # measured region
+
+
+@pytest.mark.gate  # df() materializes n rows to numpy columns -> binding-dominated
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    q = _query(n)  # built outside the lambda, so string formatting is not measured
+    con.sql(q).df()  # warm
+    benchmark(lambda: con.sql(q).df())  # measured region
+
+
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated
+@pytest.mark.parametrize("n", LIMITS)
+def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None:
+    q = _query(n)  # built outside the lambda, so string formatting is not measured
+    con.sql(q).to_arrow_table()  # warm
+    benchmark(lambda: con.sql(q).to_arrow_table())  # measured region
diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py
new file mode 100644
index 00000000..c55b0274
--- /dev/null
+++ b/benchmarks/test_concurrency_perf.py
@@ -0,0 +1,111 @@
+"""Concurrency / GIL pressure across thread counts. Walltime-only, never gated. See benchmarks/README.md.
+
+The ONE dimension the single-threaded rest of the suite cannot see: Python objects threading through PARALLEL
+core execution. Primary signal is LOCAL WALLTIME:
+  * scan benches  -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup.
+  * native UDF    -> ~flat scaling = the GIL tax on per-row Python calls.
+  * arrow UDF     -> observed NEGATIVE scaling (per-chunk convert + GIL contention).
+
+Under Callgrind threads are serialized, so wall-clock contention is invisible there and the CI sweep excludes this
+module; a deterministic count would still capture per-batch Produce GIL calls + UDF dispatch. Never gated either way.
+
+GOTCHA: a SINGLE-BATCH arrow table does NOT parallelize (one batch = one serial scan unit). The arrow scan bench
+MUST use a MULTI-BATCH table AND a CPU-heavy aggregate (a cheap sum is bandwidth-bound and won't parallelize).
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import duckdb
+from duckdb.sqltypes import BIGINT
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+pa = pytest.importorskip("pyarrow")
+pc = pytest.importorskip("pyarrow.compute")
+import numpy as np  # noqa: E402  (after importorskip, matching the suite convention)
+import pandas as pd  # noqa: E402
+
+pytestmark = pytest.mark.informational
+
+N_SCAN = 1_000_000  # rows in the arrow / pandas scan inputs (fixed, not routed through scaled())
+BATCH = 20_000  # -> 50 record batches; MULTI-BATCH required for the arrow scan to parallelize (see GOTCHA)
+N_UDF_NATIVE = 200_000  # native UDF = one Python call per row; keep modest (Callgrind instruments every call)
+N_UDF_ARROW = 1_000_000  # arrow UDF = one call per chunk (vectorized)
+THREADS = [1, 4, 8]  # engine `threads` settings every bench below is parametrized over
+
+# CPU-heavy aggregate so the parallel scan engages worker threads. The binding signal is the per-batch Produce
+# GIL handoff.
+HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)"
+
+
+@pytest.fixture(scope="module")
+def arrow_multibatch() -> pa.Table:
+    a = pa.array(np.arange(N_SCAN), type=pa.int64())  # 0..N_SCAN-1 as int64
+    b = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64())  # i * 1.5 as float64
+    return pa.Table.from_batches(pa.table({"a": a, "b": b}).to_batches(max_chunksize=BATCH))  # BATCH-row chunks
+
+
+@pytest.fixture(scope="module")
+def pandas_frame() -> pd.DataFrame:
+    return pd.DataFrame({"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5})  # N_SCAN rows
+
+
+# Parallel SCAN: arrow batches / pandas chunks pulled through the binding by engine worker threads; the scan
+# Produce acquires/releases the GIL per batch across threads.
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None:
+    con = duckdb.connect(config={"threads": threads})  # own connection: the thread count is the swept variable
+    try:
+        con.register("t", arrow_multibatch)
+        q = f"SELECT sum({HEAVY}) FROM t"  # CPU-heavy aggregate over the multi-batch table
+        con.execute(q).fetchall()  # warm
+        benchmark(lambda: con.execute(q).fetchall())  # measured region
+    finally:
+        con.close()  # runs even if the bench raises
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None:
+    con = duckdb.connect(config={"threads": threads})  # own connection: the thread count is the swept variable
+    try:
+        con.register("t", pandas_frame)
+        q = f"SELECT sum({HEAVY}) FROM t"  # CPU-heavy aggregate over the pandas frame
+        con.execute(q).fetchall()  # warm
+        benchmark(lambda: con.execute(q).fetchall())  # measured region
+    finally:
+        con.close()  # runs even if the bench raises
+
+
+# Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF
+# from multiple worker threads. Native = per-row call under the GIL (GIL tax); arrow = per-chunk convert.
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
+    con = duckdb.connect(config={"threads": threads})  # own connection: the thread count is the swept variable
+    try:
+        con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)")  # materialized -> parallel scan
+        con.create_function("pyf", lambda x: (x * 2 + 1) % 97, [BIGINT], BIGINT)  # called once per row
+        con.execute("SELECT sum(pyf(a)) FROM t").fetchall()  # warm
+        benchmark(lambda: con.execute("SELECT sum(pyf(a)) FROM t").fetchall())  # measured region
+    finally:
+        con.close()  # runs even if the bench raises
+
+
+@pytest.mark.parametrize("threads", THREADS)
+def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None:
+    con = duckdb.connect(config={"threads": threads})  # own connection: the thread count is the swept variable
+    try:
+        con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)")  # materialized -> parallel scan
+        con.create_function("af", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")  # called once per chunk
+        con.execute("SELECT sum(af(a)) FROM t").fetchall()  # warm
+        benchmark(lambda: con.execute("SELECT sum(af(a)) FROM t").fetchall())  # measured region
+    finally:
+        con.close()  # runs even if the bench raises
diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py
new file mode 100644
index 00000000..faee4de4
--- /dev/null
+++ b/benchmarks/test_engine_control_perf.py
@@ -0,0 +1,50 @@
+"""Pure-engine floor (no Python egress): the binding-fraction reference. See benchmarks/README.md.
+
+`SELECT sum(...) FROM range(N)` aggregates to one scalar, so the fetch is negligible: these measure SQL compile +
+the engine aggregate with ~zero per-row egress. Comparing a produce/fetch bench against the matching-N floor here
+quantifies how much of its cost is binding vs engine. Informational (they measure the engine), never gated.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+pytestmark = pytest.mark.informational
+
+# N matched to the benches these floor, and routed through scaled() with the SAME base N, so the floor and its
+# bench stay at an identical scaled N and the binding fraction stays valid. The 2048 small-N floor is NOT scaled.
+Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)"  # small-N gate floor (compile-dominated)
+Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)"  # types-matrix numeric-df floor
+Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)"  # fetch / native-UDF floor
+Q_2C_500K = (  # produce/ingest floor
+    f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))"
+)
+
+
+def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.execute(query).fetchall()  # warm: one run outside the measured region
+    benchmark(lambda: con.execute(query).fetchall())  # measured: execute + fetch the aggregate result
+
+
+def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench(benchmark, con, Q_1C_SMALL)  # fixed range(2048): not scaled
+
+
+def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench(benchmark, con, Q_1C_100K)  # one sum over scaled(100_000) rows
+
+
+def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench(benchmark, con, Q_1C_200K)  # one sum over scaled(200_000) rows
+
+
+def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench(benchmark, con, Q_2C_500K)  # two sums over scaled(500_000) rows
diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py
new file mode 100644
index 00000000..1aa5f4fe
--- /dev/null
+++ b/benchmarks/test_fetch_perf.py
@@ -0,0 +1,120 @@
+"""OUT-row fetch: fetchall, fetchone/fetchmany loops, wide/expensive scalar types. See benchmarks/README.md."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+# gate: OUT-row fetch materializes every row to Python (binding-dominated); the range() scan is cheap.
+pytestmark = pytest.mark.gate
+
+# scaled() shrinks N under BENCH_SCALE in the CI sweep; full N locally. The range(2048) *_gate probes are the
+# compile+fetch fixed-cost baseline and are deliberately NOT scaled.
+N_ROW = scaled(200_000)  # numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128)
+N_STR = scaled(100_000)  # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops
+N_NEST = scaled(50_000)  # heterogeneous scalar/list/struct row
+
+
+def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.execute(query).fetchall()  # warm the engine before measuring
+    benchmark(lambda: con.execute(query).fetchall())  # measured: execute + fetchall of every row
+
+
+def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)")  # one BIGINT column
+
+
+def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)")  # INTEGER in 0..99
+
+
+def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)")  # one DOUBLE column
+
+
+def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:  # two BIGINT columns
+    _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)")
+
+
+def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)")  # one string column
+
+
+def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    query = (  # BIGINT + string + list + struct columns in every row
+        "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, "
+        f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)"
+    )
+    _bench_fetchall(benchmark, con, query)
+
+
+def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
+
+    def run() -> None:
+        rel = con.execute(query)
+        while rel.fetchone() is not None:  # one fetchone() call per row until the result is exhausted
+            pass  # rows are discarded; only the fetch loop is exercised
+
+    benchmark(run)  # NOTE(review): no explicit warm-up call, unlike the fetchall benches - confirm intended
+
+
+# small-N *_gate variants: at range(2048) the measured region is ~60% SQL compile + engine, ~40% fetch, so these
+# catch a fixed-cost regression (not a pure per-row one). Plus expensive scalar types (timestamptz pytz-per-row,
+# blob, null-heavy), a heterogeneous per-cell-dispatch row, and the batched fetchmany loop.
+
+
+def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)")  # fixed 2048 rows (not scaled)
+
+
+def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)")  # 2048 rows
+
+
+def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:  # 50% NULL rows
+    _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)")
+
+
+def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(  # one TIMESTAMPTZ per row, spaced one second apart
+        benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)"
+    )
+
+
+def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)")  # 28-digit DECIMAL
+
+
+def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)")  # one BLOB column
+
+
+def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # heterogeneous row: per-cell type dispatch in the Fetchone loop (distinct branch/cache profile from the
+    # homogeneous single-type columns above)
+    query = (
+        "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, "
+        f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)"
+    )
+    _bench_fetchall(benchmark, con, query)
+
+
+def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)"
+
+    def run() -> None:
+        rel = con.execute(query)
+        while True:
+            rows = rel.fetchmany(10_000)  # request 10_000 rows per call
+            if not rows:  # an empty batch ends the loop
+                break
+
+    benchmark(run)  # NOTE(review): no explicit warm-up call, unlike the fetchall benches - confirm intended
diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py
new file mode 100644
index 00000000..3478ea1c
--- /dev/null
+++ b/benchmarks/test_ingest_native_perf.py
@@ -0,0 +1,85 @@
+"""Native Python-object ingest: values() list/tuple/dict, executemany. See benchmarks/README.md.
+
+Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one
+list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+EXECMANY_N = scaled(20_000)  # executemany re-binds + executes per row, keep moderate
+WIDE_N = scaled(10_000)  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
+
+# gate: native ingest eagerly transforms every cell / re-binds per row; the engine side is negligible.
+pytestmark = pytest.mark.gate
+
+
+@pytest.fixture(scope="module")
+def rows_3col() -> list[tuple[int, float, str]]:
+    return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]  # one parameter tuple per row
+
+
+@pytest.fixture(scope="module")
+def scalars_wide() -> list[int]:
+    return list(range(WIDE_N))  # WIDE_N ints: a single wide row once passed to values()
+
+
+@pytest.fixture(scope="module")
+def tuples_wide() -> list[tuple[int, int, int]]:
+    return [(i, i + 1, i + 2) for i in range(WIDE_N)]  # WIDE_N 3-tuples of ints
+
+
+@pytest.fixture(scope="module")
+def dicts_wide() -> list[dict[str, int | str]]:
+    return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]  # WIDE_N dicts (int, int, str values)
+
+
+# executemany: bind + execute one parameter set per row, into a real table (CREATE OR REPLACE so it doesn't grow).
+
+
+def test_ingest_executemany_3col(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]]
+) -> None:
+    con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
+    con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm
+
+    def run() -> None:
+        con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")  # reset: t must not grow
+        con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)
+
+    benchmark(run)  # measured: table reset + executemany over every row
+
+
+# values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip.
+
+
+def test_ingest_values_scalars(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int]
+) -> None:
+    con.values(scalars_wide).fetchall()  # warm
+    benchmark(lambda: con.values(scalars_wide).fetchall())  # measured: values() + fetchall round-trip
+
+
+def test_ingest_values_tuples(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]]
+) -> None:
+    # each tuple cell -> LIST value (TransformPythonValue recursion)
+    con.values(tuples_wide).fetchall()  # warm
+    benchmark(lambda: con.values(tuples_wide).fetchall())  # measured: values() + fetchall round-trip
+
+
+def test_ingest_values_dicts(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]]
+) -> None:
+    # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion)
+    con.values(dicts_wide).fetchall()  # warm
+    benchmark(lambda: con.values(dicts_wide).fetchall())  # measured: values() + fetchall round-trip
diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py
new file mode 100644
index 00000000..61244d2c
--- /dev/null
+++ b/benchmarks/test_ingest_numpy_perf.py
@@ -0,0 +1,111 @@
+"""numpy ingest: object-string scan, NaN-to-NULL, masked scan, analyzer bind. See benchmarks/README.md.
+
+Gotchas: the object-string bench MUST mix ASCII + non-ASCII + a null or it misses the transcode ladder (see
+README traps); analyzer bind is the one place count(*) is correct (cost is at bind, not scan).
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+import numpy as np
+import pandas as pd
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+# scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below.
+N = scaled(500_000)
+ANALYZER_N = scaled(200_000)
+
+NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}  # dict of numpy arrays
+
+# mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only)
+_MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None]
+_MIXED_STRINGS = [None if _MIXED[i % 5] is None else f"{_MIXED[i % 5]}{i}" for i in range(N)]
+
+# mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind
+_MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)]
+
+# READ (sum over a registered frame) is engine-aggregate dominated -> informational. The analyzer BIND (count(*),
+# no scan) is a pure per-bind binding cost -> gate.
+
+
+@pytest.fixture(scope="module")
+def df_double_with_nan() -> pd.DataFrame:
+    a = np.arange(N, dtype="float64") * 1.5
+    a[::10] = np.nan  # every 10th value is a real NaN -> NaN-to-NULL conversion loop
+    return pd.DataFrame({"a": a})  # single float64 column
+
+
+@pytest.fixture(scope="module")
+def df_object_string_mixed() -> pd.DataFrame:
+    return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)})  # object-dtype column incl. None entries
+
+
+@pytest.fixture(scope="module")
+def df_masked_int() -> pd.DataFrame:
+    # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask
+    arr = pd.array(np.arange(N), dtype="Int64")
+    arr[::10] = pd.NA  # every 10th entry is missing
+    return pd.DataFrame({"a": arr})  # single nullable-Int64 column
+
+
+@pytest.fixture(scope="module")
+def df_object_mixed_types() -> pd.DataFrame:
+    return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)})  # int / float / str values in one column
+
+
+# READ: numpy -> duckdb. sum/length force a full scan.
+
+
+@pytest.mark.informational
+def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.register("npdict", NPDICT)  # register explicitly, not via replacement-scan frame inspection
+    con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall())  # measured region
+
+
+@pytest.mark.informational
+def test_read_numpy_double_with_nan(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame
+) -> None:
+    con.register("t", df_double_with_nan)  # expose the frame to SQL under the name t
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())  # measured region
+
+
+@pytest.mark.informational
+def test_read_numpy_masked_int(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame
+) -> None:
+    con.register("t", df_masked_int)  # expose the frame to SQL under the name t
+    con.execute("SELECT sum(a) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall())  # measured region
+
+
+@pytest.mark.informational
+def test_read_numpy_object_string_mixed(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame
+) -> None:
+    con.register("t", df_object_string_mixed)  # expose the frame to SQL under the name t
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())  # measured region
+
+
+# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY: the cost is at bind, so forcing a scan would
+# drown the per-bind signal. Re-binds the object column each call.
+
+
+@pytest.mark.gate
+def test_bind_analyzer_object(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame
+) -> None:
+    con.register("t", df_object_mixed_types)  # registered once, outside the measured region
+    con.execute("SELECT count(*) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall())  # measured region
diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py
new file mode 100644
index 00000000..4edc78dc
--- /dev/null
+++ b/benchmarks/test_pandas_perf.py
@@ -0,0 +1,133 @@
+"""pandas read/write, numpy-backed vs arrow-backed frames. See benchmarks/README.md.
+
+Column backing selects the path: numpy-backed -> NumpyArray scan; arrow-backed (ArrowDtype) -> zero-copy arrow.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pyarrow as pa
+import pytest
+from _scale import scaled
+
+import numpy as np
+import pandas as pd
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+N = scaled(500_000)
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+_STRINGS = [f"str_value_{i}" for i in range(N)]
+
+# READ (sum over a registered frame) is engine-aggregate dominated -> informational. Only the NUMPY-backed df()
+# WRITE is binding-dominated -> gate; the arrow-backed WRITE goes through pyarrow's to_pandas -> informational.
+
+
+@pytest.fixture(scope="module")
+def df_numpy_numeric() -> pd.DataFrame:
+    return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5})
+
+
+@pytest.fixture(scope="module")
+def df_numpy_string() -> pd.DataFrame:
+    # explicit object dtype -> the reworked numpy-backed object-string / analyzer path
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)})
+
+
+@pytest.fixture(scope="module")
+def df_arrow_numeric() -> pd.DataFrame:
+    return pd.DataFrame(
+        {
+            "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())),
+            "b": pd.array(np.arange(N) * 1.5, dtype=pd.ArrowDtype(pa.float64())),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def df_arrow_string() -> pd.DataFrame:
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))})
+
+
+# READ: pandas -> duckdb. sum/length force a full scan.
+
+
+@pytest.mark.informational
+def test_read_pandas_numpy_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame
+) -> None:
+    con.register("t", df_numpy_numeric)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
+
+
+@pytest.mark.informational
+def test_read_pandas_numpy_string(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame
+) -> None:
+    con.register("t", df_numpy_string)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
+
+
+@pytest.mark.informational
+def test_read_pandas_arrow_numeric(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame
+) -> None:
+    con.register("t", df_arrow_numeric)
+    con.execute("SELECT sum(a), sum(b) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall())
+
+
+@pytest.mark.informational
+def test_read_pandas_arrow_string(
+    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame
+) -> None:
+    con.register("t", df_arrow_string)
+    con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()  # warm
+    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall())
+
+
+# WRITE: duckdb -> pandas. df() is the reworked numpy-backed path; the arrow-backed frame goes via
+# duckdb-arrow + pyarrow.to_pandas(ArrowDtype). Both eagerly materialize the whole frame.
+
+
+@pytest.mark.gate
+def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_NUM).df())
+
+
+@pytest.mark.gate
+def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_STR).df())
+
+
+@pytest.mark.gate
+def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # REAL nulls -> the masked_array build + masked-to-pd.NA rewrite the cutover reworked (see README traps)
+    q = (
+        "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
+        f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
+    )
+    benchmark(lambda: con.sql(q).df())
+
+
+@pytest.mark.gate
+def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
+    benchmark(lambda: con.sql(q).df())
+
+
+@pytest.mark.informational  # to_pandas() half is pyarrow library code
+def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
+
+
+@pytest.mark.informational  # to_pandas() half is pyarrow library code
+def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
new file mode 100644
index 00000000..f7a103da
--- /dev/null
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -0,0 +1,148 @@
+"""Columnar produce: df(), fetchnumpy(), fetch_df_chunk(), per type, null vs no-null. See benchmarks/README.md.
+
+Covers the with-NULLS masked_array branch, datetime, and wide-internal types (hugeint/uuid/decimal128).
+"""
+
+from __future__ import annotations
+
+import gc
+import sys
+import tracemalloc
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+import duckdb
+import numpy as np  # noqa: F401  (pinned identically A/B so the env matches the other modules)
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+N = scaled(500_000)
+TYPE_N = scaled(200_000)  # wide-internal types (hugeint/uuid/decimal128) are heavier per cell
+
+Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+Q_NUM_NULLS = (
+    "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
+    f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
+)
+Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+Q_TS = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
+Q_HUGEINT = f"SELECT (i::HUGEINT * 1000000000000) AS h FROM range({TYPE_N}) t(i)"
+Q_UUID = f"SELECT gen_random_uuid() AS u FROM range({TYPE_N}) t(i)"
+Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)"
+
+
+# gate: df()/fetchnumpy() fully materialize numpy-backed columns (ArrayWrapper fill, binding-dominated).
+def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.sql(query).df()  # warm
+    benchmark(lambda: con.sql(query).df())
+
+
+def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.sql(query).fetchnumpy()  # warm
+    benchmark(lambda: con.sql(query).fetchnumpy())
+
+
+# df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types.
+
+
+@pytest.mark.informational
+def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_NUM)
+
+
+@pytest.mark.gate
+def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_NUM_NULLS)  # REAL nulls -> masked_array branch (see README traps)
+
+
+@pytest.mark.gate
+def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_STR)
+
+
+@pytest.mark.gate
+def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_TS)
+
+
+@pytest.mark.gate
+def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_HUGEINT)
+
+
+@pytest.mark.gate
+def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_UUID)
+
+
+@pytest.mark.gate
+def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_df(benchmark, con, Q_DEC128)
+
+
+# fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap.
+
+
+@pytest.mark.informational
+def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_numpy(benchmark, con, Q_NUM)
+
+
+@pytest.mark.gate
+def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_numpy(benchmark, con, Q_NUM_NULLS)
+
+
+@pytest.mark.informational  # per-chunk streaming drain (GIL-per-chunk), not gated
+def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    def run() -> int:
+        rel = con.sql(Q_NUM)
+        rows = 0
+        while True:
+            chunk = rel.fetch_df_chunk()
+            if len(chunk) == 0:
+                break
+            rows += len(chunk)
+        return rows
+
+    con.sql(Q_NUM).fetch_df_chunk()  # warm
+    benchmark(run)
+
+
+@pytest.mark.informational  # torch is local-only (importorskip); torch lib work dilutes it
+def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    pytest.importorskip("torch")
+    q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)"
+    con.sql(q).torch()  # warm
+    benchmark(lambda: con.sql(q).torch())
+
+
+# Memory guard (secondary signal, not a codspeed benchmark; the Callgrind sweep tracks neither memory nor allocs).
+# tracemalloc captures the PEAK Python-tracked allocation of ONE df()-with-nulls call. reset_peak() runs AFTER
+# the warm so the warm does not set a high-water mark that swallows the measured call. tracemalloc reports bytes
+# on every platform (portable to Linux CI). CAVEAT: it only sees Python-level allocs, not the C numpy buffers, so
+# it catches a gross Python-object blowup (masked-to-pd.NA gone wrong) but is not a total-RSS gate; that is
+# codspeed memory mode's job (deferred, see PLAN.md).
+
+
+def test_mem_df_with_nulls() -> None:
+    """Guard the Python-tracked peak allocation of one df()-with-nulls call (tracemalloc, not total RSS)."""
+    con = duckdb.connect(config={"threads": 1})
+    try:
+        tracemalloc.start()
+        con.sql(Q_NUM_NULLS).df()  # warm: populate one-time import / type caches (result dropped at once)
+        gc.collect()
+        tracemalloc.reset_peak()  # discount the warm's transient peak BEFORE the measured call
+        out = con.sql(Q_NUM_NULLS).df()
+        _current, peak = tracemalloc.get_traced_memory()
+        del out
+    finally:
+        tracemalloc.stop()  # in finally: a failing df() must not leave tracing enabled for later tests
+        con.close()
+    print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr)
+    # a 500k x 2-col masked df is a few MB of Python-tracked allocs; a gross blowup is tens+ MB. 100 MB ceiling
+    # catches that without flaking.
+    assert peak < 100_000_000
diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py
new file mode 100644
index 00000000..bd494c2e
--- /dev/null
+++ b/benchmarks/test_relational_construction_perf.py
@@ -0,0 +1,31 @@
+"""Relational-API expression construction. Informational, out of the binding gate. See benchmarks/README.md.
+
+This is expression *construction* (ColumnExpression / ConstantExpression / operator overloads), not the
+binding-pressure surface the rest of the suite targets. Kept because it carries a real signal (a measured ~35%
+construction delta at the cutover), but never part of the gate.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import duckdb
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+pytestmark = pytest.mark.informational
+
+
+def test_expr_many(benchmark: BenchmarkFixture) -> None:
+    def run() -> int:
+        out = []
+        for i in range(2000):
+            col = duckdb.ColumnExpression(f"col_{i}")
+            const = duckdb.ConstantExpression(i)
+            out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}"))
+        return len(out)
+
+    benchmark(run)
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
new file mode 100644
index 00000000..9cc8d6b3
--- /dev/null
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -0,0 +1,72 @@
+"""type x direction produce matrix: fetchall / df / to_arrow per logical type. See benchmarks/README.md.
+
+One logical type per column across three directions, so a regression localizes to (type, direction). Includes the
+wide types the narrow-numeric benches miss: hugeint, uuid, decimal128, long varchar.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+N = scaled(100_000)
+
+# one logical type per column; long-varchar is intentionally > 64 chars
+TYPE_EXPR = {
+    "int64": "i::BIGINT",
+    "double": "(i * 1.5)::DOUBLE",
+    "varchar_short": "('str_' || i)",
+    "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))",
+    "date": "DATE '2020-01-01' + (i % 3650)::INTEGER",
+    "bool": "(i % 2 = 0)",
+    "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)",
+    "decimal64": "((i::DECIMAL(18, 3)) / 1000)",
+    "decimal128": "((i * 1.5)::DECIMAL(28, 6))",
+    "hugeint": "(i::HUGEINT * 1000000000000)",
+    "uuid": "gen_random_uuid()",
+    "struct": "{'a': i, 'b': i + 1}",
+    "list": "[i, i + 1, i + 2]",
+}
+TYPES = list(TYPE_EXPR)
+
+# OUT-col bool/int64 are engine-diluted below the Option-B cutoff (binding_fraction < 0.25, see baseline.json): the
+# numpy column fill is trivial next to the engine scan, so they are informational while the other types stay gate.
+# OUT-row is unaffected (fetchall builds a Python object per cell, binding-dominated for every type).
+_OUT_COL_DILUTED = {"bool", "int64"}
+_OUT_COL_PARAMS = [
+    pytest.param(t, marks=pytest.mark.informational if t in _OUT_COL_DILUTED else pytest.mark.gate) for t in TYPES
+]
+
+
+def _query(type_name: str) -> str:
+    return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)"
+
+
+@pytest.mark.gate  # OUT-row: binding-dominated per-type dispatch
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    q = _query(type_name)
+    con.execute(q).fetchall()  # warm
+    benchmark(lambda: con.execute(q).fetchall())
+
+
+@pytest.mark.parametrize("type_name", _OUT_COL_PARAMS)  # OUT-col: ArrayWrapper fill; gate per type except diluted ones
+def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    q = _query(type_name)
+    con.sql(q).df()  # warm
+    benchmark(lambda: con.sql(q).df())
+
+
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel, noisy) -> not gated
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    q = _query(type_name)
+    con.sql(q).to_arrow_table()  # warm
+    benchmark(lambda: con.sql(q).to_arrow_table())
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
new file mode 100644
index 00000000..0f381ca7
--- /dev/null
+++ b/benchmarks/test_udf_perf.py
@@ -0,0 +1,103 @@
+"""Python UDFs: native scalar (one call per row) and vectorized arrow (one call per chunk). See benchmarks/README.md.
+
+Each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+pa = pytest.importorskip("pyarrow")
+pc = pytest.importorskip("pyarrow.compute")
+
+NATIVE_N = scaled(200_000)  # native = one Python call per row, keep moderate
+ARROW_N = scaled(1_000_000)  # arrow = one Python call per chunk (vectorized), can be large
+
+
+def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    con.execute(query).fetchall()  # warm the engine + import caches
+    benchmark(lambda: con.execute(query).fetchall())
+
+
+# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). The Python
+# call dominates; the sum() consume is negligible -> gate.
+
+
+@pytest.mark.gate
+def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
+    _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
+    _bench(
+        benchmark,
+        con,
+        f"SELECT sum(length(up(s))) FROM (SELECT ('str_value_' || i) AS s FROM range({NATIVE_N}) t(i))",
+    )
+
+
+@pytest.mark.gate
+def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # DEFAULT null handling short-circuits NULL inputs (SetNull) WITHOUT calling the UDF: measures the validity
+    # short-circuit, so the UDF only ever sees non-NULL rows.
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({NATIVE_N}) t(i))",
+    )
+
+
+# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. pyarrow lib
+# work + per-chunk conversion + engine work over ARROW_N (1M) rows -> informational.
+
+
+@pytest.mark.informational
+def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
+
+
+@pytest.mark.informational
+def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
+
+
+@pytest.mark.informational
+def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # DEFAULT null handling on the vectorized path compacts the validity (selvec) before the call and reconstructs
+    # the result vector after: this measures the selvec compaction/reconstruction cost.
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(arrow_add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({ARROW_N}) t(i))",
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 53cfa616..12cad096 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -250,6 +250,7 @@ test = [ # dependencies used for running tests
     "pytest-timeout",
     "pytest-timestamper",
     "pytest-xdist", # parallel test execution (-n auto); without this `uv sync --reinstall` prunes a manual install
+    "pytest_codspeed",
     "coverage",
     "gcovr; sys_platform != 'win32' or platform_machine != 'ARM64'",
     "gcsfs; sys_platform != 'win32' or platform_machine != 'ARM64'",
@@ -280,6 +281,20 @@ test = [ # dependencies used for running tests
     "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
     "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
 ]
+bench = [ # Pinned deps for the benchmark suite (see benchmarks/README.md). Minimal, not the heavy `test` group.
+          # Constraints mirror `test` so the lockfile resolves identically; torch/tf are local-only (importorskip).
+    "pytest",
+    "pytest_codspeed",
+    "polars>=1.33.0",
+    "pytz",
+    "numpy<2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version < '3.12'",
+    "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
+    "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
+    "pandas>=3.0.0; python_version > '3.10'",
+    "pandas<3.0.0; python_version < '3.11'",
+    "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+    "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+]
 scripts = [ # dependencies used for running scripts
     "cxxheaderparser",
     "ipython",
@@ -440,6 +455,10 @@ strict = true
     # No need for type hinting in tests
     'ANN001', 'ANN201', 'ANN202'
 ]
+"benchmarks/**.py" = [
+    # benchmarks are test-like: docstrings optional (shared context lives in benchmarks/README.md)
+    'D100', 'D101', 'D102', 'D103', 'D104', 'D105', 'D107',
+]
 "tests/fast/spark/**.py" = [
     "E402"
 ]
diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py
new file mode 100644
index 00000000..1ffd596c
--- /dev/null
+++ b/tests/fast/test_binding_pressure_leak.py
@@ -0,0 +1,109 @@
+"""Sustained-iteration leak guards for the binding object-pinning paths.
+
+CodSpeed measures per-call cost and can't see a refcount imbalance in the object-pinning graph until it OOMs, so
+this plain assertion test runs each pinning path N times and asserts RSS and object growth stay flat. Covers what
+test_relation_dependency_leak.py does not: register/unregister, native + arrow UDF create/run/remove, executemany.
+"""
+
+import gc
+import os
+
+import pytest
+
+import numpy as np
+import pandas as pd
+
+try:
+    import pyarrow as pa
+
+    can_arrow = True
+except ImportError:
+    can_arrow = False
+
+from duckdb.sqltypes import BIGINT
+
+psutil = pytest.importorskip("psutil")
+
+ITERS = 100
+ROWS = 100_000
+_EM_ROWS = [(i, i * 1.5, f"s{i}") for i in range(5_000)]
+
+
+def _rss_gb():
+    return psutil.Process(os.getpid()).memory_info().rss / (10**9)
+
+
+def check_flat(fn, cursor, iters=ITERS, obj_slack=20_000):
+    """Warm `fn` once, call it `iters` times; assert RSS stays < 3x and gc-tracked objects grow < `obj_slack`."""
+    fn(cursor)  # warm one-time caches so they are not counted as growth
+    gc.collect()  # collect before sampling the starting RSS / object count
+    start_rss = _rss_gb()
+    start_obj = len(gc.get_objects())
+    for _ in range(iters):
+        fn(cursor)
+    gc.collect()  # collect again before sampling the end values
+    end_rss = _rss_gb()
+    end_obj = len(gc.get_objects())
+    # RSS ratio bound mirrors test_relation_dependency_leak.py (end RSS must stay under 3x the start)...
+    assert end_rss / 3 < start_rss, f"RSS grew {start_rss:.3f} -> {end_rss:.3f} GB over {iters} iters"
+    # ...plus an object-count bound, which catches a Python-object pin that is too small to move RSS.
+    assert end_obj - start_obj < obj_slack, f"tracked objects grew by {end_obj - start_obj} over {iters} iters"
+
+
+# --------------------------------------------------------------------------- #
+# Pinning paths (one full pin/unpin cycle per call).
+# --------------------------------------------------------------------------- #
+
+
+def register_unregister_arrow(cursor):
+    tbl = pa.table({"a": pa.array(np.arange(ROWS), type=pa.int64())})
+    cursor.register("t_reg", tbl)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def register_unregister_pandas(cursor):
+    df = pd.DataFrame({"a": np.arange(ROWS)})
+    cursor.register("t_reg", df)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def native_udf_cycle(cursor):
+    cursor.create_function("f_leak", lambda x: x + 1, [BIGINT], BIGINT)
+    cursor.execute("SELECT sum(f_leak(i::BIGINT)) FROM range(10000) t(i)").fetchall()
+    cursor.remove_function("f_leak")
+
+
+def arrow_udf_cycle(cursor):
+    import pyarrow.compute as pc
+
+    cursor.create_function("af_leak", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    cursor.execute("SELECT sum(af_leak(i::BIGINT)) FROM range(50000) t(i)").fetchall()
+    cursor.remove_function("af_leak")
+
+
+def executemany_cycle(cursor):
+    cursor.execute("CREATE OR REPLACE TABLE t_em (a BIGINT, b DOUBLE, c VARCHAR)")
+    cursor.executemany("INSERT INTO t_em VALUES (?, ?, ?)", _EM_ROWS)
+
+
+class TestBindingPressureLeak:
+    def test_register_unregister_arrow_leak(self, duckdb_cursor):
+        if not can_arrow:
+            pytest.skip("pyarrow not installed")
+        check_flat(register_unregister_arrow, duckdb_cursor)
+
+    def test_register_unregister_pandas_leak(self, duckdb_cursor):
+        check_flat(register_unregister_pandas, duckdb_cursor)
+
+    def test_native_udf_cycle_leak(self, duckdb_cursor):
+        check_flat(native_udf_cycle, duckdb_cursor)
+
+    def test_arrow_udf_cycle_leak(self, duckdb_cursor):
+        if not can_arrow:
+            pytest.skip("pyarrow not installed")
+        check_flat(arrow_udf_cycle, duckdb_cursor)
+
+    def test_executemany_leak(self, duckdb_cursor):
+        check_flat(executemany_cycle, duckdb_cursor)