duckdb · evertlammerts · Jul 2, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml
@@ -0,0 +1,129 @@
+# Instruction-count (Callgrind) perf-regression gate against a COMMITTED baseline. No CodSpeed account/token/runner:
+# compare_baseline.py parses raw callgrind dumps and diffs each benchmark against benchmarks/baseline.json. Counts
+# are near-deterministic with PYTHONHASHSEED pinned (~0.1% noise), so the 5% gate threshold sits far above it.
+# Details + rationale: benchmarks/README.md and benchmarks/PLAN.md.
+#
+# Triggers: nightly schedule + manual workflow_dispatch (no pull_request/push). A dispatch on a feature branch
+# compares that branch's counts vs the baseline.json committed on it, answering "did my branch regress vs main".
+#
+# Modes (workflow_dispatch input `regen`):
+#   regen=false (default) -> COMPARE + report. Report-only for now (never fails); flip to --enforce once trusted.
+#   regen=true            -> write a fresh baseline.json + upload as an artifact to commit deliberately. Bump
+#                            requirements-bench.txt FIRST (separate commit) if the pins should change.
+#
+# The concurrency module is excluded from the sweep (Callgrind serializes threads, so its signal is meaningless).
+# Memory mode (a second sweep measuring peak RSS on the produce paths) is deferred (see PLAN.md).
+
+name: Benchmarks
+
+on:
+  schedule:
+    - cron: "0 3 * * *" # nightly at 03:00 UTC
+  workflow_dispatch:
+    inputs:
+      regen:
+        description: "Regenerate benchmarks/baseline.json (upload as artifact) instead of comparing"
+        type: boolean
+        default: false
+
+concurrency:
+  group: codspeed-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  benchmarks:
+    runs-on: ubuntu-latest
+    timeout-minutes: 90 # ~25 min sweep at BENCH_SCALE=10 (12-core Linux) + ~10 min cold build; margin for CI
+    permissions:
+      contents: read
+    env:
+      PYTHONHASHSEED: "0" # stable instruction counts for dict/struct paths
+      CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks
+      # shrink the O(rows) benches so the sweep fits under timeout-minutes. Local runs leave this unset -> full N.
+      # Recorded in baseline.json meta.bench_scale; a baseline only compares to a run at the SAME scale.
+      BENCH_SCALE: "10"
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive # the DuckDB engine submodule is needed to build
+          fetch-depth: 0 # setuptools_scm needs history for version detection
+
+      - name: Resolve DuckDB submodule SHA
+        id: duckdb_sha
+        # used for the sccache key AND passed to compare_baseline.py for the engine-bump guard
+        run: echo "sha=$(git rev-parse HEAD:external/duckdb)" >> "$GITHUB_OUTPUT"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.13"
+
+      - name: Install valgrind
+        run: sudo apt-get update && sudo apt-get install -y valgrind
+
+      - name: Cache sccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/sccache
+          key: sccache-codspeed-${{ steps.duckdb_sha.outputs.sha }}
+          restore-keys: sccache-codspeed-
+
+      - name: Install sccache
+        run: |
+          curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \
+            | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache
+
+      - name: Build the extension (release) + pinned benchmark deps
+        env:
+          CMAKE_C_COMPILER_LAUNCHER: sccache
+          CMAKE_CXX_COMPILER_LAUNCHER: sccache
+        run: |
+          # step 1: build deps only (needed for --no-build-isolation), no project
+          uv sync --only-group build --no-install-project -p 3.13
+          # step 2: the frozen bench pins (exact ==), so the only cross-run delta is the binding. MUST precede the
+          # build: numpy>=2.0 is a [build-system].requires (numpy C API headers), which --no-build-isolation does
+          # not auto-install and which is not in the `build` group, so CMake's find_package(... NumPy) fails first.
+          uv pip install -r benchmarks/requirements-bench.txt
+          # step 3: build+install the project (release), no default `dev` group (torch/tensorflow/pyspark). uv pip
+          # install is additive; uv sync here would prune numpy back out before the build and re-break the config.
+          uv pip install --no-build-isolation --no-deps --reinstall -C cmake.build-type=Release .
+
+      - name: Collect gate node-ids
+        # the gate/informational marker split; regen uses it to classify each benchmark.
+        # Must be a literal block (`|`): in a plain multi-line scalar YAML folds the "\"+newline into "\ ", which bash
+        # reads as an escaped-space argument, so pytest gets a bogus " " path and `|| true` hides the failure.
+        run: |
+          uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \
+            | grep '::' > gate_list.txt || true
+
+      - name: Run benchmarks under Callgrind (per-benchmark instruction counts)
+        # ONE sweep over gate+informational EXCEPT the concurrency module (thread-serialized, expensive). Each
+        # benchmark emits a callgrind dump keyed by its uri.
+        # --trace-children=yes: valgrind does not follow exec by default, and `uv run` launches pytest as a separate
+        # program, so without it only the uv launcher (not the Python process) runs under Callgrind.
+        run: |
+          mkdir -p profiles
+          CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \
+            --trace-children=yes --callgrind-out-file="$PWD/profiles/cg.%p.%n" \
+            uv run --no-sync pytest benchmarks/ \
+              --ignore=benchmarks/test_concurrency_perf.py \
+              -m "gate or informational" --codspeed -o addopts= -p no:cacheprovider
+
+      - name: Compare against committed baseline (report-only)
+        if: ${{ !inputs.regen }}
+        # report-only: prints the delta table, never fails the job. Add --enforce once trusted.
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py compare \
+            --profiles profiles --baseline benchmarks/baseline.json \
+            --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
+
+      - name: Regenerate baseline (upload artifact to commit deliberately)
+        if: ${{ inputs.regen }}
+        run: |
+          uv run --no-sync python benchmarks/compare_baseline.py regen \
+            --profiles profiles --out benchmarks/baseline.json --gate-list gate_list.txt \
+            --git-commit "${{ github.sha }}" --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \
+            --pins benchmarks/requirements-bench.txt
+
+      - name: Upload regenerated baseline
+        if: ${{ inputs.regen }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: baseline-update
+          path: benchmarks/baseline.json
diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml
@@ -30,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python: [ cp311, cp314 ]
+        python: [ cp314 ]
         platform:
           - { os: windows-2022,     arch: amd64,      cibw_system: win }
           - { os: windows-11-arm,   arch: ARM64,      cibw_system: win }

diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md
@@ -0,0 +1,90 @@
+# Benchmark suite plan
+
+Design rationale for the binding micro-benchmarks. The suite is implemented in `benchmarks/`; CI lives in
+`../.github/workflows/codspeed.yml`; conventions, markers, and the two data-pattern traps are in
+[README.md](README.md).
+
+Priority: **P0** = known-regression or cutover-reworked path (narrow-numeric common case); **P1** = high-traffic
+conversion or per-element Python work; **P2** = correctness-relevant, lower-traffic or engine-dominated.
+
+## Scenarios
+
+PRODUCE (duckdb to Python) is the highest regression risk: `Fetchone` builds a `TupleBuilder` per row and calls
+`FromValue` per cell (O(rows x cols), the shape of the historical ~15% fetchall regression).
+
+- **OUT-row** (`test_fetch_perf`, `test_types_roundtrip_perf`): fetchall / fetchone / fetchmany per type. P0
+  narrow numeric; P1 varchar, list, struct, and the expensive per-row types (decimal `Decimal()`, timestamptz
+  pytz, hugeint string round-trip, uuid). Small-N `*_gate` probes isolate the compile+fetch fixed cost.
+- **OUT-col** (`test_produce_numpy_perf`): df() / fetchnumpy() reworked columnar path. P0 numeric no-null vs
+  REAL-null (the masked_array branch); plus string, timestamp, and wide-internal (hugeint/uuid/decimal128).
+- **OUT-arrow / polars** (`test_arrow_perf`): to_arrow_table / reader / pl(). Informational (engine-parallel,
+  GIL-released).
+- **Cardinality** (`test_cardinality_perf`): a LIMIT-n sweep giving a clean per-row conversion slope.
+
+INGEST (Python to duckdb):
+
+- **numpy / pandas** (`test_ingest_numpy_perf`, `test_pandas_perf`): numpy-backed scan (NaN-to-NULL, masked),
+  object-string transcode ladder, arrow-backed zero-copy, and the per-bind PandasAnalyzer.
+- **arrow** (`test_arrow_perf`): Table + RecordBatchReader + dictionary sweep.
+- **native** (`test_ingest_native_perf`): values() list/tuple/dict per-cell TransformPythonValue, executemany.
+
+UDF (`test_udf_perf`, zero coverage before this suite): native scalar per-row (P0, the biggest untested per-call
+path) and vectorized arrow per-chunk.
+
+## Type x direction matrix
+
+Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper),
+OUT-arrow.
+
+| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow |
+|------|-----------|----------|---------|---------|-----------|
+| int32/int64 | P1 | **P0** | **P0** | **P0** | P1 |
+| double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 |
+| varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 |
+| bool | P2 | P1 | P2 | P1 | P2 |
+| decimal64/128 | P2 | n/a | **P1** (Python Decimal) | P1 | P2 |
+| date | P2 | P1 | P1 | P1 | P2 |
+| timestamp(tz) | P1 | P1 | **P1** (pytz/row) | P1 | P1 |
+| LIST/STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 |
+| hugeint/uuid | P2 | P2 | **P1** (round-trip) | P1 | P2 |
+| blob/map | P2 | P2 | P2 | P2 | P2 |
+| NULL-heavy | n/a | **P1** | P2 | **P0** (masked_array) | P1 |
+
+## Mechanics
+
+- **Walltime vs instruction-count.** Local A/B is walltime only (no Valgrind on macOS arm64). CI is
+  instruction-count via Callgrind run directly in the workflow on a GitHub-hosted runner (near-deterministic,
+  PYTHONHASHSEED pinned), diffed against a committed baseline. Report-only until trusted.
+- **Marker split + auto-move.** Every benchmark is `gate` or `informational` (see README). At baseline regen,
+  each numeric-produce gate's binding fraction `= 1 - floor_Ir / bench_Ir` is computed against its engine floor
+  (`test_engine_control_perf`); a gate below the ~25% cutoff is auto-moved to informational (a threshold on an
+  engine-diluted total is not meaningful). OUT-row fetch and UDFs are ~all binding; numeric produce is a bulk
+  memcpy of ~engine magnitude (auto-move candidate).
+- **Guards.** compare_baseline.py warns and stops enforcing when BENCH_SCALE, the pin file, or the DuckDB
+  submodule SHA differ from the baseline's (any of those makes the counts non-comparable).
+- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain RSS + object-count test for the
+  object-pinning paths, since a per-call refcount imbalance is invisible to a steady-state benchmark.
+- **Memory mode** (a second Callgrind sweep for O(rows) produce peak-RSS) is designed but deferred; the
+  `test_mem_df_with_nulls` tracemalloc guard is the local stand-in.
+
+## Cross-check vs iqmo-org/bareduckdb
+
+Their suite is a SQL-file-driven A/B comparing two clients (production `duckdb` vs the C-API prototype), arrow-in
+/ arrow-out only, no fetchall/df/numpy/native/UDF coverage. So our binding suite is far broader; their genuine
+deltas concentrate in PRODUCE/types. Actionable additions they suggest:
+
+- **hugeint / uuid in the produce matrix** (they select both): OUT-row does a per-value string round-trip, distinct
+  from narrow int. Now in `test_produce_numpy_perf` / `test_fetch_perf`.
+- **int128-internal decimal** (`DECIMAL(28,x)`) alongside the int64-internal one: hits a wider cast path. Added.
+- **heterogeneous mixed-type row**: exercises per-cell type dispatch in the Fetchone loop, unlike homogeneous
+  columns. Added as `test_fetchall_mixed_wide`.
+- **long varchar (>64 char)** alongside the short string: shifts string copy / transcode toward copy-bound. Added
+  as `varchar_long` in the matrix.
+- **result-cardinality (top-N) sweep**: holds engine work ~constant while sweeping rows-to-Python. Adopted as
+  `test_cardinality_perf` (plain LIMIT, no ORDER BY; the sort swamped the signal).
+- **peak-memory guard** on the O(rows) produce paths: a conversion regression is often memory-shaped. Partially
+  covered by the tracemalloc guard; full coverage waits on memory mode.
+
+Out of scope (theirs, not adopted): pure-engine filter/group/window workloads; 100M+ row scale (IO/engine
+dominated); the free-threading category (unsupported by this client). Do NOT adopt their no-warmup single-run
+methodology (charges import-cache population into the measurement).
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -0,0 +1,34 @@
+# Benchmark suite
+
+CodSpeed micro-benchmarks for the binding hot paths (produce, ingest, UDF).
+Design rationale: [PLAN.md](PLAN.md). CI: [../.github/workflows/codspeed.yml](../.github/workflows/codspeed.yml).
+
+## Markers
+
+Every benchmark carries exactly one (registered in `conftest.py`):
+
+- **gate**: binding-dominated, GIL-held, deterministic under Callgrind. A threshold breach is a binding regression.
+- **informational**: engine/library/streaming-diluted. Reported, never gated (would false-positive on engine bumps).
+
+## Local A/B (walltime)
+
+Only walltime runs locally (no Valgrind on macOS arm64; instruction-count gating is Linux/CI-only, and walltime is
+noisy on sub-ms benches). Pin the data libs identically across both builds so the delta is pure binding:
+
+```bash
+for P in ../main/.venv-release/bin/python .venv-release/bin/python; do
+  $P -m pytest benchmarks/<module>.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+done
+```
+
+## Conventions
+
+- READ aggregates real columns (`sum`/`length`), never `count(*)` (answered from metadata).
+- WRITE fully materializes the result or drains the lazy reader.
+- Warm once before measuring.
+- `con` fixture pins `threads=1` (see `conftest.py`).
+
+Two traps (a benchmark that skips these silently measures the wrong thing):
+
+- OUT-col null benches need REAL nulls (`CASE WHEN ... THEN NULL`), else the cheap `std::move` path is taken.
+- IN-numpy string benches need mixed ASCII + non-ASCII + a null sentinel, else the transcode/null ladder is skipped.
diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py
@@ -0,0 +1,35 @@
+"""Env-gated row-count scaling for the benchmark suite.
+
+Callgrind is 20-50x slower, so the O(rows) benches at full N make the CI sweep too slow. `scaled(n)` shrinks row counts
+ONLY when `BENCH_SCALE=<divisor>` is set (which the CI sweep sets); unset -> full N, so local walltime A/B is
+unchanged. A gate bench and the engine floor it is compared against share a base N, so routing BOTH through
+`scaled()` keeps them at an identical scaled N and the binding fraction stays valid. Scaling reduces row counts
+only; it must never change the data patterns the benches depend on (real nulls, mixed ASCII, LIMIT-no-ORDER-BY).
+A floor keeps a scaled bench row-dominated so per-element work still dominates; the small-N `*_gate` probes are
+already fast and are NOT scaled.
+"""
+
+from __future__ import annotations
+
+import os
+
+FLOOR = 20_000  # a scaled bench never drops below this (stays row-dominated, ~10x the range(2048) probes)
+
+
+def bench_scale() -> int:
+    """Return the `BENCH_SCALE` divisor clamped to >=1; 1 (no scaling) if unset, empty, or not an integer."""
+    v = os.environ.get("BENCH_SCALE")
+    if not v:  # unset or empty string
+        return 1
+    try:
+        return max(int(v), 1)  # zero/negative divisors are clamped to 1
+    except ValueError:
+        return 1
+
+
+def scaled(n: int) -> int:
+    """Return `n` at full scale, or `max(n // BENCH_SCALE, min(n, FLOOR))` when scaling is enabled."""
+    d = bench_scale()
+    if d <= 1:
+        return n
+    # min(n, FLOOR) caps the floor at n itself, so for n >= 0 a bench already at or below FLOOR keeps its own n
+    return max(n // d, min(n, FLOOR))