From 159757d8fa96e62a10a749804b34f682d69478c0 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 1 Jul 2026 07:22:21 +0200 Subject: [PATCH 1/7] Add CodSpeed perf-regression benchmark suite Nine modules over the duckdb-python binding hot paths: fetch (OUT-row), arrow, pandas, produce_numpy (df/fetchnumpy columnar), ingest_native (values/executemany), ingest_numpy (numpy scan + analyzer bind), udf (native + vectorized arrow), types_roundtrip (type x direction matrix), cardinality (LIMIT sweep). Full-consume discipline, warmup, real-null gotchas, tracemalloc memory guard. See benchmarks/PLAN.md. Standalone (not yet wired into CI). --- benchmarks/PLAN.md | 178 +++++++++++++++++++++++ benchmarks/test_arrow_perf.py | 119 ++++++++++++++++ benchmarks/test_cardinality_perf.py | 63 +++++++++ benchmarks/test_fetch_perf.py | 141 ++++++++++++++++++ benchmarks/test_ingest_native_perf.py | 93 ++++++++++++ benchmarks/test_ingest_numpy_perf.py | 109 ++++++++++++++ benchmarks/test_pandas_perf.py | 133 +++++++++++++++++ benchmarks/test_produce_numpy_perf.py | 181 ++++++++++++++++++++++++ benchmarks/test_types_roundtrip_perf.py | 78 ++++++++++ benchmarks/test_udf_perf.py | 110 ++++++++++++++ 10 files changed, 1205 insertions(+) create mode 100644 benchmarks/PLAN.md create mode 100644 benchmarks/test_arrow_perf.py create mode 100644 benchmarks/test_cardinality_perf.py create mode 100644 benchmarks/test_fetch_perf.py create mode 100644 benchmarks/test_ingest_native_perf.py create mode 100644 benchmarks/test_ingest_numpy_perf.py create mode 100644 benchmarks/test_pandas_perf.py create mode 100644 benchmarks/test_produce_numpy_perf.py create mode 100644 benchmarks/test_types_roundtrip_perf.py create mode 100644 benchmarks/test_udf_perf.py diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md new file mode 100644 index 00000000..c04f4801 --- /dev/null +++ b/benchmarks/PLAN.md @@ -0,0 +1,178 @@ +# CodSpeed Benchmark Suite Plan — duckdb-python binding hot paths + +Grounded in the 
binding source on `perf/codspeed` (`src/`). File:line citations are to this tree. + +## 0. Conventions (from the existing 3 modules, keep these) + +- Function-scoped `con` fixture; module-scoped input-data fixtures. +- READ = `SELECT sum(col) / sum(length(col))` (never `count(*)`, which is answered from metadata). +- WRITE = eager materialize or fully drain the lazy reader. +- Warm the engine once (`con.execute(query).fetchall()`) before `benchmark(...)` so first-call import-cache population is not charged to the measured region. +- Pin numpy/pandas/pyarrow/polars identically across A/B so deltas are pure binding cost. + +Ranking: **P0** = on a known regression path or the cutover-reworked code (narrow-numeric common case); **P1** = high-traffic conversion / per-element Python work; **P2** = correctness-relevant, lower traffic or engine-dominated. + +## (a) Prioritized scenarios + +### PRODUCE (duckdb -> external) — highest regression risk + +Row path: `DuckDBPyResult::Fetchone` (`src/pyresult.cpp:126-151`) builds a `PyUtil::TupleBuilder` (`src/include/duckdb_python/pyutil.hpp:101-125`) per row and calls `PythonObject::FromValue` (`src/native/python_objects.cpp:474`) per cell. O(rows x cols). This is the shape of the historical ~15% fetchall regression. 
+ +| # | Scenario | SQL / setup | Measures | Pri | +|---|----------|-------------|----------|-----| +| P0-1 | fetchall int64 1col | `SELECT i::BIGINT a FROM range(1_000_000)` | TupleBuilder + FromValue int (`python_objects.cpp:489`) | P0 | +| P0-2 | fetchall int 2-4col | `SELECT i::BIGINT,(i+1)::BIGINT,(i*2)::INTEGER FROM range(1_000_000)` | TupleBuilder scaling w/ col count | P0 | +| P0-3 | fetchall double | `SELECT (i*1.5)::DOUBLE FROM range(1_000_000)` | FromValue double | P0 | +| P0-4 | fetchall varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | FromValue VARCHAR string copy (`python_objects.cpp:515`) | P1 | +| P0-5 | fetchone loop (overhead) | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(100_000)` | per-call Fetchone + chunk-boundary FetchNext + GIL cycle | P0 | +| P0-6 | fetchmany batched | as P0-5, `fetchmany(10_000)` loop | Fetchmany loop | P1 | +| P1-7 | **df() numeric (reworked)** | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(1_000_000)` | FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular, `HAS_NULLS=false/PANDAS=true` branch (`array_wrapper.cpp:415-425`) | P0 | +| P1-8 | **df() numeric WITH NULLS** | `SELECT CASE WHEN i%10=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | `HAS_NULLS=true` + masked_array build (`array_wrapper.cpp:743-757`) + masked->pd.NA rewrite (`pyresult.cpp:362-393`) | P0 | +| P1-9 | fetchnumpy numeric | as P1-7 | FetchNumpyInternal without the DataFrame wrap | P1 | +| P1-10 | df() varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | StringConvert PyUnicode_FromStringAndSize per row (`array_wrapper.cpp:164-181`) | P1 | +| P1-11 | df() timestamp | `SELECT TIMESTAMP '2020-01-01'+(i*INTERVAL 1 SECOND) FROM range(1_000_000)` | TimestampConvertNano + ConvertDateTimeTypes (`pyresult.cpp:299`) | P1 | +| P1-13 | to_record_batch_reader drained | `range(1_000_000)`, `to_record_batch_reader(100_000)` | lazy stream (`pyresult.cpp:573`), iterate + sum num_rows | P1 | +| P2-15 | torch()/tf() numeric | `range(500_000)` | 
FetchNumpyInternal + per-col from_numpy (`pyresult.cpp:405-421`) | P2 | +| P2-16 | fetch_df_chunk | large query, loop `fetch_df_chunk()` | FetchDFChunk per chunk (`pyresult.cpp:400`) | P2 | +| P1-17 | fetchall LIST | `SELECT [i,i+1,i+2] FROM range(200_000)` | FromValue LIST recursion (`python_objects.cpp:651`) | P1 | +| P1-18 | fetchall STRUCT | `SELECT {'a':i,'b':i+1} FROM range(200_000)` | FromStruct dict build (`python_objects.cpp:390-414`) | P1 | +| P1-20 | fetchall DECIMAL | `SELECT (i::DECIMAL(18,3))/1000 FROM range(200_000)` | Python `Decimal()(val.ToString())` per row (`python_objects.cpp:507`) | P1 | +| P1-21 | fetchall TIMESTAMPTZ | `SELECT (TIMESTAMPTZ '2020-01-01'+(i*INTERVAL 1 SECOND)) FROM range(100_000)` | pytz localize+astimezone per row (`python_objects.cpp:567-573`) | P1 | +| P2-22 | fetchall NULL-heavy | `SELECT CASE WHEN i%2=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | validity branch + nb::none (`pyresult.cpp:142`) | P2 | +| P2-23 | fetchall BLOB | `SELECT ('blob_'||i)::BLOB FROM range(200_000)` | nb::bytes (`python_objects.cpp:517`) | P2 | + +### INGEST (external -> duckdb) + +| # | Scenario | Setup | Path | Pri | +|---|----------|-------|------|-----| +| I0-1 | **pandas numpy int64/double** | DataFrame 1M | NumpyScan::Scan ScanNumpyMasked zero-copy when stride==sizeof(T); double NaN->NULL loop (`numpy_scan.cpp:76-112,236-246`) reworked | P0 | +| I0-2 | **pandas numpy object-string** | `pd.array(strings,dtype=object)` 500k | NumpyScan STRING/OBJECT: per-row isinstance, PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode transcode (`numpy_scan.cpp:353-452`) reworked | P0 | +| I1-3 | pandas object bind-time analyzer | object col 100k+ | Pandas::Bind -> PandasAnalyzer::Analyze samples rows GetItemType ladder (`analyzer.cpp:356-460`). 
Per-BIND overhead, independent of rows (count(*) ok here) | P1 | +| I1-4 | pandas arrow-backed | pd.ArrowDtype 1M | ToArrowTable -> arrow scan (`pyconnection.cpp:1799`) | P1 | +| I0-5 | arrow Table | 1M | CreateArrowScan PythonTableArrowArrayStreamFactory near-zero-copy (`python_replacement_scan.cpp:55-83`) | P1 | +| I1-6 | arrow RecordBatchReader | from_batches | same factory, streaming (distinct from Table) | P1 | +| I1-7 | polars DataFrame | 1M | entry.to_arrow() one-time + arrow scan (`replacement_scan.cpp:150-156`) | P2 | +| I1-8 | numpy ndarray + dict-of-arrays | np.arange | replacement scan -> pandas_scan (`replacement_scan.cpp:163-200`) | P2 | +| I1-9 | **native values() list-of-tuples** | `con.values([(i,i*1.5,'s') for i in range(100_000)])` | Values -> TransformPythonValue per cell, GetPythonObjectType ladder (`python_conversion.cpp:402-454,1075`) | P1 | +| I1-10 | native list-of-dicts | list of dicts | TransformDictionaryToStruct recursion (`python_conversion.cpp:119`) | P2 | +| I1-11 | executemany params | INSERT ?,? 
100k sets | ExecuteMany loop, TransformPythonValue per set (`pyconnection.cpp:500-544`) | P2 | +| I2-12 | read_parquet/csv/json | a file | arg marshal -> TableFunction under GIL-release; engine-dominated | P2 | + +### UDF (`src/python_udf.cpp`) — zero coverage today + +| # | Scenario | Setup | Path | Pri | +|---|----------|-------|------|-----| +| U0-1 | **scalar native 1 int arg** | `def f(x):return x+1`, `SELECT sum(f(i::BIGINT)) FROM range(1_000_000)` | per-row TupleBuilder args + PyObject_CallObject + TransformPythonObject result (`python_udf.cpp:320-384`) | P0 | +| U0-2 | scalar native 2-3 args | `def f(a,b):return a+b` 2 cols 1M | arg-tuple scaling | P1 | +| U1-3 | scalar native string | `def f(s):return s.upper()` 500k | VARCHAR in + string out | P1 | +| U1-4 | scalar native NULL inputs | 50% NULL, DEFAULT handling | SetNull short-circuit (`python_udf.cpp:340-350`) | P1 | +| U1-6 | **vectorized arrow UDF** | `type='arrow'` pc.add 1M | ConvertDataChunkToPyArrowTable + call + ConvertArrowTableToVector cast (`python_udf.cpp:33-144,225`) | P0 | +| U2-7 | vectorized NULL slicing | DEFAULT + nulls | selvec compaction/reconstruction (`python_udf.cpp:197-305`) | P2 | + +## (b) Type x direction matrix + +Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), OUT-arrow. 
+ +| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow | +|------|-----------|----------|---------|---------|-----------| +| int32/int64 | P1 | **P0** | **P0** | **P0** | P1 | +| double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 | +| varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 | +| bool | P2 | P1 | P2 | P1 | P2 | +| decimal | P2 | n/a | **P1** (Python Decimal) | P1 | P2 | +| date | P2 | P1 | P1 | P1 | P2 | +| timestamp | P1 | **P1** | P1 | P1 | P1 | +| timestamptz | P2 | P1 | **P1** (pytz/row) | P1 | P2 | +| time/interval | P2 | P1 | P1 | P1 | P2 | +| LIST/ARRAY | P2 | P2 | P1 (recursive) | P1 | P2 | +| STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 | +| MAP | P2 | P2 | P2 | P2 | P2 | +| blob | P2 | P2 | P2 | P2 | P2 | +| NULL-heavy | - | **P1** | P2 | **P0** (masked_array) | P1 | +| enum/category | - | P1 | P1 | P1 | P2 | + +Minimum viable to ship: int64, double, varchar, timestamp, decimal, LIST, STRUCT, NULL-heavy in OUT-row and OUT-col; int64/double/varchar in IN-numpy. + +## (c) Gaps vs the existing 3 modules + +Covered well: OUT-row narrow numeric, OUT-arrow/polars numeric+string, pandas IN/OUT numpy-vs-arrow numeric+string, fetchone-loop numeric. + +Missing: +1. **PRODUCE columnar reworked path under-covered** — df() only 500k, only numeric/string, never with NULLS (the masked-array branch is exactly what changed). Add df-with-nulls, fetchnumpy, df-timestamp. +2. **UDFs: zero coverage** — whole subsystem (python_udf.cpp), native per-row is the single biggest untested per-call-overhead path. Add U0-1/U0-2/U1-3/4/U1-6. +3. **Native Python ingest: zero coverage** — values()/list-of-tuples/list-of-dicts/executemany via TransformPythonValue. Add I1-9/10/11. +4. **Expensive scalar OUT-row types untested** — decimal, timestamptz, interval, isolated LIST/STRUCT/MAP. Add P1-17..21. +5. **Object-column bind-time analyzer untested** — PandasAnalyzer sampling, per-bind cost. Add I1-3. +6. **Size regimes thin** — add 1M throughput AND 1-row overhead variants. 
+7. **Arrow ingest only pa.table** — add RecordBatchReader, polars, numpy-ndarray ingest. +8. **NULL-heavy IN-numpy untested** (ScanNumpyMasked + ApplyMask). + +## (d) Suite organization + CodSpeed mechanics + +``` +benchmarks/ + test_fetch_perf.py # EXISTING — OUT-row. Add: nested, decimal, timestamptz, null-heavy, 1M+1-row + test_arrow_perf.py # EXISTING — add RecordBatchReader ingest, materialized vs stream + test_pandas_perf.py # EXISTING — add df()-with-nulls, datetime, fetchnumpy, analyzer bind + test_produce_numpy_perf.py # NEW — df()/fetchnumpy/fetch_df_chunk reworked columnar, per-type, null vs no-null + test_ingest_native_perf.py # NEW — values()/list-of-tuples/list-of-dicts/executemany + test_ingest_numpy_perf.py # NEW — numpy ndarray / object-string scan / analyzer bind + test_udf_perf.py # NEW — scalar native + vectorized arrow UDFs + test_types_roundtrip_perf.py # NEW — type x direction matrix sweep, parametrized +``` +One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable); the result-cardinality (LIMIT) sweep adopted in (e) gets its own module, test_cardinality_perf.py. + +### Walltime vs instruction-count + +- **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`. +- **CI gate: instruction-count / simulation (Linux + Callgrind)**, deterministic — gate PRs with this. + +Instruction-count is ideal AND should gate the GIL-held single-threaded overhead paths: fetchone loop, fetchall/fetchmany, native UDF per-call, native values() ingest, analyzer bind, all per-element converters (FromValue, TransformPythonValue, NumpyScan object/string, ArrayWrapper fill). The historical fetchall regression would be caught cleanly here. + +Noisy under instruction-count — keep walltime-only, informational, do NOT hard-gate: +- to_arrow_table / pl() on materialized results: PromoteMaterializedToArrow re-runs the query in parallel with the GIL released (`pyresult.cpp:450-477`).
+- Large 1M+ SELECT sum() ingest reads: engine parallel aggregate dominates. +- read_csv/parquet/json: engine + I/O dominated. +- GIL-per-chunk streaming (FetchNextRaw, to_record_batch_reader drain). + +Gate tactic: pair each large-throughput scenario with a small/1-row variant (e.g. fetchall range(1_000_000) walltime + fetchall range(2048) instruction-count gate) so binding fixed-cost is measured noise-free. + +### Two code-grounded gotchas +- **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing. +- **IN-numpy string benchmarks need mixed ASCII + non-ASCII + a NaN/pd.NA/None sentinel**: the scan zero-copies compact-ASCII (`numpy_scan.cpp:416-418`) but transcodes otherwise (`numpy_scan.cpp:429-446`); ASCII-only misses the transcode + null-detection ladder. + +## (e) Cross-check vs iqmo-org/bareduckdb + +Source read live from `iqmo-org/bareduckdb` `main`, subdir `benchmark/` (GitHub API + raw files). + +### What their suite covers / how it is organized + +A **SQL-file-driven A/B harness comparing two clients** — production `duckdb` vs `bareduckdb` (the C-API / free-threading prototype) — not a binding micro-bench. + +- `benchmark.py` orchestrates: discovers `cases/**/*.sql`, picks the matching `data/DATA*` dir, and runs each `(sql x parquet-file x db_mode)` as a fresh `uv run run_case.py` **subprocess**. `DBMODES=[duckdb, bareduckdb_capsule, bareduckdb_arrow]`; active `READ_MODES=[arrow_table]` (parquet/arrow_reader present but off). +- `run_case.py` per case: fresh `connect()`, `pyarrow.parquet.read_table(file)` + `conn.register(name, table)`, then `conn.sql(query).to_arrow_table()`, timed with `time.perf_counter()` and peak RSS via `resource.getrusage`. 
**No warmup, single run, result discarded.** Universal ingest = register(arrow table); universal produce = `to_arrow_table()`. +- `data/`: `DATA_RANGE` = single BIGINT `range(N)` at 5M / 100M; `DATA_CATEGORY_DATE_PRICE` = (VARCHAR category, DATE, BIGINT price) cross-join at 36M / 3.6B. +- `cases/`: `types/` (decimal `DECIMAL(28,12)`, hugeint `HUGEINT`, mixed_types `HUGEINT+uuid()+DECIMAL(28,6)+VARCHAR` in one row, timestamp `TIMESTAMP+INTERVAL`, varchar_long ~100-char), `limit/` (LIMIT 100 / 1k / 10k / 100k top-N — a result-cardinality sweep), `filter/`, `groups/`, `window/`, `threading/` (parallel group/window/self-join/registered-arrow-scan), plus a separate `stats/` harness. + +Their INGEST is arrow-only and their PRODUCE is arrow-only; they have **no** fetchall/fetchone, df()/numpy, pandas/numpy/native/polars ingest, or UDF coverage — so our binding suite is far broader on binding-specific surfaces. Their genuine deltas are concentrated in the PRODUCE/types dimension and in engine/threading workloads. + +### DELTA — actionable additions/changes + +- **[BINDING] Add HUGEINT to the produce matrix (currently absent).** `types/hugeint.sql`, `mixed_types.sql`. OUT-row `FromValue` HUGEINT does `PyLong_FromString(val.GetValue())` — a per-value string round-trip (`python_objects.cpp:500`), unlike narrow int; OUT-col casts hugeint->double (`array_wrapper.cpp:662`); OUT-arrow is a distinct decimal128/int128 export. Scenario: `SELECT i::HUGEINT FROM range(1_000_000)` through fetchall / df / to_arrow_table. Add a `hugeint` row to the type x direction matrix. +- **[BINDING] Add UUID to the produce matrix (absent).** `mixed_types.sql` selects `uuid()`. OUT-row builds a Python `uuid.UUID` per row (`python_objects.cpp:708-711`); OUT-col uses `UUIDConvert` (`array_wrapper.cpp:230-244`). Scenario: `SELECT gen_random_uuid() FROM range(200_000)` through fetchall / df / to_arrow_table. Add a `uuid` row to the matrix. 
+- **[BINDING] Add a 128-bit-internal DECIMAL variant.** Our P1-20 uses `DECIMAL(18,3)` (int64 internal); bareduckdb uses `DECIMAL(28,12)` / `(28,6)` (int128 internal), hitting `ConvertDecimalInternal` (`array_wrapper.cpp:571`) and the wider `PyDecimalCastSwitch`/`Decimal()` round-trip. Run both an int64-internal and an int128-internal decimal. +- **[BINDING] Add a heterogeneous mixed-type row (new scenario).** `SELECT i::HUGEINT, gen_random_uuid(), (i*1.5)::DECIMAL(28,6), ('string_'||i) FROM range(200_000)` through fetchall and df. Exercises per-cell type dispatch in the `Fetchone` column loop (`pyresult.cpp:140-148`) — a different branch/cache profile than our homogeneous columns (P0-1..3 are single-type). +- **[BINDING] Add a long-varchar (>64 char) variant** alongside the short `'str_value_'||i`. `'...'||repeat('data ',10)||i::VARCHAR` (~100 chars). Short strings are copy-cheap/overhead-bound; long strings shift OUT-row/OUT-col string copy and the IN-numpy `DecodePythonUnicode` transcode (`numpy_scan.cpp:429-446`) toward copy-bound. Apply to OUT-row, OUT-col, IN-numpy varchar scenarios. +- **[BINDING] Adopt their result-cardinality (top-N) sweep as a produce axis.** `SELECT * FROM src ORDER BY k DESC LIMIT n` (`src` = one fixed source relation) for n in {100, 1k, 10k, 100k}, fetched via fetchall / df / to_arrow_table with the source held constant. Holds engine work ~constant while sweeping rows-materialized-to-Python → a clean per-row conversion slope, and the small-n end is an ideal noise-free instruction-count gate (overhead regime). Cleaner than varying `range()` (which also changes scan cost). +- **[BINDING] Broaden the OUT-arrow column of the matrix.** Their entire produce path is `to_arrow_table`, and they push hugeint / decimal128 / uuid / timestamp / long-varchar / mixed-row through it — exactly the arrow-export converters (ArrowConverter/appender for int128/uuid/decimal128) our OUT-arrow column currently leaves at P1/P2 numeric+string. Add these types to OUT-arrow.
+- **[BINDING, hard to gate] registered-arrow-scan under parallelism.** `threading/registered_arrow_scan.sql` pulls batches from `PythonTableArrowArrayStreamFactory::Produce` (binding code in `arrow/arrow_array_stream.cpp`) across engine threads holding/releasing the GIL — a real binding-contention risk. Keep as walltime-informational only; too noisy for an instruction-count gate. +- **[ENGINE] `filter` / `groups` / `window` / `self_join` pure-engine workloads** — out of scope for a binding gate; the binding only wraps them with register + to_arrow_table, and their consume (a small aggregate) is trivial so the measurement is ~pure engine. Note, do not add to the binding suite. +- **[ENGINE] 100M / 3.6B-row scale** — too slow / IO+engine-dominated / walltime-noisy for a codspeed gate; keep our regimes <= ~1M. +- **[ENGINE] threading / free-threading category** — the production client does not support free-threading (CLAUDE.md); deprioritize for this suite. + +### Methodology notes for our codspeed mechanics + +- **Adopt: result-cardinality (LIMIT) axis** (above) — a clean per-row conversion-cost slope and a natural small/large pairing for the instruction-count-gate-vs-walltime split already in (d). +- **Consider adopting: a peak-memory guard** for the O(rows) produce paths. bareduckdb tracks `getrusage` max RSS; codspeed walltime tracks neither memory nor allocations. A conversion regression is often memory-shaped (cf. the recorded fetchall +8% list->tuple edge-copy; the df() masked_array branch) — add a separate `getrusage`/memray delta assertion on `fetchall` and `df()`-with-nulls as a secondary signal, since a pure-timing gate can miss it. +- **Do NOT adopt their anti-patterns:** no-warmup + single subprocess run charges one-time import-cache population into the measurement and yields no statistics — bad for steady-state binding isolation. Our warmup + codspeed repeated rounds are correct; keep them. 
+- **Consistent with us:** their full-consume is eager `to_arrow_table()` and never `count(*)` — matches our discipline. Caveat: for their aggregate cases the arrow output is tiny, so the consume is trivial and the run is engine-only; our produce benchmarks must keep the materialization the heavy part (large output / top-N with large LIMIT). diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py new file mode 100644 index 00000000..e6fc43e0 --- /dev/null +++ b/benchmarks/test_arrow_perf.py @@ -0,0 +1,119 @@ +"""Standalone CodSpeed benchmark module for the Arrow read/write binding paths — NOT integrated +(not in pyproject, not yet wired into CI). Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-cutover + $M -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts= + $C -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts= + +DESIGN — the data must be FULLY MOVED, not lazily wrapped, or the benchmark measures nothing: + * READ (arrow -> duckdb): the duckdb ENGINE must scan every value. We aggregate over the actual + columns (sum/length), NOT count(*) -- count(*) is answered from arrow metadata without touching data. + * WRITE (duckdb -> arrow): the CONSUMER must materialize everything. + - to_arrow_table() / pl() are EAGER (the full table / polars DataFrame is built). + - to_arrow_reader() is LAZY -- duckdb only produces a batch when it is pulled -- so we iterate the + whole stream to actually exercise and consume the write path. + +pyarrow/polars are pinned to the SAME version in both .venv-release, so the A/B delta is purely the binding.
+""" + +import duckdb +import pyarrow as pa +import pytest + +N = 500_000 +WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)" +WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)" + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +@pytest.fixture(scope="module") +def arrow_numeric(): + return pa.table( + { + "a": pa.array(range(N), type=pa.int64()), + "b": pa.array([i * 1.5 for i in range(N)], type=pa.float64()), + } + ) + + +@pytest.fixture(scope="module") +def arrow_string(): + return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())}) + + +@pytest.fixture(scope="module") +def arrow_numeric_batches(arrow_numeric): + # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round + return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000) + + +# --------------------------------------------------------------------------- # +# READ: arrow -> duckdb. The engine must scan every value (sum/length force it). +# --------------------------------------------------------------------------- # + + +def test_read_arrow_numeric(benchmark, con, arrow_numeric): + con.register("t_num", arrow_numeric) + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()) + + +def test_read_arrow_string(benchmark, con, arrow_string): + con.register("t_str", arrow_string) + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()) + + +# ADDED: RecordBatchReader ingest -- the SAME PythonTableArrowArrayStreamFactory but STREAMING (distinct from +# the materialized Table read above). A fresh reader is built per round (the engine drains it); sum() forces a +# full scan of every value. 
+ + +def test_read_arrow_reader_numeric(benchmark, con, arrow_numeric_batches): + schema, batches = arrow_numeric_batches + + def run(): + reader = pa.RecordBatchReader.from_batches(schema, iter(batches)) + con.register("t_rdr", reader) + return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall() + + run() # warm + benchmark(run) + + +# --------------------------------------------------------------------------- # +# WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream. +# --------------------------------------------------------------------------- # + + +def test_write_arrow_table_numeric(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table()) + + +def test_write_arrow_table_string(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table()) + + +def test_write_arrow_reader_consumed(benchmark, con): + def run(): + reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000) + rows = 0 + for batch in reader: # drain the lazy stream so duckdb actually produces every batch + rows += batch.num_rows + return rows + + benchmark(run) + + +def test_write_polars_numeric(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_NUM).pl()) + + +def test_write_polars_string(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_STR).pl()) diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py new file mode 100644 index 00000000..d4edda03 --- /dev/null +++ b/benchmarks/test_cardinality_perf.py @@ -0,0 +1,63 @@ +"""Standalone CodSpeed benchmark module: the RESULT-CARDINALITY (top-N) sweep — NOT integrated (not in +pyproject, not in CI, not committed). 
Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-codspeed + $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + +WHY THIS MODULE (adopted from iqmo-org/bareduckdb): hold the SOURCE fixed and sweep only the number of rows +materialized to Python via ORDER BY ... LIMIT n for n in {100, 1k, 10k, 100k}, through fetchall / df / +to_arrow_table. The engine cost (scan the fixed SRC + top-N heap) stays ~constant, so the walltime delta +across n is dominated by the per-row binding conversion -> a clean per-row slope. The n=100 end is the +noise-free overhead regime (the natural instruction-count-gate point); the n=100k end is throughput. + +A clean monotone slope (and ~parity slope between the two builds) is the signal we report; a build whose slope +is steeper has a per-row conversion regression. Source held constant rules out scan-cost as the confound (a +cleaner axis than varying range(), which also changes scan cost). + +numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. +""" + +import duckdb +import pytest + +SRC = 200_000 # fixed source size -> constant engine scan + top-N across all n +LIMITS = [100, 1_000, 10_000, 100_000] + +# 3 columns (BIGINT, DOUBLE, VARCHAR) so the per-row conversion is non-trivial; source is a fixed inline +# subquery (no table state) and ORDER BY forces a full scan + top-N of the same SRC rows every time. 
+_SRC_SUBQ = f"(SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC}) t(i))" + + +def _query(n): + return f"SELECT a, b, s FROM {_SRC_SUBQ} ORDER BY a DESC LIMIT {n}" + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +@pytest.mark.parametrize("n", LIMITS) +def test_limit_fetchall(benchmark, con, n): + q = _query(n) + con.execute(q).fetchall() # warm + benchmark(lambda: con.execute(q).fetchall()) + + +@pytest.mark.parametrize("n", LIMITS) +def test_limit_df(benchmark, con, n): + q = _query(n) + con.sql(q).df() # warm + benchmark(lambda: con.sql(q).df()) + + +@pytest.mark.parametrize("n", LIMITS) +def test_limit_to_arrow(benchmark, con, n): + q = _query(n) + con.sql(q).to_arrow_table() # warm + benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py new file mode 100644 index 00000000..8c8ef20a --- /dev/null +++ b/benchmarks/test_fetch_perf.py @@ -0,0 +1,141 @@ +"""Standalone CodSpeed benchmark module — NOT integrated (not in pyproject, not in CI, not committed). + +Purpose: A/B the binding-layer perf between the two builds (pybind11 `main` vs nanobind cutover), in particular +the narrow-column `fetchall` regression. Run the SAME file under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-cutover + $M -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts= + $C -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts= + +NOTE: macOS arm64 has no Valgrind, so only `--codspeed-mode=walltime` works locally (wall-clock stats). The +deterministic instruction-count mode (`--codspeed-mode=simulation`) needs Linux + the CodSpeed instrument +(CI, or `codspeed run` in a Linux container). 
In CI/cloud, CodSpeed compares each run against a git baseline; +locally we get the same benchmark workflow but A/B by running the file under the two interpreters by hand. +""" + +import duckdb +import pytest + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +def _bench_fetchall(benchmark, con, query): + con.execute(query).fetchall() # warm the engine before measuring + benchmark(lambda: con.execute(query).fetchall()) + + +def test_fetchall_int(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)") + + +def test_fetchall_smallint(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)") + + +def test_fetchall_double(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM range(200000) t(i)") + + +def test_fetchall_2int(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)") + + +def test_fetchall_str(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)") + + +def test_fetchall_mixed(benchmark, con): + query = ( + "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, " + "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)" + ) + _bench_fetchall(benchmark, con, query) + + +def test_fetchone_iter(benchmark, con): + query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" + + def run(): + rel = con.execute(query) + while rel.fetchone() is not None: + pass + + benchmark(run) + + +# --------------------------------------------------------------------------- # +# ADDED: small-N instruction-count-gate variants (the narrow-numeric fixed-cost path, noise-free at range(2048) +# under simulation mode in CI), expensive scalar OUT-row types (timestamptz pytz-per-row, blob, null-heavy), a +# heterogeneous per-cell-dispatch row (hugeint+uuid+decimal128+varchar, 
distinct from homogeneous columns), and +# the batched fetchmany loop. +# --------------------------------------------------------------------------- # + + +def test_fetchall_int_gate(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)") + + +def test_fetchall_2int_gate(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)") + + +def test_fetchall_null_heavy(benchmark, con): + _bench_fetchall( + benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)" + ) + + +def test_fetchall_timestamptz(benchmark, con): + _bench_fetchall( + benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)" + ) + + +def test_fetchall_decimal128(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)") + + +def test_fetchall_blob(benchmark, con): + _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)") + + +def test_fetchall_mixed_wide(benchmark, con): + # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile + # from the homogeneous single-type columns above) + query = ( + "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, " + "((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range(100000) t(i)" + ) + _bench_fetchall(benchmark, con, query) + + +def test_fetchmany_batched(benchmark, con): + query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" + + def run(): + rel = con.execute(query) + while True: + rows = rel.fetchmany(10_000) + if not rows: + break + + benchmark(run) + + +def test_expr_many(benchmark): + def run(): + out = [] + for i in range(2000): + col = duckdb.ColumnExpression(f"col_{i}") + const = duckdb.ConstantExpression(i) + out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}")) 
+ return len(out) + + benchmark(run) diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py new file mode 100644 index 00000000..4fca641a --- /dev/null +++ b/benchmarks/test_ingest_native_perf.py @@ -0,0 +1,93 @@ +"""Standalone CodSpeed benchmark module for NATIVE Python-object ingest (Python list/tuple/dict -> duckdb) — +NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-codspeed + $M -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + $C -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + +WHY THIS MODULE: native Python-object ingest had ZERO coverage. Every cell goes through TransformPythonValue +and the GetPythonObjectType ladder (python_conversion.cpp); dicts recurse through TransformDictionaryToStruct; +executemany re-binds a parameter set per row (pyconnection.cpp ExecuteMany loop). + +FULL MATERIALIZE: executemany lands N rows in a real table (CREATE OR REPLACE each round so the table does not +grow across codspeed's repeated invocations). values() builds the value vectors EAGERLY inside the call +(TransformPythonParamList), and we drain the resulting relation with fetchall so the round-trip is complete. + +NOTE on values() shape: a single list argument to values() becomes ONE row whose COLUMNS are the list items +(see DuckDBPyConnection::Values, pyconnection.cpp) -- so a list of N scalars is 1 row x N columns and runs +TransformPythonValue N times; a list of N tuples is 1 row x N nested(LIST) columns; a list of N dicts is +1 row x N STRUCT columns (TransformDictionaryToStruct). All three exercise the per-cell transform N times. 
+"""
+
+import duckdb
+import pytest
+
+EXECMANY_N = 20_000  # executemany re-binds + executes per row, keep moderate
+WIDE_N = 10_000  # values() builds a 1-row x N-col relation; cap N so the binder stays sane
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def rows_3col():
+    return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]
+
+
+@pytest.fixture(scope="module")
+def scalars_wide():
+    return list(range(WIDE_N))
+
+
+@pytest.fixture(scope="module")
+def tuples_wide():
+    return [(i, i + 1, i + 2) for i in range(WIDE_N)]
+
+
+@pytest.fixture(scope="module")
+def dicts_wide():
+    return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]
+
+
+# --------------------------------------------------------------------------- #
+# executemany: bind + execute one parameter set per row, into a real table.
+# --------------------------------------------------------------------------- #
+
+
+def test_ingest_executemany_3col(benchmark, con, rows_3col):
+    con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
+    con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm
+
+    def run():
+        con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
+        con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)
+
+    benchmark(run)
+
+
+# --------------------------------------------------------------------------- #
+# values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip.
+# --------------------------------------------------------------------------- # + + +def test_ingest_values_scalars(benchmark, con, scalars_wide): + con.values(scalars_wide).fetchall() # warm + benchmark(lambda: con.values(scalars_wide).fetchall()) + + +def test_ingest_values_tuples(benchmark, con, tuples_wide): + # each tuple cell -> LIST value (TransformPythonValue recursion) + con.values(tuples_wide).fetchall() # warm + benchmark(lambda: con.values(tuples_wide).fetchall()) + + +def test_ingest_values_dicts(benchmark, con, dicts_wide): + # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion) + con.values(dicts_wide).fetchall() # warm + benchmark(lambda: con.values(dicts_wide).fetchall()) diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py new file mode 100644 index 00000000..bb5fc1e8 --- /dev/null +++ b/benchmarks/test_ingest_numpy_perf.py @@ -0,0 +1,109 @@ +"""Standalone CodSpeed benchmark module for the NUMPY ingest paths (numpy / numpy-backed pandas -> duckdb) +— NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-codspeed + $M -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + $C -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + +WHY THIS MODULE: the numpy scan (NumpyScan / NumpyArray facade / RawArrayWrapper / pandas-bind / analyzer) is +the IN-numpy half the nanobind cutover reworked, and several of its branches were untested: + * I0-2 object-string scan: the per-row isinstance + PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode + transcode ladder (numpy_scan.cpp). 
GOTCHA (encoded): a meaningful benchmark MUST mix ASCII + non-ASCII + + a null sentinel -- ASCII-only misses the transcode + null-detection ladder entirely. + * I0-1 double NaN->NULL loop (numpy_scan.cpp) -- the reworked float path. + * NULL-heavy masked scan: ScanNumpyMasked + ApplyMask (pandas nullable Int64). + * I1-3 analyzer bind: PandasAnalyzer::Analyze samples rows through the GetItemType ladder. This is a per-BIND + cost, independent of row count, so it is the ONE place count(*) is the correct consume (the cost is at bind, + not scan); every other READ here aggregates over real columns (sum/length) to force a full engine scan. + * I1-8 numpy ndarray / dict-of-arrays via the replacement scan (resolved from a module global). + +numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. +""" + +import duckdb +import numpy as np +import pandas as pd +import pytest + +N = 500_000 +ANALYZER_N = 200_000 + +# Module-global for the replacement-scan-from-variable path (frame resolution finds f_globals reliably). +NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5} + +# Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only). +_MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None] +_MIXED_STRINGS = [None if _MIXED[i % 5] is None else f"{_MIXED[i % 5]}{i}" for i in range(N)] + +# Mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind. 
+_MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)] + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +@pytest.fixture(scope="module") +def df_double_with_nan(): + a = np.arange(N, dtype="float64") * 1.5 + a[::10] = np.nan # real NaNs -> NaN->NULL conversion loop + return pd.DataFrame({"a": a}) + + +@pytest.fixture(scope="module") +def df_object_string_mixed(): + return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)}) + + +@pytest.fixture(scope="module") +def df_masked_int(): + # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask + arr = pd.array(np.arange(N), dtype="Int64") + arr[::10] = pd.NA + return pd.DataFrame({"a": arr}) + + +@pytest.fixture(scope="module") +def df_object_mixed_types(): + return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)}) + + +# --------------------------------------------------------------------------- # +# READ: numpy -> duckdb. Engine scans every value (sum/length force it). +# --------------------------------------------------------------------------- # + + +def test_read_numpy_dict_numeric(benchmark, con): + benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall()) + + +def test_read_numpy_double_with_nan(benchmark, con, df_double_with_nan): + con.register("t", df_double_with_nan) + benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) + + +def test_read_numpy_masked_int(benchmark, con, df_masked_int): + con.register("t", df_masked_int) + benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) + + +def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed): + con.register("t", df_object_string_mixed) + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) + + +# --------------------------------------------------------------------------- # +# BIND: PandasAnalyzer sampling cost. 
count(*) is correct HERE ONLY -- the cost is at bind, not scan, so we +# must NOT force a scan (that would drown the per-bind analyzer signal). Re-binds the object column each call. +# --------------------------------------------------------------------------- # + + +def test_bind_analyzer_object(benchmark, con, df_object_mixed_types): + con.register("t", df_object_mixed_types) + con.execute("SELECT count(*) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall()) diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py new file mode 100644 index 00000000..34a0948d --- /dev/null +++ b/benchmarks/test_pandas_perf.py @@ -0,0 +1,133 @@ +"""Standalone CodSpeed benchmark module for the pandas read/write binding paths, comparing NUMPY-backed vs +ARROW-backed DataFrames — NOT integrated (not in pyproject, not in CI, not committed). Run under each build: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-cutover + $M -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts= + $C -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts= + +WHY BOTH BACKINGS: when duckdb scans a pandas DataFrame, the binding path depends on each column's backing: + * numpy-backed columns (dtype int64 / float64 / object) -> the NUMPY scan path (NumpyArray facade, + RawArrayWrapper, pandas/bind.cpp, analyzer.cpp) -- this is the path the nanobind cutover reworked + NON-TRIVIALLY, so it gets first-class coverage here. + * arrow-backed columns (pandas ArrowDtype, e.g. int64[pyarrow]) -> the ARROW scan path (near zero-copy). 
+On the WRITE side, duckdb's native pandas output (rel.df()) is NUMPY-backed; an arrow-backed pandas frame is
+produced via duckdb-arrow + pyarrow.to_pandas(ArrowDtype) (pyarrow.to_pandas is identical on both builds, so
+the A/B delta is still the duckdb binding).
+
+FULL CONSUME (same discipline as the arrow module): READ aggregates over the actual columns (sum/length, NOT
+count(*) which is answered from metadata), and WRITE materializes the entire DataFrame.
+
+numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding.
+"""
+
+import duckdb
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+
+N = 500_000
+WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
+WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"
+_STRINGS = [f"str_value_{i}" for i in range(N)]
+
+
+@pytest.fixture
+def con():
+    c = duckdb.connect()
+    yield c
+    c.close()
+
+
+@pytest.fixture(scope="module")
+def df_numpy_numeric():
+    return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5})
+
+
+@pytest.fixture(scope="module")
+def df_numpy_string():
+    # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path)
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)})
+
+
+@pytest.fixture(scope="module")
+def df_arrow_numeric():
+    return pd.DataFrame(
+        {
+            "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())),
+            "b": pd.array(np.arange(N) * 1.5, dtype=pd.ArrowDtype(pa.float64())),
+        }
+    )
+
+
+@pytest.fixture(scope="module")
+def df_arrow_string():
+    return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))})
+
+
+# --------------------------------------------------------------------------- #
+# READ: pandas -> duckdb. Engine scans every value (sum/length force it).
+# --------------------------------------------------------------------------- # + + +def test_read_pandas_numpy_numeric(benchmark, con, df_numpy_numeric): + con.register("t", df_numpy_numeric) + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) + + +def test_read_pandas_numpy_string(benchmark, con, df_numpy_string): + con.register("t", df_numpy_string) + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) + + +def test_read_pandas_arrow_numeric(benchmark, con, df_arrow_numeric): + con.register("t", df_arrow_numeric) + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) + + +def test_read_pandas_arrow_string(benchmark, con, df_arrow_string): + con.register("t", df_arrow_string) + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) + + +# --------------------------------------------------------------------------- # +# WRITE: duckdb -> pandas. df() is NUMPY-backed (the reworked production path); +# the arrow-backed frame goes via duckdb-arrow + pyarrow.to_pandas(ArrowDtype). +# Both eagerly materialize the whole DataFrame. +# --------------------------------------------------------------------------- # + + +def test_write_pandas_numpy_numeric(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_NUM).df()) + + +def test_write_pandas_numpy_string(benchmark, con): + benchmark(lambda: con.sql(WRITE_Q_STR).df()) + + +# ADDED: the numpy-backed df() WRITE with REAL nulls -> the masked_array build + masked->pd.NA rewrite that the +# cutover reworked (a no-null column takes the cheap std::move path and would measure the wrong thing), plus a +# datetime column (TimestampConvert + ConvertDateTimeTypes). 
+
+
+def test_write_pandas_numpy_numeric_with_nulls(benchmark, con):
+    q = (
+        "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, "
+        f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)"
+    )
+    benchmark(lambda: con.sql(q).df())
+
+
+def test_write_pandas_numpy_timestamp(benchmark, con):
+    q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)"
+    benchmark(lambda: con.sql(q).df())
+
+
+def test_write_pandas_arrow_numeric(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
+
+
+def test_write_pandas_arrow_string(benchmark, con):
+    benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype))
diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py
new file mode 100644
index 00000000..cfe5c281
--- /dev/null
+++ b/benchmarks/test_produce_numpy_perf.py
@@ -0,0 +1,181 @@
+"""Standalone CodSpeed benchmark module for the COLUMNAR produce paths (duckdb -> numpy/pandas), i.e. df(),
+fetchnumpy(), fetch_df_chunk() — NOT integrated (not in pyproject, not in CI, not committed). Run under each
+build's interpreter and compare:
+
+    M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python
+    C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python
+    cd /Users/evert/projects/duckdb-python/wt-codspeed
+    $M -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+    $C -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider
+
+WHY THIS MODULE: the columnar OUT path (FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular) is exactly
+what the nanobind cutover reworked.
The under-covered cases are: (1) the WITH-NULLS branch (HAS_NULLS=true -> +masked_array build -> masked->pd.NA rewrite, array_wrapper.cpp / pyresult.cpp) -- NEVER previously benchmarked +and the most-changed code; (2) datetime; (3) fetchnumpy without the DataFrame wrap; (4) fetch_df_chunk; and +the wide-internal types HUGEINT (->double cast), UUID (UUIDConvert), DECIMAL(28,x) (ConvertDecimalInternal +) that exercise distinct OUT-col converters. + +GOTCHA (encoded below): OUT-col NULL benchmarks use REAL DuckDB nulls (CASE WHEN .. THEN NULL). A no-null +column silently takes the cheap std::move path and the masked-array branch never triggers, so it would measure +the wrong thing. + +FULL CONSUME: df() / fetchnumpy() eagerly materialize the whole column set; fetch_df_chunk is drained in a loop. + +numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. +""" + +import gc +import sys +import tracemalloc + +import duckdb +import numpy as np # noqa: F401 (pinned identically A/B; imported so the env matches the other modules) +import pytest + +N = 500_000 +TYPE_N = 200_000 # wide-internal types (hugeint/uuid/decimal128) are heavier per cell + +Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" +Q_NUM_NULLS = ( + "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " + f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)" +) +Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" +Q_TS = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)" +Q_HUGEINT = f"SELECT (i::HUGEINT * 1000000000000) AS h FROM range({TYPE_N}) t(i)" +Q_UUID = f"SELECT gen_random_uuid() AS u FROM range({TYPE_N}) t(i)" +Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)" + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +def _bench_df(benchmark, con, query): + con.sql(query).df() # 
warm + benchmark(lambda: con.sql(query).df()) + + +def _bench_numpy(benchmark, con, query): + con.sql(query).fetchnumpy() # warm + benchmark(lambda: con.sql(query).fetchnumpy()) + + +# --------------------------------------------------------------------------- # +# df(): the production NUMPY-backed columnar path. no-null vs REAL-null vs string vs timestamp. +# --------------------------------------------------------------------------- # + + +def test_df_numeric(benchmark, con): + _bench_df(benchmark, con, Q_NUM) + + +def test_df_numeric_with_nulls(benchmark, con): + # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch) + _bench_df(benchmark, con, Q_NUM_NULLS) + + +def test_df_string(benchmark, con): + _bench_df(benchmark, con, Q_STR) + + +def test_df_timestamp(benchmark, con): + _bench_df(benchmark, con, Q_TS) + + +def test_df_hugeint(benchmark, con): + _bench_df(benchmark, con, Q_HUGEINT) + + +def test_df_uuid(benchmark, con): + _bench_df(benchmark, con, Q_UUID) + + +def test_df_decimal128(benchmark, con): + _bench_df(benchmark, con, Q_DEC128) + + +# --------------------------------------------------------------------------- # +# fetchnumpy(): same FetchNumpyInternal without the DataFrame wrap. +# --------------------------------------------------------------------------- # + + +def test_fetchnumpy_numeric(benchmark, con): + _bench_numpy(benchmark, con, Q_NUM) + + +def test_fetchnumpy_numeric_with_nulls(benchmark, con): + _bench_numpy(benchmark, con, Q_NUM_NULLS) + + +# --------------------------------------------------------------------------- # +# fetch_df_chunk(): per-chunk DataFrame production, drained in a loop. 
+# --------------------------------------------------------------------------- # + + +def test_fetch_df_chunk_loop(benchmark, con): + def run(): + rel = con.sql(Q_NUM) + rows = 0 + while True: + chunk = rel.fetch_df_chunk() + if len(chunk) == 0: + break + rows += len(chunk) + return rows + + con.sql(Q_NUM).fetch_df_chunk() # warm + benchmark(run) + + +# --------------------------------------------------------------------------- # +# torch(): FetchNumpyInternal + per-column from_numpy. SKIPPED cleanly if torch is absent (identical A/B). +# --------------------------------------------------------------------------- # + + +def test_torch_numeric(benchmark, con): + pytest.importorskip("torch") + q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)" + con.sql(q).torch() # warm + benchmark(lambda: con.sql(q).torch()) + + +# --------------------------------------------------------------------------- # +# MEMORY GUARD (secondary signal, not a codspeed benchmark). codspeed walltime tracks neither memory nor +# allocations, and conversion regressions are often memory-shaped (the recorded fetchall list->tuple edge-copy; +# the df() masked_array branch). We use tracemalloc to capture the PEAK Python-tracked allocation of ONE +# df()-with-nulls call. Correctness notes: +# * reset_peak() is called AFTER the warm (and after freeing the warm result) so the warm does not establish +# a high-water mark that swallows the measured call -- the prior getrusage(ru_maxrss) version was broken +# precisely because ru_maxrss is monotonic and the warm pre-set the peak, making the delta ~0. +# * tracemalloc reports BYTES on every platform (no macOS-bytes / Linux-KiB skew that the getrusage version +# had), so the ceiling is portable to the Linux CI target. +# CAVEAT: tracemalloc only sees Python-level allocations; the raw numpy column buffers are allocated in C and +# are NOT visible here. 
So this catches a gross PYTHON-object-shaped blowup (the masked->pd.NA rewrite / a
+# per-row object materialization regression) but is not a total-RSS gate -- the authoritative CI gate for the
+# C-buffer payload is codspeed memory mode (--codspeed-mode=memory).
+# --------------------------------------------------------------------------- #
+
+
+def test_mem_df_with_nulls():
+    con = duckdb.connect()
+    try:
+        tracemalloc.start()
+        warm = con.sql(Q_NUM_NULLS).df()  # populate one-time import / type caches
+        del warm
+        gc.collect()
+        tracemalloc.reset_peak()  # discount the warm's transient peak BEFORE the measured call
+        out = con.sql(Q_NUM_NULLS).df()
+        _current, peak = tracemalloc.get_traced_memory()
+        del out
+    finally:
+        tracemalloc.stop()  # always stop tracing: a failed df() must not leave tracemalloc on for later benchmarks
+        con.close()
+    print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr)
+    # Python-tracked allocations for a 500k x 2-col masked df are a few MB; a gross conversion-memory blowup
+    # (e.g. a per-row Python object list, the masked->pd.NA rewrite gone wrong) is tens+ MB. 100 MB ceiling
+    # catches that without flaking, and is bytes on all platforms.
+    assert peak < 100_000_000
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
new file mode 100644
index 00000000..3e92f12d
--- /dev/null
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -0,0 +1,78 @@
+"""Standalone CodSpeed benchmark module: the TYPE x DIRECTION produce matrix — NOT integrated (not in
+pyproject, not in CI, not committed).
Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-codspeed + $M -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + $C -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + +WHY THIS MODULE: a single systematic sweep of one logical type per column across the three produce directions + * OUT-row = fetchall() -> FromValue per cell (python_objects.cpp) + * OUT-col = df() -> ArrayWrapper / ConvertColumn (array_wrapper.cpp) + * OUT-arrow = to_arrow_table() -> arrow export converters +so a regression localizes to (type, direction). Includes the iqmo/bareduckdb cross-check breadth that the +narrow-numeric homogeneous benchmarks miss: HUGEINT (PyLong_FromString / hugeint->double / int128 export), +UUID (uuid.UUID per row / UUIDConvert), DECIMAL(28,6) int128-internal (ConvertDecimalInternal), +and a long-varchar (>64 chars) that shifts the string paths from overhead-bound to copy-bound. + +FULL CONSUME: fetchall and df materialize everything; to_arrow_table is eager. NOTE: to_arrow_table on a +materialized result re-runs the query with the GIL released (PromoteMaterializedToArrow), so the OUT-arrow +column is engine-parallel and walltime-NOISY -- treat it as informational, not a hard gate. + +numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. 
+""" + +import duckdb +import pytest + +N = 100_000 + +# one logical type per column; long-varchar is intentionally > 64 chars +TYPE_EXPR = { + "int64": "i::BIGINT", + "double": "(i * 1.5)::DOUBLE", + "varchar_short": "('str_' || i)", + "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))", + "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)", + "decimal64": "((i::DECIMAL(18, 3)) / 1000)", + "decimal128": "((i * 1.5)::DECIMAL(28, 6))", + "hugeint": "(i::HUGEINT * 1000000000000)", + "uuid": "gen_random_uuid()", + "struct": "{'a': i, 'b': i + 1}", + "list": "[i, i + 1, i + 2]", +} +TYPES = list(TYPE_EXPR) + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +def _query(type_name): + return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)" + + +@pytest.mark.parametrize("type_name", TYPES) +def test_out_row_fetchall(benchmark, con, type_name): + q = _query(type_name) + con.execute(q).fetchall() # warm + benchmark(lambda: con.execute(q).fetchall()) + + +@pytest.mark.parametrize("type_name", TYPES) +def test_out_col_df(benchmark, con, type_name): + q = _query(type_name) + con.sql(q).df() # warm + benchmark(lambda: con.sql(q).df()) + + +@pytest.mark.parametrize("type_name", TYPES) +def test_out_arrow_table(benchmark, con, type_name): + # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy) + q = _query(type_name) + con.sql(q).to_arrow_table() # warm + benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py new file mode 100644 index 00000000..ef398ebb --- /dev/null +++ b/benchmarks/test_udf_perf.py @@ -0,0 +1,110 @@ +"""Standalone CodSpeed benchmark module for the Python UDF binding paths (src/python_udf.cpp) — NOT integrated +(not in pyproject, not in CI, not committed). 
Run under each build's interpreter and compare: + + M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python + C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python + cd /Users/evert/projects/duckdb-python/wt-codspeed + $M -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + $C -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + +WHY THIS MODULE: the whole UDF subsystem had ZERO benchmark coverage. The NATIVE scalar UDF is the single +biggest untested per-call-overhead path in the binding -- per row it builds a TupleBuilder of args, calls +PyObject_CallObject, and runs TransformPythonObject on the result (python_udf.cpp). The ARROW (vectorized) UDF +is the columnar counterpart: ConvertDataChunkToPyArrowTable + the Python call + ConvertArrowTableToVector cast. + +FULL CONSUME (same discipline as the other modules): every UDF benchmark wraps the call in a sum()/length() +aggregate so the ENGINE evaluates the UDF on every row (count(*) would skip it). The aggregate output is a +single row, so the measured cost is the per-row (native) / per-chunk (arrow) UDF invocation, not the fetch. + +numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. 
+""" + +import duckdb +import pytest +from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR + +pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") + +NATIVE_N = 200_000 # native = one Python call per row, keep moderate +ARROW_N = 1_000_000 # arrow = one Python call per chunk (vectorized), can be large + + +@pytest.fixture +def con(): + c = duckdb.connect() + yield c + c.close() + + +def _bench(benchmark, con, query): + con.execute(query).fetchall() # warm the engine + import caches before measuring + benchmark(lambda: con.execute(query).fetchall()) + + +# --------------------------------------------------------------------------- # +# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). +# --------------------------------------------------------------------------- # + + +def test_udf_native_int_1arg(benchmark, con): + con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) + _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)") + + +def test_udf_native_int_2arg(benchmark, con): + con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT) + _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)") + + +def test_udf_native_double_1arg(benchmark, con): + con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE) + _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)") + + +def test_udf_native_string(benchmark, con): + con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR) + _bench( + benchmark, + con, + f"SELECT sum(length(up(s))) FROM (SELECT ('str_value_' || i) AS s FROM range({NATIVE_N}) t(i))", + ) + + +def test_udf_native_null_inputs(benchmark, con): + # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the + # validity short-circuit, not the Python call, so the UDF only ever sees 
non-NULL rows. + con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) + _bench( + benchmark, + con, + "SELECT sum(add_one(v)) FROM " + f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({NATIVE_N}) t(i))", + ) + + +# --------------------------------------------------------------------------- # +# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. +# --------------------------------------------------------------------------- # + + +def test_udf_arrow_int(benchmark, con): + con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") + _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)") + + +def test_udf_arrow_double(benchmark, con): + con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow") + _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)") + + +def test_udf_arrow_null_inputs(benchmark, con): + # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call + # and reconstructs the result vector afterwards -- this is the selvec compaction/reconstruction cost. + con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") + _bench( + benchmark, + con, + "SELECT sum(arrow_add_one(v)) FROM " + f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({ARROW_N}) t(i))", + ) From 177d99b2f9a4593a42845904a9632f3419555128 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 1 Jul 2026 08:04:26 +0200 Subject: [PATCH 2/7] Redesign cardinality benchmark + add tokenless CodSpeed CI Cardinality: the ORDER BY ... LIMIT n sweep let the engine top-N sort dominate and swamp the per-row conversion signal (numbers were non-monotone). 
Replace it with a pre-materialized fixed source + plain LIMIT n (no sort): the scan early-stops at n rows, so rows-to-Python conversion is the dominant n-varying cost and the slope is monotone; the A/B delta at each n isolates the binding. CI: .github/workflows/codspeed.yml runs the suite under CodSpeed simulation (instruction-count) mode on Linux, tokenless (no dashboard upload; enable the hosted gate later via a CodSpeed project + OIDC/token). Instruction counts are deterministic for every benchmark, so no gated/informational split is needed. Not yet run in CI; the build steps mirror the dev build and need a shakeout. --- .github/workflows/codspeed.yml | 74 +++++++++++++++++++++++++++++ benchmarks/test_cardinality_perf.py | 49 ++++++++++--------- 2 files changed, 100 insertions(+), 23 deletions(-) create mode 100644 .github/workflows/codspeed.yml diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 00000000..fcd6b7ca --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,74 @@ +# Performance-regression benchmarks via CodSpeed, in deterministic instruction-count (simulation) mode. +# +# TOKENLESS FOR NOW: the CodSpeed action's token is only needed to UPLOAD results to the CodSpeed +# dashboard (the hosted PR-gate). Without it the action still RUNS every benchmark under Valgrind and +# reports the instruction counts in the job log. To turn on the hosted regression gate later: create a +# CodSpeed project for the repo and either (public repo) rely on the OIDC `id-token: write` permission +# below, or add a `CODSPEED_TOKEN` repo secret and pass `token: ${{ secrets.CODSPEED_TOKEN }}` to the action. +# +# Why simulation (instruction-count) and not walltime: instruction counts are deterministic even for the +# multi-threaded / engine-heavy paths (Valgrind serializes and counts), so the whole suite is gate-able and +# there is no need to split "gated vs informational" the way noisy local walltime required. 
Instruction count +# is exactly the signal that would have caught the LIST/ARRAY df() regression cleanly. +# +# NOTE: this workflow has not been run in CI yet; the build steps mirror the documented dev build (CLAUDE.md) +# and will likely need a shakeout run. Valgrind is slow (~20-50x); if the full suite is too slow, trim the +# largest-N benchmarks or run a curated subset. + +name: Benchmarks + +on: + pull_request: + push: + branches: [main] + workflow_dispatch: + +concurrency: + group: codspeed-${{ github.ref }} + cancel-in-progress: true + +jobs: + benchmarks: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write # enables tokenless (OIDC) upload once a CodSpeed project is linked; harmless otherwise + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive # the DuckDB engine submodule is needed to build + fetch-depth: 0 # setuptools_scm needs history for version detection + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.13" + + - name: Cache sccache + uses: actions/cache@v4 + with: + path: ~/.cache/sccache + key: sccache-codspeed-${{ hashFiles('external/duckdb') }} + restore-keys: sccache-codspeed- + + - name: Install sccache + run: | + curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \ + | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache + + - name: Build the extension (release) + benchmark deps + env: + CMAKE_C_COMPILER_LAUNCHER: sccache + CMAKE_CXX_COMPILER_LAUNCHER: sccache + run: | + uv sync --only-group build --no-install-project -p 3.13 + uv sync --no-build-isolation --no-editable --reinstall -p 3.13 + # benchmark deps: keep these pinned in lockstep with any baseline you compare against, so the only + # cross-run delta is the binding (numpy/pandas/pyarrow/polars/pytz + the codspeed plugin). 
+ uv pip install pytest pytest-codspeed numpy pandas pyarrow polars pytz + + - name: Run benchmarks (instruction-count) + uses: CodSpeedHQ/action@v4 + with: + mode: simulation + run: uv run pytest benchmarks/ --codspeed -o addopts= -p no:cacheprovider diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py index d4edda03..3fe4ee0d 100644 --- a/benchmarks/test_cardinality_perf.py +++ b/benchmarks/test_cardinality_perf.py @@ -1,5 +1,4 @@ -"""Standalone CodSpeed benchmark module: the RESULT-CARDINALITY (top-N) sweep — NOT integrated (not in -pyproject, not in CI, not committed). Run under each build's interpreter and compare: +"""Standalone CodSpeed benchmark: the RESULT-CARDINALITY (rows-to-Python) sweep. Run under each build: M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python @@ -7,41 +6,45 @@ $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider -WHY THIS MODULE (adopted from iqmo-org/bareduckdb): hold the SOURCE fixed and sweep only the number of rows -materialized to Python via ORDER BY ... LIMIT n for n in {100, 1k, 10k, 100k}, through fetchall / df / -to_arrow_table. The engine cost (scan the fixed SRC + top-N heap) stays ~constant, so the walltime delta -across n is dominated by the per-row binding conversion -> a clean per-row slope. The n=100 end is the -noise-free overhead regime (the natural instruction-count-gate point); the n=100k end is throughput. - -A clean monotone slope (and ~parity slope between the two builds) is the signal we report; a build whose slope -is steeper has a per-row conversion regression. Source held constant rules out scan-cost as the confound (a -cleaner axis than varying range(), which also changes scan cost). 
- -numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. +REDESIGN NOTE: the first version swept `ORDER BY a DESC LIMIT n` over a fixed source. That was wrong: +the engine's full top-N SORT (~3-14ms, itself variable) dominated and swamped the per-row conversion +signal, and the numbers came out non-monotone. This version pre-materializes the fixed source table ONCE +and sweeps `SELECT * FROM src LIMIT n` with NO ORDER BY: a plain LIMIT early-stops the scan at n rows, so +the engine cost is cheap and monotone in n, and the rows-to-Python CONVERSION is the dominant n-varying +cost. That gives a clean, monotone per-row slope; the A/B delta at each n isolates the binding, and a build +whose slope is steeper has a per-row conversion regression. n=100 is the overhead regime (the natural +instruction-count-gate point); n=100_000 is throughput. + +3 columns (BIGINT, DOUBLE, VARCHAR) so per-row conversion is non-trivial. numpy/pandas/pyarrow are pinned to +the SAME versions in both .venv-release, so the A/B delta is purely the binding. """ import duckdb import pytest -SRC = 200_000 # fixed source size -> constant engine scan + top-N across all n +SRC_ROWS = 200_000 LIMITS = [100, 1_000, 10_000, 100_000] -# 3 columns (BIGINT, DOUBLE, VARCHAR) so the per-row conversion is non-trivial; source is a fixed inline -# subquery (no table state) and ORDER BY forces a full scan + top-N of the same SRC rows every time. -_SRC_SUBQ = f"(SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC}) t(i))" - - -def _query(n): - return f"SELECT a, b, s FROM {_SRC_SUBQ} ORDER BY a DESC LIMIT {n}" - -@pytest.fixture +@pytest.fixture(scope="module") def con(): + # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be + # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows. 
c = duckdb.connect() + c.execute( + "CREATE TABLE src AS " + f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)" + ) yield c c.close() +def _query(n): + # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the + # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort). + return f"SELECT a, b, s FROM src LIMIT {n}" + + @pytest.mark.parametrize("n", LIMITS) def test_limit_fetchall(benchmark, con, n): q = _query(n) From 4f24ed7371e202fed509c741041c9f77eb4aba6c Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 1 Jul 2026 13:01:05 +0200 Subject: [PATCH 3/7] fix benchmarks and add workflow --- .github/workflows/codspeed.yml | 21 ++--- benchmarks/test_arrow_perf.py | 88 +++++++++++++-------- benchmarks/test_cardinality_perf.py | 54 +++++++------ benchmarks/test_fetch_perf.py | 98 ++++++++++++++--------- benchmarks/test_ingest_native_perf.py | 76 +++++++++++------- benchmarks/test_ingest_numpy_perf.py | 81 +++++++++++-------- benchmarks/test_pandas_perf.py | 101 +++++++++++++++--------- benchmarks/test_produce_numpy_perf.py | 89 ++++++++++++--------- benchmarks/test_types_roundtrip_perf.py | 58 +++++++------- benchmarks/test_udf_perf.py | 63 +++++++++------ pyproject.toml | 1 + 11 files changed, 431 insertions(+), 299 deletions(-) diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index fcd6b7ca..c82465d2 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -1,19 +1,12 @@ -# Performance-regression benchmarks via CodSpeed, in deterministic instruction-count (simulation) mode. +# Perf-regression benchmarks via CodSpeed in instruction-count (simulation) mode: deterministic, so the whole +# suite is gate-able (no walltime noise, no gated/informational split). 
# -# TOKENLESS FOR NOW: the CodSpeed action's token is only needed to UPLOAD results to the CodSpeed -# dashboard (the hosted PR-gate). Without it the action still RUNS every benchmark under Valgrind and -# reports the instruction counts in the job log. To turn on the hosted regression gate later: create a -# CodSpeed project for the repo and either (public repo) rely on the OIDC `id-token: write` permission -# below, or add a `CODSPEED_TOKEN` repo secret and pass `token: ${{ secrets.CODSPEED_TOKEN }}` to the action. +# TOKENLESS: the token is only for uploading to the CodSpeed dashboard. Without it the action still runs every +# benchmark and reports counts in the job log. For the hosted gate later, create a CodSpeed project and rely on +# the OIDC id-token permission below (public repo), or add a CODSPEED_TOKEN secret and pass token: to the action. # -# Why simulation (instruction-count) and not walltime: instruction counts are deterministic even for the -# multi-threaded / engine-heavy paths (Valgrind serializes and counts), so the whole suite is gate-able and -# there is no need to split "gated vs informational" the way noisy local walltime required. Instruction count -# is exactly the signal that would have caught the LIST/ARRAY df() regression cleanly. -# -# NOTE: this workflow has not been run in CI yet; the build steps mirror the documented dev build (CLAUDE.md) -# and will likely need a shakeout run. Valgrind is slow (~20-50x); if the full suite is too slow, trim the -# largest-N benchmarks or run a curated subset. +# Not yet run in CI; the build mirrors the dev build (CLAUDE.md) and will need a shakeout. Valgrind is slow +# (~20-50x); trim the largest-N benchmarks if the suite is too slow. 
name: Benchmarks diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py index e6fc43e0..244663bc 100644 --- a/benchmarks/test_arrow_perf.py +++ b/benchmarks/test_arrow_perf.py @@ -1,41 +1,46 @@ -"""Standalone CodSpeed benchmark module for the Arrow read/write binding paths — NOT integrated -(not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: - - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python - cd /Users/evert/projects/duckdb-python/wt-cutover - $M -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts= - $C -m pytest benchmarks/test_arrow_perf.py --codspeed --codspeed-mode=walltime -o addopts= - -DESIGN — the data must be FULLY MOVED, not lazily wrapped, or the benchmark measures nothing: - * READ (arrow -> duckdb): the duckdb ENGINE must scan every value. We aggregate over the actual - columns (sum/length), NOT count(*) -- count(*) is answered from arrow metadata without touching data. - * WRITE (duckdb -> arrow): the CONSUMER must materialize everything. - - to_arrow_table() / pl() are EAGER (the full table / polars DataFrame is built). - - to_arrow_reader() is LAZY -- duckdb only produces a batch when it is pulled -- so we iterate the - whole stream to actually exercise and consume the write path. - -pyarrow/polars are pinned to the SAME version in both .venv-release, so the A/B delta is purely the binding. +"""CodSpeed benchmark: Arrow read/write paths. Standalone, not in CI. 
+ +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_arrow_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +Data must be fully moved or nothing is measured: READ aggregates over real columns (sum/length, not count(*), +which arrow answers from metadata); WRITE materializes the result (to_arrow_reader is lazy, so it is drained). """ -import duckdb +from __future__ import annotations + +from typing import TYPE_CHECKING + import pyarrow as pa import pytest +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + N = 500_000 WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)" WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)" @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() @pytest.fixture(scope="module") -def arrow_numeric(): +def arrow_numeric() -> pa.Table: + """Return a two-column numeric arrow table.""" return pa.table( { "a": pa.array(range(N), type=pa.int64()), @@ -45,12 +50,14 @@ def arrow_numeric(): @pytest.fixture(scope="module") -def arrow_string(): +def arrow_string() -> pa.Table: + """Return a single-column string arrow table.""" return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())}) @pytest.fixture(scope="module") -def arrow_numeric_batches(arrow_numeric): +def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]: + """Return the schema and record batches for the numeric table.""" # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round return 
arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000) @@ -60,12 +67,16 @@ def arrow_numeric_batches(arrow_numeric): # --------------------------------------------------------------------------- # -def test_read_arrow_numeric(benchmark, con, arrow_numeric): +def test_read_arrow_numeric( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table +) -> None: + """Benchmark scanning a numeric arrow table.""" con.register("t_num", arrow_numeric) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()) -def test_read_arrow_string(benchmark, con, arrow_string): +def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None: + """Benchmark scanning a string arrow table.""" con.register("t_str", arrow_string) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()) @@ -75,10 +86,15 @@ def test_read_arrow_string(benchmark, con, arrow_string): # full scan of every value. 
-def test_read_arrow_reader_numeric(benchmark, con, arrow_numeric_batches): +def test_read_arrow_reader_numeric( + benchmark: BenchmarkFixture, + con: duckdb.DuckDBPyConnection, + arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]], +) -> None: + """Benchmark scanning a streaming record-batch reader.""" schema, batches = arrow_numeric_batches - def run(): + def run() -> list: reader = pa.RecordBatchReader.from_batches(schema, iter(batches)) con.register("t_rdr", reader) return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall() @@ -92,16 +108,20 @@ def run(): # --------------------------------------------------------------------------- # -def test_write_arrow_table_numeric(benchmark, con): +def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a numeric result to an arrow table.""" benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table()) -def test_write_arrow_table_string(benchmark, con): +def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a string result to an arrow table.""" benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table()) -def test_write_arrow_reader_consumed(benchmark, con): - def run(): +def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark draining a lazy arrow record-batch reader.""" + + def run() -> int: reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000) rows = 0 for batch in reader: # drain the lazy stream so duckdb actually produces every batch @@ -111,9 +131,11 @@ def run(): benchmark(run) -def test_write_polars_numeric(benchmark, con): +def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a numeric result to a polars frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).pl()) -def test_write_polars_string(benchmark, con): +def 
test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a string result to a polars frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).pl()) diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py index 3fe4ee0d..bf49dfc1 100644 --- a/benchmarks/test_cardinality_perf.py +++ b/benchmarks/test_cardinality_perf.py @@ -1,33 +1,38 @@ -"""Standalone CodSpeed benchmark: the RESULT-CARDINALITY (rows-to-Python) sweep. Run under each build: +"""CodSpeed benchmark: the result-cardinality (rows-to-Python) sweep. Standalone, not in CI. - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_cardinality_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - -REDESIGN NOTE: the first version swept `ORDER BY a DESC LIMIT n` over a fixed source. That was wrong: -the engine's full top-N SORT (~3-14ms, itself variable) dominated and swamped the per-row conversion -signal, and the numbers came out non-monotone. This version pre-materializes the fixed source table ONCE -and sweeps `SELECT * FROM src LIMIT n` with NO ORDER BY: a plain LIMIT early-stops the scan at n rows, so -the engine cost is cheap and monotone in n, and the rows-to-Python CONVERSION is the dominant n-varying -cost. That gives a clean, monotone per-row slope; the A/B delta at each n isolates the binding, and a build -whose slope is steeper has a per-row conversion regression. n=100 is the overhead regime (the natural -instruction-count-gate point); n=100_000 is throughput. 
- -3 columns (BIGINT, DOUBLE, VARCHAR) so per-row conversion is non-trivial. numpy/pandas/pyarrow are pinned to -the SAME versions in both .venv-release, so the A/B delta is purely the binding. + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_cardinality_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +Sweeps `SELECT * FROM src LIMIT n` (no ORDER BY) over a pre-materialized 3-column source: a plain LIMIT +early-stops the scan, so the per-row conversion dominates and the slope is monotone in n. A steeper slope on +one build is a per-row conversion regression. n=100 is the overhead regime, n=100_000 is throughput. +(An earlier ORDER BY version was dropped: the top-N sort swamped the signal.) """ -import duckdb +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + SRC_ROWS = 200_000 LIMITS = [100, 1_000, 10_000, 100_000] @pytest.fixture(scope="module") -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a connection over a once-materialized source table.""" # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows. c = duckdb.connect() @@ -39,28 +44,31 @@ def con(): c.close() -def _query(n): +def _query(n: int) -> str: # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort). 
return f"SELECT a, b, s FROM src LIMIT {n}" @pytest.mark.parametrize("n", LIMITS) -def test_limit_fetchall(benchmark, con, n): +def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + """Benchmark fetchall over a LIMIT n sweep.""" q = _query(n) con.execute(q).fetchall() # warm benchmark(lambda: con.execute(q).fetchall()) @pytest.mark.parametrize("n", LIMITS) -def test_limit_df(benchmark, con, n): +def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + """Benchmark df() over a LIMIT n sweep.""" q = _query(n) con.sql(q).df() # warm benchmark(lambda: con.sql(q).df()) @pytest.mark.parametrize("n", LIMITS) -def test_limit_to_arrow(benchmark, con, n): +def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + """Benchmark to_arrow_table() over a LIMIT n sweep.""" q = _query(n) con.sql(q).to_arrow_table() # warm benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py index 8c8ef20a..94a53c30 100644 --- a/benchmarks/test_fetch_perf.py +++ b/benchmarks/test_fetch_perf.py @@ -1,57 +1,70 @@ -"""Standalone CodSpeed benchmark module — NOT integrated (not in pyproject, not in CI, not committed). +"""CodSpeed benchmark: row fetch paths (fetchall, fetchone iteration, expression construction). Standalone, not in CI. -Purpose: A/B the binding-layer perf between the two builds (pybind11 `main` vs nanobind cutover), in particular -the narrow-column `fetchall` regression. 
Run the SAME file under each build's interpreter and compare: +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_fetch_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python - cd /Users/evert/projects/duckdb-python/wt-cutover - $M -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts= - $C -m pytest benchmarks/test_fetch_perf.py --codspeed --codspeed-mode=walltime -o addopts= - -NOTE: macOS arm64 has no Valgrind, so only `--codspeed-mode=walltime` works locally (wall-clock stats). The -deterministic instruction-count mode (`--codspeed-mode=simulation`) needs Linux + the CodSpeed instrument -(CI, or `codspeed run` in a Linux container). In CI/cloud, CodSpeed compares each run against a git baseline; -locally we get the same benchmark workflow but A/B by running the file under the two interpreters by hand. +Only walltime works locally (no Valgrind on macOS arm64); the deterministic instruction-count mode needs Linux (CI). +Walltime is noisy on sub-ms benchmarks. 
""" -import duckdb +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() -def _bench_fetchall(benchmark, con, query): +def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.execute(query).fetchall() # warm the engine before measuring benchmark(lambda: con.execute(query).fetchall()) -def test_fetchall_int(benchmark, con): +def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a single BIGINT column.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)") -def test_fetchall_smallint(benchmark, con): +def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a single INTEGER column.""" _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)") -def test_fetchall_double(benchmark, con): +def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a single DOUBLE column.""" _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM range(200000) t(i)") -def test_fetchall_2int(benchmark, con): +def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of two BIGINT columns.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)") -def test_fetchall_str(benchmark, con): +def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a single VARCHAR column.""" 
_bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)") -def test_fetchall_mixed(benchmark, con): +def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a mixed scalar/list/struct row.""" query = ( "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, " "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)" @@ -59,10 +72,11 @@ def test_fetchall_mixed(benchmark, con): _bench_fetchall(benchmark, con, query) -def test_fetchone_iter(benchmark, con): +def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark iterating a result one row at a time with fetchone.""" query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" - def run(): + def run() -> None: rel = con.execute(query) while rel.fetchone() is not None: pass @@ -78,35 +92,40 @@ def run(): # --------------------------------------------------------------------------- # -def test_fetchall_int_gate(benchmark, con): +def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark the small-N BIGINT instruction-count gate.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)") -def test_fetchall_2int_gate(benchmark, con): +def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark the small-N two-BIGINT instruction-count gate.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)") -def test_fetchall_null_heavy(benchmark, con): - _bench_fetchall( - benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)" - ) +def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a half-NULL BIGINT column.""" + _bench_fetchall(benchmark, con, "SELECT CASE WHEN i % 2 = 
0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)") -def test_fetchall_timestamptz(benchmark, con): +def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a TIMESTAMPTZ column.""" _bench_fetchall( benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)" ) -def test_fetchall_decimal128(benchmark, con): +def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a 128-bit DECIMAL column.""" _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)") -def test_fetchall_blob(benchmark, con): +def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a BLOB column.""" _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)") -def test_fetchall_mixed_wide(benchmark, con): +def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchall of a heterogeneous wide-type row.""" # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile # from the homogeneous single-type columns above) query = ( @@ -116,10 +135,11 @@ def test_fetchall_mixed_wide(benchmark, con): _bench_fetchall(benchmark, con, query) -def test_fetchmany_batched(benchmark, con): +def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark draining a result with batched fetchmany.""" query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" - def run(): + def run() -> None: rel = con.execute(query) while True: rows = rel.fetchmany(10_000) @@ -129,8 +149,10 @@ def run(): benchmark(run) -def test_expr_many(benchmark): - def run(): +def test_expr_many(benchmark: BenchmarkFixture) -> None: + """Benchmark building many 
column/constant expressions.""" + + def run() -> int: out = [] for i in range(2000): col = duckdb.ColumnExpression(f"col_{i}") diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py index 4fca641a..e3f232cc 100644 --- a/benchmarks/test_ingest_native_perf.py +++ b/benchmarks/test_ingest_native_perf.py @@ -1,57 +1,63 @@ -"""Standalone CodSpeed benchmark module for NATIVE Python-object ingest (Python list/tuple/dict -> duckdb) — -NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: +"""CodSpeed benchmark: native Python-object ingest (list/tuple/dict -> duckdb). Standalone, not in CI. - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_ingest_native_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_ingest_native_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one +list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells. +executemany writes to a real table (CREATE OR REPLACE each round so it doesn't grow across repeats). +""" -WHY THIS MODULE: native Python-object ingest had ZERO coverage. 
Every cell goes through TransformPythonValue -and the GetPythonObjectType ladder (python_conversion.cpp); dicts recurse through TransformDictionaryToStruct; -executemany re-binds a parameter set per row (pyconnection.cpp ExecuteMany loop). +from __future__ import annotations -FULL MATERIALIZE: executemany lands N rows in a real table (CREATE OR REPLACE each round so the table does not -grow across codspeed's repeated invocations). values() builds the value vectors EAGERLY inside the call -(TransformPythonParamList), and we drain the resulting relation with fetchall so the round-trip is complete. +from typing import TYPE_CHECKING -NOTE on values() shape: a single list argument to values() becomes ONE row whose COLUMNS are the list items -(see DuckDBPyConnection::Values, pyconnection.cpp) -- so a list of N scalars is 1 row x N columns and runs -TransformPythonValue N times; a list of N tuples is 1 row x N nested(LIST) columns; a list of N dicts is -1 row x N STRUCT columns (TransformDictionaryToStruct). All three exercise the per-cell transform N times. 
-""" +import pytest import duckdb -import pytest + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture EXECMANY_N = 20_000 # executemany re-binds + executes per row, keep moderate WIDE_N = 10_000 # values() builds a 1-row x N-col relation; cap N so the binder stays sane @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() @pytest.fixture(scope="module") -def rows_3col(): +def rows_3col() -> list[tuple[int, float, str]]: + """Return parameter rows for a 3-column executemany.""" return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)] @pytest.fixture(scope="module") -def scalars_wide(): - return [i for i in range(WIDE_N)] +def scalars_wide() -> list[int]: + """Return a wide row of scalar ints for values().""" + return list(range(WIDE_N)) @pytest.fixture(scope="module") -def tuples_wide(): +def tuples_wide() -> list[tuple[int, int, int]]: + """Return a wide row of tuples for values().""" return [(i, i + 1, i + 2) for i in range(WIDE_N)] @pytest.fixture(scope="module") -def dicts_wide(): +def dicts_wide() -> list[dict[str, int | str]]: + """Return a wide row of dicts for values().""" return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)] @@ -60,11 +66,14 @@ def dicts_wide(): # --------------------------------------------------------------------------- # -def test_ingest_executemany_3col(benchmark, con, rows_3col): +def test_ingest_executemany_3col( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]] +) -> None: + """Benchmark executemany INSERT of 3-column rows.""" con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)") con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col) # warm - def run(): + def run() -> None: con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)") 
con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col) @@ -76,18 +85,27 @@ def run(): # --------------------------------------------------------------------------- # -def test_ingest_values_scalars(benchmark, con, scalars_wide): +def test_ingest_values_scalars( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int] +) -> None: + """Benchmark values() over a wide row of scalars.""" con.values(scalars_wide).fetchall() # warm benchmark(lambda: con.values(scalars_wide).fetchall()) -def test_ingest_values_tuples(benchmark, con, tuples_wide): +def test_ingest_values_tuples( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]] +) -> None: + """Benchmark values() over a wide row of tuples.""" # each tuple cell -> LIST value (TransformPythonValue recursion) con.values(tuples_wide).fetchall() # warm benchmark(lambda: con.values(tuples_wide).fetchall()) -def test_ingest_values_dicts(benchmark, con, dicts_wide): +def test_ingest_values_dicts( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]] +) -> None: + """Benchmark values() over a wide row of dicts.""" # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion) con.values(dicts_wide).fetchall() # warm benchmark(lambda: con.values(dicts_wide).fetchall()) diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py index bb5fc1e8..abbe2a4d 100644 --- a/benchmarks/test_ingest_numpy_perf.py +++ b/benchmarks/test_ingest_numpy_perf.py @@ -1,31 +1,32 @@ -"""Standalone CodSpeed benchmark module for the NUMPY ingest paths (numpy / numpy-backed pandas -> duckdb) -— NOT integrated (not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: +"""CodSpeed benchmark: numpy ingest paths (numpy / numpy-backed pandas -> duckdb). Standalone, not in CI. 
- M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_ingest_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - -WHY THIS MODULE: the numpy scan (NumpyScan / NumpyArray facade / RawArrayWrapper / pandas-bind / analyzer) is -the IN-numpy half the nanobind cutover reworked, and several of its branches were untested: - * I0-2 object-string scan: the per-row isinstance + PyUnicodeIsCompactASCII zero-copy vs DecodePythonUnicode - transcode ladder (numpy_scan.cpp). GOTCHA (encoded): a meaningful benchmark MUST mix ASCII + non-ASCII + - a null sentinel -- ASCII-only misses the transcode + null-detection ladder entirely. - * I0-1 double NaN->NULL loop (numpy_scan.cpp) -- the reworked float path. - * NULL-heavy masked scan: ScanNumpyMasked + ApplyMask (pandas nullable Int64). - * I1-3 analyzer bind: PandasAnalyzer::Analyze samples rows through the GetItemType ladder. This is a per-BIND - cost, independent of row count, so it is the ONE place count(*) is the correct consume (the cost is at bind, - not scan); every other READ here aggregates over real columns (sum/length) to force a full engine scan. - * I1-8 numpy ndarray / dict-of-arrays via the replacement scan (resolved from a module global). - -numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. 
+ for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_ingest_numpy_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +Covers the object-string scan (ASCII zero-copy vs transcode ladder), the NaN->NULL float loop, the masked +scan, and analyzer bind. Gotchas: the object-string benchmark MUST mix ASCII + non-ASCII + a null or it misses +the ladder; analyzer bind is the one place count(*) is correct (cost is at bind, not scan) while every other +READ aggregates over real columns. """ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + import duckdb import numpy as np import pandas as pd -import pytest + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture N = 500_000 ANALYZER_N = 200_000 @@ -42,26 +43,30 @@ @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() @pytest.fixture(scope="module") -def df_double_with_nan(): +def df_double_with_nan() -> pd.DataFrame: + """Return a numpy-backed double frame with real NaNs.""" a = np.arange(N, dtype="float64") * 1.5 a[::10] = np.nan # real NaNs -> NaN->NULL conversion loop return pd.DataFrame({"a": a}) @pytest.fixture(scope="module") -def df_object_string_mixed(): +def df_object_string_mixed() -> pd.DataFrame: + """Return an object-string frame mixing ASCII, non-ASCII, and nulls.""" return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)}) @pytest.fixture(scope="module") -def df_masked_int(): +def df_masked_int() -> pd.DataFrame: + """Return a nullable-Int64 frame that scans masked.""" # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask arr = pd.array(np.arange(N), dtype="Int64") arr[::10] = pd.NA @@ -69,7 +74,8 @@ def df_masked_int(): @pytest.fixture(scope="module") -def 
df_object_mixed_types(): +def df_object_mixed_types() -> pd.DataFrame: + """Return an object frame of mixed python types for analyzer bind.""" return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)}) @@ -78,21 +84,31 @@ def df_object_mixed_types(): # --------------------------------------------------------------------------- # -def test_read_numpy_dict_numeric(benchmark, con): +def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark scanning a numpy-dict replacement scan.""" benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall()) -def test_read_numpy_double_with_nan(benchmark, con, df_double_with_nan): +def test_read_numpy_double_with_nan( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame +) -> None: + """Benchmark scanning a numpy double column with NaNs.""" con.register("t", df_double_with_nan) benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) -def test_read_numpy_masked_int(benchmark, con, df_masked_int): +def test_read_numpy_masked_int( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame +) -> None: + """Benchmark scanning a masked nullable-int column.""" con.register("t", df_masked_int) benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) -def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed): +def test_read_numpy_object_string_mixed( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame +) -> None: + """Benchmark scanning a mixed object-string column.""" con.register("t", df_object_string_mixed) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) @@ -103,7 +119,10 @@ def test_read_numpy_object_string_mixed(benchmark, con, df_object_string_mixed): # --------------------------------------------------------------------------- # -def 
test_bind_analyzer_object(benchmark, con, df_object_mixed_types): +def test_bind_analyzer_object( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame +) -> None: + """Benchmark the analyzer bind of a mixed-type object column.""" con.register("t", df_object_mixed_types) con.execute("SELECT count(*) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall()) diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py index 34a0948d..1a4c09f0 100644 --- a/benchmarks/test_pandas_perf.py +++ b/benchmarks/test_pandas_perf.py @@ -1,32 +1,32 @@ -"""Standalone CodSpeed benchmark module for the pandas read/write binding paths, comparing NUMPY-backed vs -ARROW-backed DataFrames — NOT integrated (not in pyproject, not in CI, not committed). Run under each build: - - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-cutover/.venv-release/bin/python - cd /Users/evert/projects/duckdb-python/wt-cutover - $M -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts= - $C -m pytest benchmarks/test_pandas_perf.py --codspeed --codspeed-mode=walltime -o addopts= - -WHY BOTH BACKINGS: when duckdb scans a pandas DataFrame, the binding path depends on each column's backing: - * numpy-backed columns (dtype int64 / float64 / object) -> the NUMPY scan path (NumpyArray facade, - RawArrayWrapper, pandas/bind.cpp, analyzer.cpp) -- this is the path the nanobind cutover reworked - NON-TRIVIALLY, so it gets first-class coverage here. - * arrow-backed columns (pandas ArrowDtype, e.g. int64[pyarrow]) -> the ARROW scan path (near zero-copy). -On the WRITE side, duckdb's native pandas output (rel.df()) is NUMPY-backed; an arrow-backed pandas frame is -produced via duckdb-arrow + pyarrow.to_pandas(ArrowDtype) (pyarrow.to_pandas is identical on both builds, so -the A/B delta is still the duckdb binding). 
- -FULL CONSUME (same discipline as the arrow module): READ aggregates over the actual columns (sum/length, NOT -count(*) which is answered from metadata), and WRITE materializes the entire DataFrame. - -numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. +"""CodSpeed benchmark: pandas read/write, numpy-backed vs arrow-backed DataFrames. Standalone, not in CI. + +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_pandas_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +The binding path depends on column backing: numpy-backed columns take the NumpyArray scan path, arrow-backed +(pandas ArrowDtype) take the near-zero-copy arrow path. Full consume: READ aggregates over real columns (not +count(*)), WRITE materializes the whole frame. 
""" +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pytest + import duckdb import numpy as np import pandas as pd -import pyarrow as pa -import pytest + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture N = 500_000 WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)" @@ -35,25 +35,29 @@ @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() @pytest.fixture(scope="module") -def df_numpy_numeric(): +def df_numpy_numeric() -> pd.DataFrame: + """Return a numpy-backed numeric frame.""" return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}) @pytest.fixture(scope="module") -def df_numpy_string(): +def df_numpy_string() -> pd.DataFrame: + """Return a numpy-backed object-string frame.""" # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path) return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)}) @pytest.fixture(scope="module") -def df_arrow_numeric(): +def df_arrow_numeric() -> pd.DataFrame: + """Return an arrow-backed numeric frame.""" return pd.DataFrame( { "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())), @@ -63,7 +67,8 @@ def df_arrow_numeric(): @pytest.fixture(scope="module") -def df_arrow_string(): +def df_arrow_string() -> pd.DataFrame: + """Return an arrow-backed string frame.""" return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))}) @@ -72,22 +77,34 @@ def df_arrow_string(): # --------------------------------------------------------------------------- # -def test_read_pandas_numpy_numeric(benchmark, con, df_numpy_numeric): +def test_read_pandas_numpy_numeric( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame +) -> None: + 
"""Benchmark scanning a numpy-backed numeric frame.""" con.register("t", df_numpy_numeric) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) -def test_read_pandas_numpy_string(benchmark, con, df_numpy_string): +def test_read_pandas_numpy_string( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame +) -> None: + """Benchmark scanning a numpy-backed string frame.""" con.register("t", df_numpy_string) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) -def test_read_pandas_arrow_numeric(benchmark, con, df_arrow_numeric): +def test_read_pandas_arrow_numeric( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame +) -> None: + """Benchmark scanning an arrow-backed numeric frame.""" con.register("t", df_arrow_numeric) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) -def test_read_pandas_arrow_string(benchmark, con, df_arrow_string): +def test_read_pandas_arrow_string( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame +) -> None: + """Benchmark scanning an arrow-backed string frame.""" con.register("t", df_arrow_string) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) @@ -99,11 +116,13 @@ def test_read_pandas_arrow_string(benchmark, con, df_arrow_string): # --------------------------------------------------------------------------- # -def test_write_pandas_numpy_numeric(benchmark, con): +def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a numeric result to a numpy-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).df()) -def test_write_pandas_numpy_string(benchmark, con): +def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a string result to a numpy-backed frame.""" 
benchmark(lambda: con.sql(WRITE_Q_STR).df()) @@ -112,7 +131,8 @@ def test_write_pandas_numpy_string(benchmark, con): # datetime column (TimestampConvert + ConvertDateTimeTypes). -def test_write_pandas_numpy_numeric_with_nulls(benchmark, con): +def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a null-heavy numeric result to a numpy-backed frame.""" q = ( "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " "CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range(500000) t(i)" @@ -120,14 +140,17 @@ def test_write_pandas_numpy_numeric_with_nulls(benchmark, con): benchmark(lambda: con.sql(q).df()) -def test_write_pandas_numpy_timestamp(benchmark, con): +def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a timestamp result to a numpy-backed frame.""" q = "SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range(500000) t(i)" benchmark(lambda: con.sql(q).df()) -def test_write_pandas_arrow_numeric(benchmark, con): +def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a numeric result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) -def test_write_pandas_arrow_string(benchmark, con): +def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark materializing a string result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py index cfe5c281..5ad56254 100644 --- a/benchmarks/test_produce_numpy_perf.py +++ b/benchmarks/test_produce_numpy_perf.py @@ -1,36 +1,33 @@ -"""Standalone 
CodSpeed benchmark module for the COLUMNAR produce paths (duckdb -> numpy/pandas), i.e. df(), -fetchnumpy(), fetch_df_chunk() — NOT integrated (not in pyproject, not in CI, not committed). Run under each -build's interpreter and compare: +"""CodSpeed benchmark: columnar produce paths (df(), fetchnumpy(), fetch_df_chunk()). Standalone, not in CI. - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_produce_numpy_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - -WHY THIS MODULE: the columnar OUT path (FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular) is exactly -what the nanobind cutover reworked. The under-covered cases are: (1) the WITH-NULLS branch (HAS_NULLS=true -> -masked_array build -> masked->pd.NA rewrite, array_wrapper.cpp / pyresult.cpp) -- NEVER previously benchmarked -and the most-changed code; (2) datetime; (3) fetchnumpy without the DataFrame wrap; (4) fetch_df_chunk; and -the wide-internal types HUGEINT (->double cast), UUID (UUIDConvert), DECIMAL(28,x) (ConvertDecimalInternal -) that exercise distinct OUT-col converters. - -GOTCHA (encoded below): OUT-col NULL benchmarks use REAL DuckDB nulls (CASE WHEN .. THEN NULL). A no-null -column silently takes the cheap std::move path and the masked-array branch never triggers, so it would measure -the wrong thing. - -FULL CONSUME: df() / fetchnumpy() eagerly materialize the whole column set; fetch_df_chunk is drained in a loop. - -numpy/pandas are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. 
+ for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_produce_numpy_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +Covers the with-NULLS branch (masked_array build), datetime, and wide-internal types (hugeint/uuid/decimal128). +Gotcha: NULL benchmarks use real DuckDB nulls (CASE WHEN); a no-null column takes the cheap path and measures +the wrong thing. Full consume: df()/fetchnumpy() materialize the columns; fetch_df_chunk is drained in a loop. """ +from __future__ import annotations + import gc import sys import tracemalloc +from typing import TYPE_CHECKING + +import pytest import duckdb import numpy as np # noqa: F401 (pinned identically A/B; imported so the env matches the other modules) -import pytest + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture N = 500_000 TYPE_N = 200_000 # wide-internal types (hugeint/uuid/decimal128) are heavier per cell @@ -48,18 +45,19 @@ @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() -def _bench_df(benchmark, con, query): +def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.sql(query).df() # warm benchmark(lambda: con.sql(query).df()) -def _bench_numpy(benchmark, con, query): +def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.sql(query).fetchnumpy() # warm benchmark(lambda: con.sql(query).fetchnumpy()) @@ -69,32 +67,39 @@ def _bench_numpy(benchmark, con, query): # --------------------------------------------------------------------------- # -def test_df_numeric(benchmark, con): +def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a numeric result.""" _bench_df(benchmark, con, Q_NUM) -def 
test_df_numeric_with_nulls(benchmark, con): +def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a null-heavy numeric result.""" # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch) _bench_df(benchmark, con, Q_NUM_NULLS) -def test_df_string(benchmark, con): +def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a string result.""" _bench_df(benchmark, con, Q_STR) -def test_df_timestamp(benchmark, con): +def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a timestamp result.""" _bench_df(benchmark, con, Q_TS) -def test_df_hugeint(benchmark, con): +def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a hugeint result.""" _bench_df(benchmark, con, Q_HUGEINT) -def test_df_uuid(benchmark, con): +def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a uuid result.""" _bench_df(benchmark, con, Q_UUID) -def test_df_decimal128(benchmark, con): +def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark df() of a 128-bit decimal result.""" _bench_df(benchmark, con, Q_DEC128) @@ -103,11 +108,13 @@ def test_df_decimal128(benchmark, con): # --------------------------------------------------------------------------- # -def test_fetchnumpy_numeric(benchmark, con): +def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchnumpy() of a numeric result.""" _bench_numpy(benchmark, con, Q_NUM) -def test_fetchnumpy_numeric_with_nulls(benchmark, con): +def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark fetchnumpy() of a null-heavy numeric result.""" 
_bench_numpy(benchmark, con, Q_NUM_NULLS) @@ -116,8 +123,10 @@ def test_fetchnumpy_numeric_with_nulls(benchmark, con): # --------------------------------------------------------------------------- # -def test_fetch_df_chunk_loop(benchmark, con): - def run(): +def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark draining a result with fetch_df_chunk().""" + + def run() -> int: rel = con.sql(Q_NUM) rows = 0 while True: @@ -136,7 +145,8 @@ def run(): # --------------------------------------------------------------------------- # -def test_torch_numeric(benchmark, con): +def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark torch() of a numeric result (skipped if torch is absent).""" pytest.importorskip("torch") q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)" con.sql(q).torch() # warm @@ -160,7 +170,8 @@ def test_torch_numeric(benchmark, con): # --------------------------------------------------------------------------- # -def test_mem_df_with_nulls(): +def test_mem_df_with_nulls() -> None: + """Guard the Python-tracked peak allocation of a null-heavy df() call.""" con = duckdb.connect() try: tracemalloc.start() diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py index 3e92f12d..7fb80c4b 100644 --- a/benchmarks/test_types_roundtrip_perf.py +++ b/benchmarks/test_types_roundtrip_perf.py @@ -1,31 +1,31 @@ -"""Standalone CodSpeed benchmark module: the TYPE x DIRECTION produce matrix — NOT integrated (not in -pyproject, not in CI, not committed). Run under each build's interpreter and compare: +"""CodSpeed benchmark: the type x direction produce matrix (fetchall / df / to_arrow per type). Standalone, not in CI. 
- M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_types_roundtrip_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - -WHY THIS MODULE: a single systematic sweep of one logical type per column across the three produce directions - * OUT-row = fetchall() -> FromValue per cell (python_objects.cpp) - * OUT-col = df() -> ArrayWrapper / ConvertColumn (array_wrapper.cpp) - * OUT-arrow = to_arrow_table() -> arrow export converters -so a regression localizes to (type, direction). Includes the iqmo/bareduckdb cross-check breadth that the -narrow-numeric homogeneous benchmarks miss: HUGEINT (PyLong_FromString / hugeint->double / int128 export), -UUID (uuid.UUID per row / UUIDConvert), DECIMAL(28,6) int128-internal (ConvertDecimalInternal), -and a long-varchar (>64 chars) that shifts the string paths from overhead-bound to copy-bound. - -FULL CONSUME: fetchall and df materialize everything; to_arrow_table is eager. NOTE: to_arrow_table on a -materialized result re-runs the query with the GIL released (PromoteMaterializedToArrow), so the OUT-arrow -column is engine-parallel and walltime-NOISY -- treat it as informational, not a hard gate. - -numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. 
+ for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_types_roundtrip_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +One logical type per column across three directions, so a regression localizes to (type, direction). Includes +the wide types the narrow-numeric benchmarks miss: hugeint, uuid, decimal128, long varchar. Note: to_arrow on a +materialized result re-runs the query with the GIL released, so the arrow column is engine-parallel and +walltime-noisy: informational, not a hard gate. """ -import duckdb +from __future__ import annotations + +from typing import TYPE_CHECKING + import pytest +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + N = 100_000 # one logical type per column; long-varchar is intentionally > 64 chars @@ -46,32 +46,36 @@ @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() -def _query(type_name): +def _query(type_name: str) -> str: return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)" @pytest.mark.parametrize("type_name", TYPES) -def test_out_row_fetchall(benchmark, con, type_name): +def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: + """Benchmark fetchall of one logical type per column.""" q = _query(type_name) con.execute(q).fetchall() # warm benchmark(lambda: con.execute(q).fetchall()) @pytest.mark.parametrize("type_name", TYPES) -def test_out_col_df(benchmark, con, type_name): +def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: + """Benchmark df() of one logical type per column.""" q = _query(type_name) con.sql(q).df() # warm benchmark(lambda: con.sql(q).df()) @pytest.mark.parametrize("type_name", TYPES) -def 
test_out_arrow_table(benchmark, con, type_name): +def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: + """Benchmark to_arrow_table() of one logical type per column (informational only).""" # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy) q = _query(type_name) con.sql(q).to_arrow_table() # warm diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py index ef398ebb..34896bcc 100644 --- a/benchmarks/test_udf_perf.py +++ b/benchmarks/test_udf_perf.py @@ -1,28 +1,30 @@ -"""Standalone CodSpeed benchmark module for the Python UDF binding paths (src/python_udf.cpp) — NOT integrated -(not in pyproject, not in CI, not committed). Run under each build's interpreter and compare: +"""CodSpeed benchmark: Python UDF paths (native scalar + vectorized arrow). Standalone, not in CI. - M=/Users/evert/projects/duckdb-python/main/.venv-release/bin/python - C=/Users/evert/projects/duckdb-python/wt-codspeed/.venv-release/bin/python +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): cd /Users/evert/projects/duckdb-python/wt-codspeed - $M -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider - $C -m pytest benchmarks/test_udf_perf.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_udf_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done -WHY THIS MODULE: the whole UDF subsystem had ZERO benchmark coverage. The NATIVE scalar UDF is the single -biggest untested per-call-overhead path in the binding -- per row it builds a TupleBuilder of args, calls -PyObject_CallObject, and runs TransformPythonObject on the result (python_udf.cpp). 
The ARROW (vectorized) UDF -is the columnar counterpart: ConvertDataChunkToPyArrowTable + the Python call + ConvertArrowTableToVector cast. +Native scalar = one Python call per row (arg build + PyObject_CallObject + result transform); arrow = one call +per chunk. Full consume: each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row. +""" -FULL CONSUME (same discipline as the other modules): every UDF benchmark wraps the call in a sum()/length() -aggregate so the ENGINE evaluates the UDF on every row (count(*) would skip it). The aggregate output is a -single row, so the measured cost is the per-row (native) / per-chunk (arrow) UDF invocation, not the fetch. +from __future__ import annotations -numpy/pandas/pyarrow are pinned to the SAME versions in both .venv-release, so the A/B delta is purely the binding. -""" +from typing import TYPE_CHECKING -import duckdb import pytest + +import duckdb from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + pa = pytest.importorskip("pyarrow") pc = pytest.importorskip("pyarrow.compute") @@ -31,13 +33,14 @@ @pytest.fixture -def con(): +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh connection, closed on teardown.""" c = duckdb.connect() yield c c.close() -def _bench(benchmark, con, query): +def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.execute(query).fetchall() # warm the engine + import caches before measuring benchmark(lambda: con.execute(query).fetchall()) @@ -47,22 +50,26 @@ def _bench(benchmark, con, query): # --------------------------------------------------------------------------- # -def test_udf_native_int_1arg(benchmark, con): +def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a 1-arg native int scalar UDF.""" con.create_function("add_one", lambda x: x 
+ 1, [BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)") -def test_udf_native_int_2arg(benchmark, con): +def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a 2-arg native int scalar UDF.""" con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)") -def test_udf_native_double_1arg(benchmark, con): +def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a 1-arg native double scalar UDF.""" con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE) _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)") -def test_udf_native_string(benchmark, con): +def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a native string scalar UDF.""" con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR) _bench( benchmark, @@ -71,7 +78,8 @@ def test_udf_native_string(benchmark, con): ) -def test_udf_native_null_inputs(benchmark, con): +def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark the validity short-circuit for NULL inputs to a native UDF.""" # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the # validity short-circuit, not the Python call, so the UDF only ever sees non-NULL rows. 
con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) @@ -88,17 +96,20 @@ def test_udf_native_null_inputs(benchmark, con): # --------------------------------------------------------------------------- # -def test_udf_arrow_int(benchmark, con): +def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a vectorized arrow int UDF.""" con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)") -def test_udf_arrow_double(benchmark, con): +def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark a vectorized arrow double UDF.""" con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)") -def test_udf_arrow_null_inputs(benchmark, con): +def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF.""" # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call # and reconstructs the result vector afterwards -- this is the selvec compaction/reconstruction cost. 
con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") diff --git a/pyproject.toml b/pyproject.toml index 53cfa616..fd0ef328 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,6 +250,7 @@ test = [ # dependencies used for running tests "pytest-timeout", "pytest-timestamper", "pytest-xdist", # parallel test execution (-n auto); without this `uv sync --reinstall` prunes a manual install + "pytest_codspeed", "coverage", "gcovr; sys_platform != 'win32' or platform_machine != 'ARM64'", "gcsfs; sys_platform != 'win32' or platform_machine != 'ARM64'", From 049b7e316afcff2b3f551b936eabdfca97ab1d97 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 1 Jul 2026 18:29:07 +0200 Subject: [PATCH 4/7] more benchmarking --- .github/workflows/codspeed.yml | 120 +++++-- benchmarks/PLAN.md | 60 +++- benchmarks/_scale.py | 40 +++ benchmarks/compare_baseline.py | 338 ++++++++++++++++++ benchmarks/conftest.py | 63 ++++ benchmarks/requirements-bench.txt | 29 ++ benchmarks/test_arrow_perf.py | 54 ++- benchmarks/test_cardinality_perf.py | 12 +- benchmarks/test_concurrency_perf.py | 136 +++++++ benchmarks/test_engine_control_perf.py | 68 ++++ benchmarks/test_fetch_perf.py | 76 ++-- benchmarks/test_ingest_native_perf.py | 21 +- benchmarks/test_ingest_numpy_perf.py | 37 +- benchmarks/test_pandas_perf.py | 40 ++- benchmarks/test_produce_numpy_perf.py | 30 +- .../test_relational_construction_perf.py | 43 +++ benchmarks/test_types_roundtrip_perf.py | 23 +- benchmarks/test_udf_perf.py | 28 +- pyproject.toml | 16 + tests/fast/test_binding_pressure_leak.py | 113 ++++++ 20 files changed, 1177 insertions(+), 170 deletions(-) create mode 100644 benchmarks/_scale.py create mode 100644 benchmarks/compare_baseline.py create mode 100644 benchmarks/conftest.py create mode 100644 benchmarks/requirements-bench.txt create mode 100644 benchmarks/test_concurrency_perf.py create mode 100644 benchmarks/test_engine_control_perf.py create mode 100644 
benchmarks/test_relational_construction_perf.py create mode 100644 tests/fast/test_binding_pressure_leak.py diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index c82465d2..2b7b1664 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -1,20 +1,42 @@ -# Perf-regression benchmarks via CodSpeed in instruction-count (simulation) mode: deterministic, so the whole -# suite is gate-able (no walltime noise, no gated/informational split). +# Perf-regression benchmarks: instruction-count (Callgrind) gating against a COMMITTED baseline. # -# TOKENLESS: the token is only for uploading to the CodSpeed dashboard. Without it the action still runs every -# benchmark and reports counts in the job log. For the hosted gate later, create a CodSpeed project and rely on -# the OIDC id-token permission below (public repo), or add a CODSPEED_TOKEN secret and pass token: to the action. +# NO CodSpeed account/dashboard/token/runner. pytest-codspeed's hooks call callgrind_dump_stats_at() per +# benchmark, so a self-hosted `valgrind --tool=callgrind` run writes one dump per benchmark, headed by +# `desc: Trigger: Client Request: <uri>` with the count on `totals:` (events: Ir). benchmarks/compare_baseline.py +# parses those dumps and diffs each benchmark against benchmarks/baseline.json (the committed instruction-count +# baseline). Counts are near-deterministic under Callgrind with PYTHONHASHSEED pinned (~0.1% noise observed; +# often bit-identical), so a 5% default gate threshold sits far above noise. Validated on a Linux+valgrind box. # -# Not yet run in CI; the build mirrors the dev build (CLAUDE.md) and will need a shakeout. Valgrind is slow -# (~20-50x); trim the largest-N benchmarks if the suite is too slow. +# TRIGGERS: nightly `schedule` + manual `workflow_dispatch`. No pull_request/push (and no `paths:` -- neither +# schedule nor dispatch honors it). 
A dispatch on a feature branch compares that branch's benchmark counts vs the +# baseline.json committed on the branch (i.e. main's baseline), answering "did my branch regress vs main". +# +# MODES (workflow_dispatch input `regen`): +# regen=false (default) -> COMPARE: run + diff vs baseline.json, print a report. REPORT-ONLY for now (never +# fails the job); flip compare_baseline.py to --enforce once trusted. +# regen=true -> REGENERATE: run + write a fresh baseline.json (per-bench counts + provenance meta + +# Option-B binding fractions/auto-move) and upload it as an artifact to commit +# deliberately. Bump benchmarks/requirements-bench.txt in a separate commit FIRST if +# the pins should change, then regen so the baseline matches the committed pins. +# +# The concurrency module is EXCLUDED from the Callgrind sweep: Callgrind serializes threads, so its signal +# (wall-clock GIL contention) is meaningless here; it stays a local walltime tool. +# +# MEMORY MODE (a second Callgrind sweep for O(rows) produce peak-RSS) is DESIGNED but DEFERRED -- see PLAN.md. +# +# Valgrind is slow (~20-50x); timeout-minutes is a conservative guess -- calibrate after the first CI run. 
name: Benchmarks on: - pull_request: - push: - branches: [main] + schedule: + - cron: "0 3 * * *" # nightly at 03:00 UTC workflow_dispatch: + inputs: + regen: + description: "Regenerate benchmarks/baseline.json (upload as artifact) instead of comparing" + type: boolean + default: false concurrency: group: codspeed-${{ github.ref }} @@ -23,25 +45,42 @@ concurrency: jobs: benchmarks: runs-on: ubuntu-latest + timeout-minutes: 90 # measured: ~25 min Callgrind sweep at BENCH_SCALE=10 (12-core Linux) + cold build ~10 min; margin for CI permissions: contents: read - id-token: write # enables tokenless (OIDC) upload once a CodSpeed project is linked; harmless otherwise + env: + PYTHONHASHSEED: "0" # pin hash randomization so dict/struct paths give stable instruction counts (INFRA-6) + CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks (the callgrind_dump_stats_at markers) + # env-gated row counts (INFRA-4): shrink the O(rows)/per-row-object benchmarks so the Callgrind sweep fits + # under timeout-minutes. Local runs leave this unset -> full N. Recorded in baseline.json meta.bench_scale; + # a baseline is only comparable to a run at the SAME scale. Calibrated on a 12-core Linux+valgrind box: + # BENCH_SCALE=10 -> ~25 min full sweep, and the Option-B move-list matches full-N (fractions shift slightly + # but stay the same side of the cutoff). Most benches floor at 20k rows (_scale.FLOOR), still row-dominated. 
+ BENCH_SCALE: "10" steps: - uses: actions/checkout@v4 with: submodules: recursive # the DuckDB engine submodule is needed to build fetch-depth: 0 # setuptools_scm needs history for version detection + - name: Resolve DuckDB submodule SHA + id: duckdb_sha + # used for the sccache key AND passed to compare_baseline.py for the engine-bump guard + run: echo "sha=$(git rev-parse HEAD:external/duckdb)" >> "$GITHUB_OUTPUT" + - name: Install uv uses: astral-sh/setup-uv@v5 with: python-version: "3.13" + - name: Install valgrind + run: sudo apt-get update && sudo apt-get install -y valgrind + - name: Cache sccache uses: actions/cache@v4 with: path: ~/.cache/sccache - key: sccache-codspeed-${{ hashFiles('external/duckdb') }} + key: sccache-codspeed-${{ steps.duckdb_sha.outputs.sha }} restore-keys: sccache-codspeed- - name: Install sccache @@ -49,19 +88,58 @@ jobs: curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \ | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache - - name: Build the extension (release) + benchmark deps + - name: Build the extension (release) + pinned benchmark deps env: CMAKE_C_COMPILER_LAUNCHER: sccache CMAKE_CXX_COMPILER_LAUNCHER: sccache run: | + # step 1: build deps only (needed for --no-build-isolation), no project uv sync --only-group build --no-install-project -p 3.13 - uv sync --no-build-isolation --no-editable --reinstall -p 3.13 - # benchmark deps: keep these pinned in lockstep with any baseline you compare against, so the only - # cross-run delta is the binding (numpy/pandas/pyarrow/polars/pytz + the codspeed plugin). - uv pip install pytest pytest-codspeed numpy pandas pyarrow polars pytz + # step 2: build+install the project (release) + build group, WITHOUT the heavy default `dev` group + # (torch/tensorflow/pyspark). uv.lock is gitignored, so it is deliberately NOT relied on for bench deps. 
+ uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13 + # step 3: install the FROZEN, committed bench pins (exact ==). Regenerated deliberately with the baseline + # (source list: pyproject [dependency-groups] bench), so the only cross-run delta is the binding. + uv pip install -r benchmarks/requirements-bench.txt + + - name: Collect gate node-ids + # the gate/informational split (conftest markers) classifies which benchmarks are gate-able; regen uses it + run: uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \ + | grep '::' > gate_list.txt || true + + - name: Run benchmarks under Callgrind (per-benchmark instruction counts) + # ONE sweep over all gate+informational benchmarks EXCEPT the concurrency module (Callgrind serializes + # threads -> its wall-clock signal is meaningless and it is expensive). Each benchmark emits a callgrind + # dump keyed by its uri. The pytest-codspeed hooks obj-skip libpython, so counts are clean. + run: | + mkdir -p profiles + CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \ + --callgrind-out-file="$PWD/profiles/cg.%p.%n" \ + uv run --no-sync pytest benchmarks/ \ + --ignore=benchmarks/test_concurrency_perf.py \ + -m "gate or informational" --codspeed -o addopts= -p no:cacheprovider + + - name: Compare against committed baseline (report-only) + if: ${{ !inputs.regen }} + # report-only for now: prints the per-benchmark delta table and NEVER fails the job. Add --enforce here + # once trusted to fail on a gate regression (informational benches never fail). 
+ run: | + uv run --no-sync python benchmarks/compare_baseline.py compare \ + --profiles profiles --baseline benchmarks/baseline.json \ + --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \ + --pins benchmarks/requirements-bench.txt + + - name: Regenerate baseline (upload artifact to commit deliberately) + if: ${{ inputs.regen }} + run: | + uv run --no-sync python benchmarks/compare_baseline.py regen \ + --profiles profiles --out benchmarks/baseline.json --gate-list gate_list.txt \ + --git-commit "${{ github.sha }}" --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \ + --pins benchmarks/requirements-bench.txt - - name: Run benchmarks (instruction-count) - uses: CodSpeedHQ/action@v4 + - name: Upload regenerated baseline + if: ${{ inputs.regen }} + uses: actions/upload-artifact@v4 with: - mode: simulation - run: uv run pytest benchmarks/ --codspeed -o addopts= -p no:cacheprovider + name: baseline-update + path: benchmarks/baseline.json diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md index c04f4801..54786083 100644 --- a/benchmarks/PLAN.md +++ b/benchmarks/PLAN.md @@ -122,20 +122,58 @@ benchmarks/ ``` One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable). +> **Note (reconciled to the implemented model).** The prose below originally described a per-PR CodSpeed +> commit-diff gate. That is NOT how the suite works now. The implemented model is: **nightly `schedule` + +> manual `workflow_dispatch`** (no per-PR trigger, no CodSpeed account/token/runner), a **self-hosted +> `valgrind --tool=callgrind`** sweep that emits one dump per benchmark, and **`compare_baseline.py`** diffing +> those counts against a **committed `benchmarks/baseline.json`**. See `.github/workflows/codspeed.yml`. + ### Walltime vs instruction-count - **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`. 
-- **CI gate: instruction-count / simulation (Linux + Callgrind)**, deterministic — gate PRs with this. - -Instruction-count is ideal AND should gate the GIL-held single-threaded overhead paths: fetchone loop, fetchall/fetchmany, native UDF per-call, native values() ingest, analyzer bind, all per-element converters (FromValue, TransformPythonValue, NumpyScan object/string, ArrayWrapper fill). The historical fetchall regression would be caught cleanly here. - -Noisy under instruction-count — keep walltime-only, informational, do NOT hard-gate: -- to_arrow_table / pl() on materialized results: PromoteMaterializedToArrow re-runs the query parallel with GIL released (`pyresult.cpp:450-477`). -- Large 1M+ SELECT sum() ingest reads: engine parallel aggregate dominates. -- read_csv/parquet/json: engine + I/O dominated. -- GIL-per-chunk streaming (FetchNextRaw, to_record_batch_reader drain). - -Gate tactic: pair each large-throughput scenario with a small/1-row variant (e.g. fetchall range(1_000_000) walltime + fetchall range(2048) instruction-count gate) so binding fixed-cost is measured noise-free. +- **CI: instruction-count via self-hosted Callgrind (Linux)**, near-deterministic (~0.1% noise with + `PYTHONHASHSEED=0`; often bit-identical) — compared against the committed baseline, **report-only** for now + (flip `compare_baseline.py` to `--enforce` when trusted). + +### Marker split + committed-baseline gate (INFRA-1 / Phase-3) + +- Every benchmark carries exactly one of `@pytest.mark.gate` / `@pytest.mark.informational` (registered in + `conftest.py`). **gate** = binding-dominated, instruction-count-meaningful (fetchone loop, fetchall/fetchmany, + df()/fetchnumpy, native UDF per-call, native values()/executemany ingest, analyzer bind, per-element + converters). 
**informational** = engine/library/streaming-diluted, reported but never gated + (`to_arrow_table`/`pl()`/`to_pandas` GIL-released re-runs; registered-frame `SELECT sum()` reads; + streaming drains; the concurrency module). +- **Engine floors + Option-B (MEAS-1).** `test_engine_control_perf.py` measures `SELECT sum(...) FROM range(N)` + with no Python egress — the engine floor. At baseline **regen**, each mapped numeric-produce gate's binding + fraction `= 1 - floor_Ir/bench_Ir` is computed; a gate below the ~25% cutoff is **auto-moved to + informational** (a threshold on an engine-diluted total is not meaningful) and the fraction is stored in + `baseline.json` for audit. MEAS-1 showed OUT-row fetch and UDFs are ~all binding (stay gate); numeric + produce (`df()`/`fetchnumpy`) is a bulk memcpy of ~engine magnitude (auto-move candidate). +- **Small-N gates are compile+fetch fixed-cost**, not pure fetch (MEAS-1: ~60% compile+engine at `range(2048)`). +- **Engine-bump guard.** `compare_baseline.py` compares the committed submodule SHA against the baseline's; if + they differ, engine-inclusive deltas may reflect the engine bump, so gate deltas are not enforced (regen the + baseline for the new engine). +- **Reproducibility.** `benchmarks/requirements-bench.txt` (frozen `==` pins, from the `[dependency-groups] + bench` list) + `benchmarks/baseline.json` are the co-regenerated pair; CI installs the frozen pins (NOT the + gitignored `uv.lock`), so the only cross-run delta is the binding. + +Still **informational / do NOT gate** (engine/parallel/IO/library dominated): +- to_arrow_table / pl() on materialized results (PromoteMaterializedToArrow re-runs GIL-released). +- registered-frame `SELECT sum()` ingest reads (engine aggregate dominates). +- read_csv/parquet/json; GIL-per-chunk streaming drains. 
+ +### New coverage dimensions (beyond the converter surface) + +- **Concurrency/GIL** (`test_concurrency_perf.py`, informational/walltime): threads {1,4,8} over a **multi-batch** + arrow scan / pandas scan / native + arrow UDF. EXCLUDED from the Callgrind sweep (Callgrind serializes threads + → its wall-clock contention signal is meaningless there); it is a local walltime tool. +- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain psutil RSS + object-count + ratio test (not a codspeed benchmark) for the object-pinning paths (register/unregister, UDF create/run/remove, + executemany). Runs in the normal test suite. +- **Memory mode (DEFERRED).** A second Callgrind sweep (`--codspeed-mode=memory`) over the O(rows) produce paths + for peak-RSS, feeding the same baseline model, is DESIGNED but not implemented this round (roughly doubles the + CI cost; nightly-only when added). The `test_mem_df_with_nulls` tracemalloc guard stays as a local signal until + then (convert it to an A/B delta when memory mode lands). ### Two code-grounded gotchas - **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing. diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py new file mode 100644 index 00000000..b641662f --- /dev/null +++ b/benchmarks/_scale.py @@ -0,0 +1,40 @@ +"""Env-gated row-count scaling for the benchmark suite (INFRA-4). + +Callgrind is 20-50x slower, and the O(rows) / per-row-object benchmarks at full N make the CI sweep too slow. `scaled(n)` +shrinks those row counts ONLY when an explicit `BENCH_SCALE=<divisor>` env var is set (which the CI Callgrind +sweep sets). Unset -> full N, so LOCAL walltime A/B keeps the large N unchanged. 
+ +CRITICAL: a gate benchmark and the engine-control floor it is compared against (the FLOOR_MAP pairs in +compare_baseline.py) share the same base N, so routing BOTH through `scaled()` keeps them at an identical scaled +N -- the Option-B binding_fraction stays valid. Scaling ONLY reduces row counts; it must never change the data +patterns the benchmarks depend on (real NULLs, mixed ASCII+non-ASCII+null, LIMIT-no-ORDER-BY, warm-before-measure). + +A floor keeps a scaled benchmark row-dominated (well above the range(2048) fixed-cost probes), so per-element +work still dominates and the fraction/signal stay meaningful. The small-N `*_gate` probes are NOT routed through +this (they are already fast and are the fixed-cost baseline). +""" + +from __future__ import annotations + +import os + +FLOOR = 20_000 # a scaled bench never drops below this (stays row-dominated, ~10x the range(2048) probes) + + +def bench_scale() -> int: + """Return the divisor from `BENCH_SCALE` (>=1); 1 (no scaling) if unset/invalid.""" + v = os.environ.get("BENCH_SCALE") + if not v: + return 1 + try: + return max(int(v), 1) + except ValueError: + return 1 + + +def scaled(n: int) -> int: + """Return `n` at full scale, or `max(n // BENCH_SCALE, min(n, FLOOR))` when scaling is enabled.""" + d = bench_scale() + if d <= 1: + return n + return max(n // d, min(n, FLOOR)) diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py new file mode 100644 index 00000000..85e96c43 --- /dev/null +++ b/benchmarks/compare_baseline.py @@ -0,0 +1,338 @@ +#!/usr/bin/env python3 +"""Committed-baseline instruction-count comparison for the CodSpeed benchmark suite. + +WHY / HOW (grounded, verified on a Linux+valgrind box): + The suite runs under `valgrind --tool=callgrind` with pytest-codspeed. 
pytest-codspeed's hooks call + `callgrind_dump_stats_at()` at the end of each benchmark, so callgrind writes ONE dump file per + benchmark, headed by `desc: Trigger: Client Request: <uri>` with the instruction count on the `totals:` + line (`events: Ir`). The hooks also obj-skip libpython, so counts are clean. NO CodSpeed account, token, or + runner binary is involved -- this parses the raw callgrind dumps directly. + + Observed run-to-run noise on that box was ~0.1% (callgrind is near-deterministic, not bit-identical), so the + default gate threshold (5%) sits far above noise. PYTHONHASHSEED is pinned in CI to keep dict/struct paths + stable. + +TWO MODES: + regen -- build benchmarks/baseline.json from a fresh valgrind run: per-benchmark instruction counts + + provenance meta + (for the mapped numeric-produce gates) the engine-diluted binding fraction, and + the Option-B auto-move of any gate below the cutoff to `informational`. + compare -- parse a fresh valgrind run, diff each benchmark against baseline.json, and print a report. GATE + benchmarks over their threshold are regressions; `informational` benchmarks are reported only. + REPORT-ONLY by default (always exit 0); `--enforce` exits non-zero on a gate regression. + +Both are CI-only in practice (no valgrind on macOS arm64). baseline.json and benchmarks/requirements-bench.txt +are regenerated together (same job) so the counts always correspond to the frozen data-lib pins. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import sys +from datetime import datetime, timezone +from pathlib import Path + +SCHEMA_VERSION = 1 +GATE_DEFAULT_THRESHOLD_PCT = 5.0 +BINDING_FRACTION_CUTOFF = 0.25 # Option-B: a gate whose isolable binding fraction is below this is auto-moved +# to informational (a threshold on its engine-diluted total is not meaningful). 
+ +# Option-B floor map: the engine-control benchmark whose instruction count is the "engine floor" of a given +# numeric-produce gate. binding_fraction = 1 - floor_Ir / bench_Ir. ONLY the numeric-produce benches are listed: +# MEAS-1 showed their per-element binding is a bulk memcpy (~engine magnitude); every other gate (OUT-row fetch +# of any type, string/nested/decimal/hugeint/uuid produce, UDFs, native ingest, analyzer bind) is high-binding +# and needs no fraction. Add a mapping (and, if needed, an engine floor) here to evaluate more benches. +_E = "benchmarks/test_engine_control_perf.py" +FLOOR_MAP = { + "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k", + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": f"{_E}::test_engine_sum_2col_500k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": f"{_E}::test_engine_sum_1col_100k", +} + +_TRIGGER_RE = re.compile(r"^desc:\s*Trigger:\s*Client Request:\s*(?P<uri>.+?)\s*$") +_TOTALS_RE = re.compile(r"^totals:\s*(?P<ir>\d+)\s*$") + + +# --------------------------------------------------------------------------- # +# callgrind parsing +# --------------------------------------------------------------------------- # + + +def _normalize_uri(raw: str) -> str: + """Return a repo-relative benchmark key. + + Inside a git repo pytest-codspeed already emits a git-relative uri (e.g. `benchmarks/x.py::test[p]`); this + defensively strips a leading absolute path if the run happened outside a git repo. 
+ """ + raw = raw.strip() + if "::" not in raw: + return raw + path, _, rest = raw.partition("::") + idx = path.find("benchmarks/") + if idx > 0: + path = path[idx:] + return f"{path}::{rest}" + + +def parse_profiles(profile_dir: Path) -> dict[str, int]: + """Parse every callgrind dump in `profile_dir`; return {benchmark_uri: instruction_count}. + + Only dumps whose Trigger is a benchmark Client Request (contains `::`) are kept; the metadata and + program-termination dumps are skipped. If a uri appears more than once (should not happen) the max is kept. + """ + counts: dict[str, int] = {} + files = sorted(profile_dir.rglob("*")) if profile_dir.exists() else [] + for f in files: + if not f.is_file(): + continue + uri: str | None = None + ir: int | None = None + try: + text = f.read_text(errors="replace") + except (OSError, UnicodeError): + continue + for line in text.splitlines(): + m = _TRIGGER_RE.match(line) + if m: + uri = _normalize_uri(m.group("uri")) + continue + m = _TOTALS_RE.match(line) + if m: + ir = int(m.group("ir")) + if uri and "::" in uri and ir is not None: + counts[uri] = max(counts.get(uri, 0), ir) + return counts + + +# --------------------------------------------------------------------------- # +# helpers +# --------------------------------------------------------------------------- # + + +def _sha256(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() if path.exists() else "" + + +def _load_gate_set(gate_list: Path | None) -> set[str]: + """Load the set of gate benchmark uris from a `pytest -m gate --collect-only -q` node-id list.""" + if not gate_list or not gate_list.exists(): + return set() + out = set() + for raw in gate_list.read_text().splitlines(): + line = raw.strip() + if "::" in line: # a pytest node-id (the workflow pre-filters the collect-only output to '::' lines) + out.add(_normalize_uri(line)) + return out + + +def _pct(base: int, new: int) -> float: + return 0.0 if base == 0 else (new - base) / base * 
100.0 + + +# --------------------------------------------------------------------------- # +# regen +# --------------------------------------------------------------------------- # + + +def regen(args: argparse.Namespace) -> int: + """Write baseline.json from a valgrind run: counts + provenance + Option-B binding fractions/auto-move.""" + counts = parse_profiles(Path(args.profiles)) + if not counts: + print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr) + return 2 + gate_set = _load_gate_set(Path(args.gate_list) if args.gate_list else None) + + benches: dict[str, dict] = {} + auto_moved: list[str] = [] + for uri, ir in sorted(counts.items()): + source_marker = "gate" if uri in gate_set else "informational" + marker = source_marker + binding_fraction = None + floor_uri = FLOOR_MAP.get(uri) + if source_marker == "gate" and floor_uri and floor_uri in counts and ir > 0: + binding_fraction = round(max(0.0, 1.0 - counts[floor_uri] / ir), 4) + if binding_fraction < args.cutoff: + marker = "informational" # Option-B auto-move: engine-diluted, threshold not meaningful + auto_moved.append(uri) + benches[uri] = { + "marker": marker, + "source_marker": source_marker, + "auto_moved": marker != source_marker, + "instructions": ir, + "binding_fraction": binding_fraction, + "threshold_pct": GATE_DEFAULT_THRESHOLD_PCT if marker == "gate" else None, + } + + baseline = { + "meta": { + "schema_version": SCHEMA_VERSION, + "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "git_commit": args.git_commit, + "duckdb_submodule_sha": args.submodule_sha, + "requirements_bench_sha256": _sha256(Path(args.pins)) if args.pins else "", + "measurement": {"tool": "valgrind callgrind", "event": "Ir", "pythonhashseed": "0"}, + "bench_scale": os.environ.get("BENCH_SCALE", ""), # counts are only comparable at the same scale + "gate_default_threshold_pct": GATE_DEFAULT_THRESHOLD_PCT, + "binding_fraction_cutoff": args.cutoff, + "noise_note": 
"callgrind Ir observed ~0.1% run-to-run; gate threshold set well above.", + }, + "benchmarks": benches, + } + Path(args.out).write_text(json.dumps(baseline, indent=2) + "\n") + n_gate = sum(1 for b in benches.values() if b["marker"] == "gate") + n_info = len(benches) - n_gate + print(f"Wrote {args.out}: {len(benches)} benchmarks ({n_gate} gate, {n_info} informational).") + if auto_moved: + print(f"Option-B auto-moved {len(auto_moved)} engine-diluted gate(s) to informational:") + for uri in auto_moved: + print(f" {uri} (binding_fraction={benches[uri]['binding_fraction']})") + print("Recommend updating these benches' @pytest.mark.gate -> informational so code matches the baseline.") + return 0 + + +# --------------------------------------------------------------------------- # +# compare +# --------------------------------------------------------------------------- # + + +def compare(args: argparse.Namespace) -> int: + """Diff a fresh valgrind run against baseline.json and print a report (report-only unless --enforce).""" + new_counts = parse_profiles(Path(args.profiles)) + if not new_counts: + print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr) + return 2 + baseline_path = Path(args.baseline) + if not baseline_path.exists(): + # Bootstrap state: no committed baseline yet. Report the run and instruct to regenerate; never fail. + print(f"No baseline at {baseline_path} yet -- run the workflow with regen=true to create it.") + print(f"This run produced {len(new_counts)} benchmark instruction counts.") + return 0 + baseline = json.loads(baseline_path.read_text()) + meta = baseline.get("meta", {}) + base_benches = baseline.get("benchmarks", {}) + + # scale guard: a baseline built at BENCH_SCALE=X is only comparable to a run at the same scale. 
+ run_scale = os.environ.get("BENCH_SCALE", "") + base_scale = meta.get("bench_scale", "") + if run_scale != base_scale: + print( + f"WARNING: BENCH_SCALE differs (run={run_scale!r}, baseline={base_scale!r}) -> instruction counts are " + "not comparable. Regenerate the baseline at this scale." + ) + + # pin-drift guard: the baseline's counts only compare cleanly against the pinned data libs it was built with. + if args.pins: + cur = _sha256(Path(args.pins)) + base_pins = meta.get("requirements_bench_sha256", "") + if cur and base_pins and cur != base_pins: + print( + "WARNING: benchmarks/requirements-bench.txt differs from the baseline's pins -> data-lib deltas " + "may not be pure binding. Regenerate the baseline with the current pins." + ) + + # engine-bump guard: engine-inclusive counts shift when the bundled DuckDB submodule changes, for reasons + # unrelated to the binding. If the current submodule SHA differs from the baseline's, do not treat gate + # deltas as hard failures (they may reflect the engine bump); warn to regenerate the baseline. 
+ engine_changed = bool( + args.submodule_sha and meta.get("duckdb_submodule_sha") and args.submodule_sha != meta["duckdb_submodule_sha"] + ) + + regressions: list[str] = [] + rows: list[tuple[str, str, str]] = [] # (status, uri, detail) + for uri, ir in sorted(new_counts.items()): + b = base_benches.get(uri) + if b is None: + rows.append(("NEW", uri, f"{ir} Ir (no baseline)")) + continue + base_ir = b["instructions"] + delta = _pct(base_ir, ir) + marker = b.get("marker", "informational") + thr = b.get("threshold_pct") or GATE_DEFAULT_THRESHOLD_PCT + detail = f"{base_ir} -> {ir} Ir ({delta:+.2f}%, thr {thr:.1f}%, {marker})" + if marker == "gate" and delta > thr: + if engine_changed: + rows.append(("ENGINE?", uri, detail + " [submodule changed -> not enforced]")) + else: + rows.append(("REGRESSION", uri, detail)) + regressions.append(uri) + else: + rows.append(("ok" if marker == "gate" else "info", uri, detail)) + rows.extend( + ("MISSING", uri, "in baseline, absent from run (rename/removal?)") + for uri in sorted(set(base_benches) - set(new_counts)) + ) + + _print_report(meta, rows, engine_changed=engine_changed, enforce=args.enforce) + + if not args.enforce: + return 0 + if engine_changed: + print("\nNOT ENFORCING: DuckDB submodule differs from the baseline; regenerate the baseline.") + return 0 + return 1 if regressions else 0 + + +def _print_report(meta: dict, rows: list[tuple[str, str, str]], *, engine_changed: bool, enforce: bool) -> None: + mode = "ENFORCING" if enforce else "REPORT-ONLY (not failing the job)" + print("=" * 100) + print(f"CodSpeed instruction-count baseline comparison [{mode}]") + print( + f"baseline: commit {meta.get('git_commit', '?')[:12]} submodule {str(meta.get('duckdb_submodule_sha'))[:12]}" + f" generated {meta.get('generated_at_utc', '?')}" + ) + if engine_changed: + print( + "WARNING: DuckDB submodule SHA differs from the baseline -> engine-inclusive deltas may reflect the " + "engine bump, not the binding. 
Regenerate the baseline for this engine." + ) + print("=" * 100) + order = {"REGRESSION": 0, "ENGINE?": 1, "MISSING": 2, "NEW": 3, "ok": 4, "info": 5} + for status, uri, detail in sorted(rows, key=lambda r: (order.get(r[0], 9), r[1])): + print(f" [{status:>10}] {uri}\n {detail}") + n_reg = sum(1 for s, _, _ in rows if s == "REGRESSION") + print("-" * 100) + print(f"Summary: {len(rows)} benchmarks, {n_reg} gate regression(s)" + ("" if enforce else " (report-only)")) + + +# --------------------------------------------------------------------------- # +# cli +# --------------------------------------------------------------------------- # + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point: dispatch to the `regen` or `compare` subcommand.""" + p = argparse.ArgumentParser(description=__doc__) + sub = p.add_subparsers(dest="cmd", required=True) + + r = sub.add_parser("regen", help="write baseline.json from a valgrind run") + r.add_argument("--profiles", required=True, help="CODSPEED_PROFILE_FOLDER with callgrind dumps") + r.add_argument("--out", default="benchmarks/baseline.json") + r.add_argument("--gate-list", help="file of gate node-ids (pytest -m gate --collect-only -q)") + r.add_argument("--git-commit", default="") + r.add_argument("--submodule-sha", default="") + r.add_argument("--pins", default="benchmarks/requirements-bench.txt") + r.add_argument("--cutoff", type=float, default=BINDING_FRACTION_CUTOFF) + r.set_defaults(func=regen) + + c = sub.add_parser("compare", help="compare a valgrind run against baseline.json") + c.add_argument("--profiles", required=True) + c.add_argument("--baseline", default="benchmarks/baseline.json") + c.add_argument("--submodule-sha", default="") + c.add_argument( + "--pins", default="benchmarks/requirements-bench.txt", help="warn if pins differ from the baseline's" + ) + c.add_argument("--enforce", action="store_true", help="exit non-zero on a gate regression (default: report-only)") + 
c.set_defaults(func=compare) + + args = p.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py new file mode 100644 index 00000000..b1ccd604 --- /dev/null +++ b/benchmarks/conftest.py @@ -0,0 +1,63 @@ +"""Shared fixtures + marker registration for the CodSpeed benchmark suite. + +Central home (INFRA-6) for the `con` fixture, the `threads=1` isolation default, and the gate/informational +marker registration (INFRA-1). Markers are registered HERE (not via pyproject `markers=`) to keep the suite +self-contained. Registration is REQUIRED: pyproject sets `filterwarnings = ["error"]`, so an unregistered +mark would raise `PytestUnknownMarkWarning` as a collection error. + +Marker semantics + gate Binding-dominated, GIL-held, deterministic under Callgrind (instruction-count). These are the + paths where a threshold breach means a *binding* regression. Gate-able. (Enforcement against a + committed baseline is a later phase; for now they run and report.) + informational Engine/parallel/IO/library-diluted, streaming drains, or arrow-export re-run paths. Reported, + never gated: their instruction count is dominated by non-binding work (engine aggregate, the + bundled DuckDB submodule, pyarrow/polars library code), so gating them would false-positive on + engine/submodule bumps rather than catch binding regressions. + +Every benchmark (a test using the `benchmark` fixture) must carry EXACTLY ONE of these markers so the two CI +steps (`-m gate`, `-m informational`) together cover the suite with no overlap. Non-benchmark guards (e.g. the +tracemalloc assertion in test_produce_numpy_perf.py) are intentionally left unmarked and run in neither step. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import bench_scale, scaled # noqa: F401 (re-exported here as the shared home; used by the modules) + +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + +# ENV-GATED ROW COUNTS (INFRA-4): the O(rows) / per-row-object benchmarks route their N through `scaled()` +# (benchmarks/_scale.py). Unset `BENCH_SCALE` -> full N (local walltime A/B is unchanged); the CI Callgrind +# sweep sets `BENCH_SCALE=<factor>` to shrink N so the sweep fits under the job timeout. A gate benchmark and +# its engine-control floor (FLOOR_MAP in compare_baseline.py) share a base N, so both scale identically and the +# Option-B binding fraction stays valid. Scaling changes ONLY row counts, never the Do-NOT-regress data patterns. + + +def pytest_configure(config: pytest.Config) -> None: + """Register the gate/informational markers (required under filterwarnings=error).""" + config.addinivalue_line( + "markers", + "gate: binding-dominated, instruction-count gate-able under Callgrind (deterministic).", + ) + config.addinivalue_line( + "markers", + "informational: engine/library-diluted or streaming; reported, never gated.", + ) + + +@pytest.fixture +def con() -> Iterator[duckdb.DuckDBPyConnection]: + """Yield a fresh single-threaded connection, closed on teardown. + + `threads=1` pins engine parallelism so per-run instruction counts and walltime do not shift with the CI + runner core count (INFRA-6). The concurrency module (COV-1, a later phase) overrides this deliberately. + """ + c = duckdb.connect(config={"threads": 1}) + yield c + c.close() diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt new file mode 100644 index 00000000..e230d715 --- /dev/null +++ b/benchmarks/requirements-bench.txt @@ -0,0 +1,29 @@ +# Frozen, exact pins for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). 
+# +# WHY a committed pin file (and NOT the gitignored uv.lock, and NOT a re-resolving `>=` group): CodSpeed +# compares instruction counts across runs. If a data lib (numpy/pandas/pyarrow/polars) changed version between +# the baseline run and a later run, that delta would be misattributed to the binding. These pins freeze the data +# libs so the ONLY cross-run delta is the binding. Regenerate this file DELIBERATELY, together with the baseline. +# +# SOURCE OF TRUTH: the human-readable `[dependency-groups] bench` list in pyproject.toml. Regenerate with: +# uv pip compile pyproject.toml --group bench \ +# --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \ +# --no-annotate --no-header -o benchmarks/requirements-bench.txt +# (py3.13 / linux-x86_64 is the CI target.) torch/tensorflow are deliberately absent (local-only via importorskip). +iniconfig==2.3.0 +markdown-it-py==4.2.0 +mdurl==0.1.2 +numpy==2.5.0 +packaging==26.2 +pandas==3.0.3 +pluggy==1.6.0 +polars==1.42.1 +polars-runtime-32==1.42.1 +pyarrow==24.0.0 +pygments==2.20.0 +pytest==9.1.1 +pytest-codspeed==5.0.3 +python-dateutil==2.9.0.post0 +pytz==2026.2 +rich==15.0.0 +six==1.17.0 diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py index 244663bc..0fd8461f 100644 --- a/benchmarks/test_arrow_perf.py +++ b/benchmarks/test_arrow_perf.py @@ -17,25 +17,26 @@ import pyarrow as pa import pytest +from _scale import scaled -import duckdb +import numpy as np if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -N = 500_000 -WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)" -WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)" + import duckdb +N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +DICT_UNIQUE = [2, 1_000, 50_000] # cardinality sweep: UNIQUE-value counts (not row counts) -> NOT scaled +WRITE_Q_NUM = f"SELECT 
i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" +WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# informational: every benchmark here is engine-parallel or library/streaming dominated -> reported, not gated. +# READ (sum over registered arrow) -> engine aggregate dominates; the near-zero-copy scan is a small fraction. +# WRITE to_arrow_table/to_arrow_reader/pl() -> PromoteMaterializedToArrow re-runs the query GIL-released +# (engine-parallel), and pl() also runs polars library code. Their counts would trip on engine/submodule +# bumps, not binding regressions. `con` fixture + threads=1 live in conftest.py. +pytestmark = pytest.mark.informational @pytest.fixture(scope="module") @@ -62,6 +63,18 @@ def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.R return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000) +@pytest.fixture(scope="module") +def arrow_dict_tables() -> dict[int, pa.Table]: + """Return dictionary-encoded arrow tables keyed by number of unique values (a cardinality sweep).""" + # deterministic indices (i % U) so the instruction count is reproducible (no PRNG) + tables = {} + for u in DICT_UNIQUE: + uniques = pa.array([f"category_value_{i}" for i in range(u)], type=pa.string()) + idx = pa.array(np.arange(N, dtype="int32") % u, type=pa.int32()) + tables[u] = pa.table({"c": pa.DictionaryArray.from_arrays(idx, uniques)}) + return tables + + # --------------------------------------------------------------------------- # # READ: arrow -> duckdb. The engine must scan every value (sum/length force it). 
# --------------------------------------------------------------------------- # @@ -72,12 +85,14 @@ def test_read_arrow_numeric( ) -> None: """Benchmark scanning a numeric arrow table.""" con.register("t_num", arrow_numeric) + con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()) def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None: """Benchmark scanning a string arrow table.""" con.register("t_str", arrow_string) + con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()) @@ -103,6 +118,21 @@ def run() -> list: benchmark(run) +# ADDED (COV-4): dictionary-encoded arrow ingest, cardinality sweep (unique in {2, 1k, high}). Mirrors core's +# test_arrow_dictionaries_scan. The engine aggregate dominates (hence informational), but the per-value +# dictionary DECODE in the arrow scan is the binding interest, and its cost slopes with the unique count. + + +@pytest.mark.parametrize("unique", DICT_UNIQUE) +def test_read_arrow_dictionary( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int +) -> None: + """Benchmark scanning a dictionary-encoded arrow column at a given cardinality.""" + con.register("t_dict", arrow_dict_tables[unique]) + con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall() # warm + benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()) + + # --------------------------------------------------------------------------- # # WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream. 
# --------------------------------------------------------------------------- # diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py index bf49dfc1..6e7af136 100644 --- a/benchmarks/test_cardinality_perf.py +++ b/benchmarks/test_cardinality_perf.py @@ -18,6 +18,7 @@ from typing import TYPE_CHECKING import pytest +from _scale import scaled import duckdb @@ -26,8 +27,10 @@ from pytest_codspeed import BenchmarkFixture -SRC_ROWS = 200_000 -LIMITS = [100, 1_000, 10_000, 100_000] +# env-gated (INFRA-4): scale the source rows AND the top-N of the sweep by the same factor, keeping the small-N +# points fixed and SRC_ROWS >= max(LIMITS). Preserves the LIMIT-no-ORDER-BY early-stop pattern (Do-NOT-regress). +SRC_ROWS = scaled(200_000) +LIMITS = [100, 1_000, 10_000, scaled(100_000)] @pytest.fixture(scope="module") @@ -35,7 +38,7 @@ def con() -> Iterator[duckdb.DuckDBPyConnection]: """Yield a connection over a once-materialized source table.""" # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows. 
- c = duckdb.connect() + c = duckdb.connect(config={"threads": 1}) # pin engine parallelism (INFRA-6); module-scoped source table c.execute( "CREATE TABLE src AS " f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)" @@ -50,6 +53,7 @@ def _query(n: int) -> str: return f"SELECT a, b, s FROM src LIMIT {n}" +@pytest.mark.gate # fetchall materializes n rows to Python -> binding-dominated; small-n end is the noise-free gate @pytest.mark.parametrize("n", LIMITS) def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: """Benchmark fetchall over a LIMIT n sweep.""" @@ -58,6 +62,7 @@ def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect benchmark(lambda: con.execute(q).fetchall()) +@pytest.mark.gate # df() materializes n rows to numpy columns -> binding-dominated @pytest.mark.parametrize("n", LIMITS) def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: """Benchmark df() over a LIMIT n sweep.""" @@ -66,6 +71,7 @@ def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n benchmark(lambda: con.sql(q).df()) +@pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated @pytest.mark.parametrize("n", LIMITS) def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: """Benchmark to_arrow_table() over a LIMIT n sweep.""" diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py new file mode 100644 index 00000000..8be28619 --- /dev/null +++ b/benchmarks/test_concurrency_perf.py @@ -0,0 +1,136 @@ +"""CodSpeed benchmark: concurrency / GIL pressure (COV-1). informational / WALLTIME. Standalone, not gated. 
+ +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_concurrency_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +This is the ONE dimension the rest of the suite (single-threaded) cannot see: Python objects threading through +PARALLEL core execution. It varies `SET threads` and measures the binding under parallel scan / parallel UDF +invocation. All benchmarks are `informational` and their PRIMARY signal is LOCAL WALLTIME: + * scan benches -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup. + * native UDF -> ~flat scaling = the GIL tax on per-row Python calls (the engine scan is parallel + but the GIL serializes the calls). + * arrow (vectorized) UDF -> observed NEGATIVE scaling (slower with more threads): per-chunk convert + GIL + contention. A regression here would deepen the negative slope. + +Under the CI `-m informational` step these run in `simulation` (Callgrind), which SERIALIZES threads -- so the +wall-clock contention is NOT visible there; instead the deterministic instruction count captures the per-batch +Produce GIL calls and the UDF dispatch overhead. Never gated either way. + +GOTCHA (verified locally, mirrors the suite's other "measure the right thing" traps): a SINGLE-BATCH arrow table +does NOT parallelize (one batch = one serial scan unit; flat across threads). The arrow scan bench MUST use a +MULTI-BATCH table (`from_batches` with a modest chunksize) or it silently measures a serial scan. A CPU-heavy +aggregate is also required: a cheap sum is memory-bandwidth-bound and will not parallelize, so there is nothing +to contend on. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import duckdb +from duckdb.sqltypes import BIGINT + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + +pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") +import numpy as np # noqa: E402 (after importorskip, matching the suite convention) +import pandas as pd # noqa: E402 + +# informational: concurrency benchmarks are never gated (walltime-noisy; under Callgrind, thread-serialized). +pytestmark = pytest.mark.informational + +N_SCAN = 1_000_000 +BATCH = 20_000 # -> 50 record batches; MULTI-BATCH is required for the arrow scan to parallelize (see GOTCHA) +N_UDF_NATIVE = 200_000 # native UDF = one Python call per row; keep modest (Callgrind instruments every call) +N_UDF_ARROW = 1_000_000 # arrow UDF = one call per chunk (vectorized) +THREADS = [1, 4, 8] + +# CPU-heavy aggregate so the parallel scan actually engages worker threads (a cheap sum is bandwidth-bound and +# would not parallelize -> no contention to measure). The binding signal is the per-batch Produce GIL handoff. 
+HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)" + + +@pytest.fixture(scope="module") +def arrow_multibatch() -> pa.Table: + """Return a MULTI-batch arrow table (single-batch would scan serially -- see module GOTCHA).""" + a = pa.array(np.arange(N_SCAN), type=pa.int64()) + b = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64()) + return pa.Table.from_batches(pa.table({"a": a, "b": b}).to_batches(max_chunksize=BATCH)) + + +@pytest.fixture(scope="module") +def pandas_frame() -> pd.DataFrame: + """Return a numpy-backed pandas frame (its scan parallelizes across worker threads).""" + return pd.DataFrame({"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5}) + + +# --------------------------------------------------------------------------- # +# Parallel SCAN: Python objects (arrow batches / pandas chunks) pulled through the binding by engine worker +# threads under a CPU-heavy aggregate. The scan Produce acquires/releases the GIL per batch across threads. 
+# --------------------------------------------------------------------------- # + + +@pytest.mark.parametrize("threads", THREADS) +def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None: + """Benchmark a parallel aggregate pulling arrow batches across threads.""" + con = duckdb.connect(config={"threads": threads}) + try: + con.register("t", arrow_multibatch) + q = f"SELECT sum({HEAVY}) FROM t" + con.execute(q).fetchall() # warm + benchmark(lambda: con.execute(q).fetchall()) + finally: + con.close() + + +@pytest.mark.parametrize("threads", THREADS) +def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None: + """Benchmark a parallel aggregate pulling pandas chunks across threads.""" + con = duckdb.connect(config={"threads": threads}) + try: + con.register("t", pandas_frame) + q = f"SELECT sum({HEAVY}) FROM t" + con.execute(q).fetchall() # warm + benchmark(lambda: con.execute(q).fetchall()) + finally: + con.close() + + +# --------------------------------------------------------------------------- # +# Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF +# from multiple worker threads. Native = per-row Python call under the GIL (GIL tax); arrow = per-chunk convert. 
+# --------------------------------------------------------------------------- # + + +@pytest.mark.parametrize("threads", THREADS) +def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None: + """Benchmark a native Python UDF invoked from parallel worker threads (GIL tax).""" + con = duckdb.connect(config={"threads": threads}) + try: + con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)") # materialized -> parallel scan + con.create_function("pyf", lambda x: (x * 2 + 1) % 97, [BIGINT], BIGINT) + con.execute("SELECT sum(pyf(a)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(pyf(a)) FROM t").fetchall()) + finally: + con.close() + + +@pytest.mark.parametrize("threads", THREADS) +def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None: + """Benchmark a vectorized arrow UDF invoked from parallel worker threads.""" + con = duckdb.connect(config={"threads": threads}) + try: + con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)") # materialized -> parallel scan + con.create_function("af", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") + con.execute("SELECT sum(af(a)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(af(a)) FROM t").fetchall()) + finally: + con.close() diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py new file mode 100644 index 00000000..febd7ba4 --- /dev/null +++ b/benchmarks/test_engine_control_perf.py @@ -0,0 +1,68 @@ +"""CodSpeed benchmark: pure-ENGINE control (no Python egress). Standalone, not in CI's binding gate. 
+ +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_engine_control_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +These `SELECT sum(...) FROM range(N)` queries aggregate to a single scalar, so the fetchall of the result is +negligible: they measure SQL compile + the engine aggregate with (almost) ZERO per-row Python egress. They are +the "engine floor" reference for MEAS-1: comparing a produce/fetch/ingest benchmark against the matching-N floor +here quantifies how much of that benchmark's cost is the binding vs the engine. They are `informational` (they +measure the engine, not the binding, so they must never gate). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +# informational: pure-engine reference, never gated. `con` fixture + threads=1 live in conftest.py. +pytestmark = pytest.mark.informational + +# Matched to the N of the fetch/produce/ingest/udf benchmarks so the floors line up for MEAS-1 subtraction and, +# at baseline regen, for the Option-B binding-fraction of the numeric-produce gates (see compare_baseline.py). +# CRITICAL: these floors go through scaled() with the SAME base N as the benchmarks they floor, so under +# BENCH_SCALE the floor and its benchmark stay at an identical N and the fraction stays valid. The 2048 small-N +# floor is NOT scaled (it is the fixed-cost baseline for the *_gate probes). 
+Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)" # small-N gate floor (compile-dominated), NOT scaled +Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)" # types-matrix numeric-df floor +Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)" # fetch / native-UDF floor +# produce/ingest floor +Q_2C_500K = ( + f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))" +) + + +def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: + con.execute(query).fetchall() # warm + benchmark(lambda: con.execute(query).fetchall()) + + +def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Engine floor: compile + sum over range(2048), no egress.""" + _bench(benchmark, con, Q_1C_SMALL) + + +def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Engine floor: compile + sum over range(100k), no egress.""" + _bench(benchmark, con, Q_1C_100K) + + +def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Engine floor: compile + sum over range(200k), no egress.""" + _bench(benchmark, con, Q_1C_200K) + + +def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + """Engine floor: compile + 2-col sum over range(500k), no egress.""" + _bench(benchmark, con, Q_2C_500K) diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py index 94a53c30..9820db6d 100644 --- a/benchmarks/test_fetch_perf.py +++ b/benchmarks/test_fetch_perf.py @@ -16,21 +16,23 @@ from typing import TYPE_CHECKING import pytest - -import duckdb +from _scale import scaled if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture + import duckdb -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh 
connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# gate: OUT-row fetch fully materializes every row to Python -> binding-dominated, GIL-held; the engine side is +# a cheap range() scan. Deterministic under Callgrind -> instruction-count gate-able. (The small-N *_gate tests +# are the compile+fetch fixed-cost variants; see MEAS-1.) The `con` fixture + threads=1 live in conftest.py. +pytestmark = pytest.mark.gate + +# env-gated row counts (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. The 2048 +# small-N *_gate probes are intentionally NOT scaled (they are the compile+fetch fixed-cost baseline). +N_ROW = scaled(200_000) # per-row-object numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128) +N_STR = scaled(100_000) # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops +N_NEST = scaled(50_000) # heterogeneous scalar/list/struct row def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: @@ -40,41 +42,41 @@ def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a single BIGINT column.""" - _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)") def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a single INTEGER column.""" - _bench_fetchall(benchmark, con, "SELECT (i % 100)::INTEGER AS a FROM range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)") def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a single DOUBLE column.""" - _bench_fetchall(benchmark, con, "SELECT (i * 1.5)::DOUBLE AS a FROM 
range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)") def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of two BIGINT columns.""" - _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)") def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a single VARCHAR column.""" - _bench_fetchall(benchmark, con, "SELECT ('str_value_' || i) AS s FROM range(100000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)") def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a mixed scalar/list/struct row.""" query = ( "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, " - "{'a': i, 'b': i + 1} AS st FROM range(50000) t(i)" + f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)" ) _bench_fetchall(benchmark, con, query) def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark iterating a result one row at a time with fetchone.""" - query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" + query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" def run() -> None: rel = con.execute(query) @@ -85,43 +87,45 @@ def run() -> None: # --------------------------------------------------------------------------- # -# ADDED: small-N instruction-count-gate variants (the narrow-numeric fixed-cost path, noise-free at range(2048) -# under simulation mode in CI), expensive scalar OUT-row types (timestamptz pytz-per-row, blob, null-heavy), a -# heterogeneous per-cell-dispatch row (hugeint+uuid+decimal128+varchar, distinct from 
homogeneous columns), and -# the batched fetchmany loop. +# small-N COMPILE+FETCH FIXED-COST variants: at range(2048) the measured region is dominated by SQL front-end +# compilation + the engine, NOT fetch. MEAS-1 walltime split (vs the range(2048) engine floor in +# test_engine_control_perf.py): ~40% fetch fixed-cost, ~60% compile+engine. They still catch a fixed-cost +# regression, but they are compile+fetch fixed-cost gates, not pure-fetch gates. Plus expensive scalar OUT-row +# types (timestamptz pytz-per-row, blob, null-heavy), a heterogeneous per-cell-dispatch row +# (hugeint+uuid+decimal128+varchar, distinct from the homogeneous columns), and the batched fetchmany loop. # --------------------------------------------------------------------------- # def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the small-N BIGINT instruction-count gate.""" + """Benchmark the small-N BIGINT compile+fetch fixed-cost (MEAS-1: ~60% compile+engine, ~40% fetch).""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)") def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the small-N two-BIGINT instruction-count gate.""" + """Benchmark the small-N two-BIGINT compile+fetch fixed-cost.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)") def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a half-NULL BIGINT column.""" - _bench_fetchall(benchmark, con, "SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)") def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a TIMESTAMPTZ column.""" _bench_fetchall( - 
benchmark, con, "SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range(100000) t(i)" + benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)" ) def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a 128-bit DECIMAL column.""" - _bench_fetchall(benchmark, con, "SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range(200000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)") def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchall of a BLOB column.""" - _bench_fetchall(benchmark, con, "SELECT ('blob_value_' || i)::BLOB FROM range(100000) t(i)") + _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)") def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: @@ -130,14 +134,14 @@ def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyCo # from the homogeneous single-type columns above) query = ( "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, " - "((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range(100000) t(i)" + f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)" ) _bench_fetchall(benchmark, con, query) def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark draining a result with batched fetchmany.""" - query = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(100000) t(i)" + query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" def run() -> None: rel = con.execute(query) @@ -147,17 +151,3 @@ def run() -> None: break benchmark(run) - - -def test_expr_many(benchmark: BenchmarkFixture) -> None: - """Benchmark building many column/constant expressions.""" - - def run() -> int: - 
out = [] - for i in range(2000): - col = duckdb.ColumnExpression(f"col_{i}") - const = duckdb.ConstantExpression(i) - out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}")) - return len(out) - - benchmark(run) diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py index e3f232cc..c54ddba7 100644 --- a/benchmarks/test_ingest_native_perf.py +++ b/benchmarks/test_ingest_native_perf.py @@ -17,24 +17,21 @@ from typing import TYPE_CHECKING import pytest - -import duckdb +from _scale import scaled if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -EXECMANY_N = 20_000 # executemany re-binds + executes per row, keep moderate -WIDE_N = 10_000 # values() builds a 1-row x N-col relation; cap N so the binder stays sane + import duckdb +# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. +EXECMANY_N = scaled(20_000) # executemany re-binds + executes per row, keep moderate +WIDE_N = scaled(10_000) # values() builds a 1-row x N-col relation; cap N so the binder stays sane -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# gate: native ingest eagerly transforms every cell (TransformPythonValue) / re-binds per row (executemany); +# the engine side (a trivial INSERT or a 1-row-wide fetchall drain) is negligible -> binding-dominated, GIL-held, +# deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py. 
+pytestmark = pytest.mark.gate @pytest.fixture(scope="module") diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py index abbe2a4d..73b99d0d 100644 --- a/benchmarks/test_ingest_numpy_perf.py +++ b/benchmarks/test_ingest_numpy_perf.py @@ -18,20 +18,21 @@ from typing import TYPE_CHECKING import pytest +from _scale import scaled -import duckdb import numpy as np import pandas as pd if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -N = 500_000 -ANALYZER_N = 200_000 + import duckdb + +# env-gated (INFRA-4): scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below. +N = scaled(500_000) +ANALYZER_N = scaled(200_000) -# Module-global for the replacement-scan-from-variable path (frame resolution finds f_globals reliably). +# Registered explicitly via con.register (MEAS-3) rather than resolved by replacement-scan frame inspection. NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5} # Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only). @@ -42,12 +43,9 @@ _MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)] -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()`/`sum(length())` over a registered +# frame) are engine-aggregate dominated -> informational. The analyzer BIND (count(*), no scan) is a pure +# per-bind binding cost -> gate. 
@pytest.fixture(scope="module") @@ -84,32 +82,42 @@ def df_object_mixed_types() -> pd.DataFrame: # --------------------------------------------------------------------------- # +@pytest.mark.informational def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark scanning a numpy-dict replacement scan.""" - benchmark(lambda: con.sql("SELECT sum(a), sum(b) FROM NPDICT").fetchall()) + """Benchmark scanning a registered numpy dict-of-arrays.""" + # MEAS-3: register explicitly (not frame-inspection replacement scan) and warm the query before measuring. + con.register("npdict", NPDICT) + con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall()) +@pytest.mark.informational def test_read_numpy_double_with_nan( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame ) -> None: """Benchmark scanning a numpy double column with NaNs.""" con.register("t", df_double_with_nan) + con.execute("SELECT sum(a) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) +@pytest.mark.informational def test_read_numpy_masked_int( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame ) -> None: """Benchmark scanning a masked nullable-int column.""" con.register("t", df_masked_int) + con.execute("SELECT sum(a) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) +@pytest.mark.informational def test_read_numpy_object_string_mixed( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame ) -> None: """Benchmark scanning a mixed object-string column.""" con.register("t", df_object_string_mixed) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM 
t").fetchall()) @@ -119,6 +127,7 @@ def test_read_numpy_object_string_mixed( # --------------------------------------------------------------------------- # +@pytest.mark.gate # count(*) forces no scan -> the measured cost is the PandasAnalyzer per-bind sampling (binding) def test_bind_analyzer_object( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame ) -> None: diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py index 1a4c09f0..168f1a3d 100644 --- a/benchmarks/test_pandas_perf.py +++ b/benchmarks/test_pandas_perf.py @@ -18,28 +18,24 @@ import pyarrow as pa import pytest +from _scale import scaled -import duckdb import numpy as np import pandas as pd if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -N = 500_000 -WRITE_Q_NUM = "SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range(500000) t(i)" -WRITE_Q_STR = "SELECT ('str_value_' || i) AS s FROM range(500000) t(i)" -_STRINGS = [f"str_value_{i}" for i in range(N)] + import duckdb +N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" +WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" +_STRINGS = [f"str_value_{i}" for i in range(N)] -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()` over a registered frame) are +# engine-aggregate dominated -> informational. Only the NUMPY-backed df() WRITE is binding-dominated -> gate. +# The arrow-backed WRITE goes through to_arrow_table().to_pandas() (pyarrow library code, MEAS-2) -> informational. 
@pytest.fixture(scope="module") @@ -77,35 +73,43 @@ def df_arrow_string() -> pd.DataFrame: # --------------------------------------------------------------------------- # +@pytest.mark.informational def test_read_pandas_numpy_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame ) -> None: """Benchmark scanning a numpy-backed numeric frame.""" con.register("t", df_numpy_numeric) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) +@pytest.mark.informational def test_read_pandas_numpy_string( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame ) -> None: """Benchmark scanning a numpy-backed string frame.""" con.register("t", df_numpy_string) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) +@pytest.mark.informational def test_read_pandas_arrow_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame ) -> None: """Benchmark scanning an arrow-backed numeric frame.""" con.register("t", df_arrow_numeric) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) +@pytest.mark.informational def test_read_pandas_arrow_string( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame ) -> None: """Benchmark scanning an arrow-backed string frame.""" con.register("t", df_arrow_string) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) @@ -116,11 +120,13 @@ def test_read_pandas_arrow_string( # --------------------------------------------------------------------------- # +@pytest.mark.gate def 
test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a numeric result to a numpy-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).df()) +@pytest.mark.gate def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a string result to a numpy-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).df()) @@ -131,26 +137,30 @@ def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.Duck # datetime column (TimestampConvert + ConvertDateTimeTypes). +@pytest.mark.gate def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a null-heavy numeric result to a numpy-backed frame.""" q = ( "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " - "CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range(500000) t(i)" + f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)" ) benchmark(lambda: con.sql(q).df()) +@pytest.mark.gate def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a timestamp result to a numpy-backed frame.""" - q = "SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range(500000) t(i)" + q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)" benchmark(lambda: con.sql(q).df()) +@pytest.mark.informational # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2) def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a numeric result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) +@pytest.mark.informational # to_arrow_table().to_pandas() 
-> the to_pandas half is pyarrow library code (MEAS-2) def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark materializing a string result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py index 5ad56254..eb54f91c 100644 --- a/benchmarks/test_produce_numpy_perf.py +++ b/benchmarks/test_produce_numpy_perf.py @@ -20,17 +20,16 @@ from typing import TYPE_CHECKING import pytest +from _scale import scaled import duckdb import numpy as np # noqa: F401 (pinned identically A/B; imported so the env matches the other modules) if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -N = 500_000 -TYPE_N = 200_000 # wide-internal types (hugeint/uuid/decimal128) are heavier per cell +N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +TYPE_N = scaled(200_000) # wide-internal types (hugeint/uuid/decimal128) are heavier per cell Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" Q_NUM_NULLS = ( @@ -44,14 +43,8 @@ Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)" -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() - - +# gate: df()/fetchnumpy() fully materialize numpy-backed columns -> binding-dominated (ArrayWrapper fill), +# GIL-held, deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py. 
def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.sql(query).df() # warm benchmark(lambda: con.sql(query).df()) @@ -67,37 +60,44 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu # --------------------------------------------------------------------------- # +@pytest.mark.gate def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a numeric result.""" _bench_df(benchmark, con, Q_NUM) +@pytest.mark.gate def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a null-heavy numeric result.""" # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch) _bench_df(benchmark, con, Q_NUM_NULLS) +@pytest.mark.gate def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a string result.""" _bench_df(benchmark, con, Q_STR) +@pytest.mark.gate def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a timestamp result.""" _bench_df(benchmark, con, Q_TS) +@pytest.mark.gate def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a hugeint result.""" _bench_df(benchmark, con, Q_HUGEINT) +@pytest.mark.gate def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a uuid result.""" _bench_df(benchmark, con, Q_UUID) +@pytest.mark.gate def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark df() of a 128-bit decimal result.""" _bench_df(benchmark, con, Q_DEC128) @@ -108,11 +108,13 @@ def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti # --------------------------------------------------------------------------- # +@pytest.mark.gate def 
test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchnumpy() of a numeric result.""" _bench_numpy(benchmark, con, Q_NUM) +@pytest.mark.gate def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark fetchnumpy() of a null-heavy numeric result.""" _bench_numpy(benchmark, con, Q_NUM_NULLS) @@ -123,6 +125,7 @@ def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb. # --------------------------------------------------------------------------- # +@pytest.mark.informational # per-chunk streaming drain (GIL-per-chunk) -> walltime-informational, not gated def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark draining a result with fetch_df_chunk().""" @@ -145,6 +148,7 @@ def run() -> int: # --------------------------------------------------------------------------- # +@pytest.mark.informational # torch is local-only (importorskip -> skipped in CI); torch lib work dilutes it def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark torch() of a numeric result (skipped if torch is absent).""" pytest.importorskip("torch") @@ -172,7 +176,7 @@ def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti def test_mem_df_with_nulls() -> None: """Guard the Python-tracked peak allocation of a null-heavy df() call.""" - con = duckdb.connect() + con = duckdb.connect(config={"threads": 1}) try: tracemalloc.start() warm = con.sql(Q_NUM_NULLS).df() # populate one-time import / type caches diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py new file mode 100644 index 00000000..5b386da5 --- /dev/null +++ b/benchmarks/test_relational_construction_perf.py @@ -0,0 +1,43 @@ +"""CodSpeed benchmark: relational-API expression construction. 
Standalone, not in CI's binding gate. + +A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): + cd /Users/evert/projects/duckdb-python/wt-codspeed + for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ + $P -m pytest benchmarks/test_relational_construction_perf.py \ + --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ + done + +SCOPE: this is relational-API *construction* (ColumnExpression / ConstantExpression / operator overloads), +NOT the binding-pressure surface the rest of the suite targets. It was moved here out of test_fetch_perf.py +(MEAS-5) because it is out of scope for the binding-pressure gate. It is KEPT because it carries a real signal +(a measured ~35% expression-construction delta at the cutover), so it stays visible -- but it is marked +`informational`, so it runs and reports and is NEVER part of the gate. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import duckdb + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + +# informational: relational-API construction, deliberately excluded from the binding-pressure gate (MEAS-5). 
+pytestmark = pytest.mark.informational + + +def test_expr_many(benchmark: BenchmarkFixture) -> None: + """Benchmark building many column/constant expressions.""" + + def run() -> int: + out = [] + for i in range(2000): + col = duckdb.ColumnExpression(f"col_{i}") + const = duckdb.ConstantExpression(i) + out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}")) + return len(out) + + benchmark(run) diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py index 7fb80c4b..f0323fea 100644 --- a/benchmarks/test_types_roundtrip_perf.py +++ b/benchmarks/test_types_roundtrip_perf.py @@ -18,15 +18,14 @@ from typing import TYPE_CHECKING import pytest - -import duckdb +from _scale import scaled if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture -N = 100_000 + import duckdb + +N = scaled(100_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) # one logical type per column; long-varchar is intentionally > 64 chars TYPE_EXPR = { @@ -34,6 +33,8 @@ "double": "(i * 1.5)::DOUBLE", "varchar_short": "('str_' || i)", "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))", + "date": "DATE '2020-01-01' + (i % 3650)::INTEGER", + "bool": "(i % 2 = 0)", "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)", "decimal64": "((i::DECIMAL(18, 3)) / 1000)", "decimal128": "((i * 1.5)::DECIMAL(28, 6))", @@ -45,18 +46,12 @@ TYPES = list(TYPE_EXPR) -@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() - - +# `con` fixture + threads=1 live in conftest.py. 
def _query(type_name: str) -> str: return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)" +@pytest.mark.gate # OUT-row fetchall -> binding-dominated per-type dispatch @pytest.mark.parametrize("type_name", TYPES) def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: """Benchmark fetchall of one logical type per column.""" @@ -65,6 +60,7 @@ def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConne benchmark(lambda: con.execute(q).fetchall()) +@pytest.mark.gate # OUT-col df() -> binding-dominated ArrayWrapper fill per type @pytest.mark.parametrize("type_name", TYPES) def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: """Benchmark df() of one logical type per column.""" @@ -73,6 +69,7 @@ def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, benchmark(lambda: con.sql(q).df()) +@pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated @pytest.mark.parametrize("type_name", TYPES) def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: """Benchmark to_arrow_table() of one logical type per column (informational only).""" diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py index 34896bcc..a62be815 100644 --- a/benchmarks/test_udf_perf.py +++ b/benchmarks/test_udf_perf.py @@ -16,30 +16,24 @@ from typing import TYPE_CHECKING import pytest +from _scale import scaled -import duckdb from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR if TYPE_CHECKING: - from collections.abc import Iterator - from pytest_codspeed import BenchmarkFixture + import duckdb + pa = pytest.importorskip("pyarrow") pc = pytest.importorskip("pyarrow.compute") -NATIVE_N = 200_000 # native = one Python call per row, keep moderate -ARROW_N = 1_000_000 # arrow = one Python call per chunk (vectorized), can be large - - 
-@pytest.fixture -def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a fresh connection, closed on teardown.""" - c = duckdb.connect() - yield c - c.close() +# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. +NATIVE_N = scaled(200_000) # native = one Python call per row, keep moderate +ARROW_N = scaled(1_000_000) # arrow = one Python call per chunk (vectorized), can be large +# `con` fixture + threads=1 live in conftest.py. def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.execute(query).fetchall() # warm the engine + import caches before measuring benchmark(lambda: con.execute(query).fetchall()) @@ -50,24 +44,28 @@ def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: s # --------------------------------------------------------------------------- # +@pytest.mark.gate # native scalar UDF: one Python call per row dominates; the sum() consume is negligible def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a 1-arg native int scalar UDF.""" con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)") +@pytest.mark.gate def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a 2-arg native int scalar UDF.""" con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)") +@pytest.mark.gate def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a 1-arg native double scalar UDF.""" con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE) _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)") +@pytest.mark.gate def 
test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a native string scalar UDF.""" con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR) @@ -78,6 +76,7 @@ def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConn ) +@pytest.mark.gate def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark the validity short-circuit for NULL inputs to a native UDF.""" # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the @@ -96,18 +95,21 @@ def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBP # --------------------------------------------------------------------------- # +@pytest.mark.informational # vectorized arrow UDF: pyarrow.compute lib work + per-chunk conversion + 1M engine def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a vectorized arrow int UDF.""" con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)") +@pytest.mark.informational def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark a vectorized arrow double UDF.""" con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)") +@pytest.mark.informational def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF.""" # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call diff --git a/pyproject.toml b/pyproject.toml index fd0ef328..90218094 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -281,6 +281,22 @@ test = [ # dependencies used for running tests "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'", "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'", ] +bench = [ # minimal, pinned deps for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). Deliberately + # NOT the heavy `test` group (no torch/tensorflow/pyspark/adbc). Pinned via uv.lock and kept in lockstep + # with any baseline compared against, so the only cross-run delta is the binding. Constraints mirror the + # `test` group so the lockfile resolves identically. torch/tf produce paths are local-only (importorskip). + "pytest", + "pytest_codspeed", + "polars>=1.33.0", + "pytz", + "numpy<2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version < '3.12'", + "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'", + "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'", + "pandas>=3.0.0; python_version > '3.10'", + "pandas<3.0.0; python_version < '3.11'", + "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", + "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", +] scripts = [ # dependencies used for running scripts "cxxheaderparser", "ipython", diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py new file mode 100644 index 00000000..22de87b2 --- /dev/null +++ b/tests/fast/test_binding_pressure_leak.py @@ -0,0 +1,113 @@ +"""Sustained-iteration leak guards for the binding object-pinning paths (COV-3). + +Sibling of test_relation_dependency_leak.py. 
CodSpeed measures steady-state PER-CALL cost and structurally cannot +see a per-call refcount imbalance in the object-pinning graph (ExternalDependency / registered_objects / a UDF's +retained Python callable) until it OOMs. This is a plain assertion test (NOT a codspeed benchmark, no marker): it +runs each pinning path N times and asserts RSS and Python-object growth stay flat. + +Covers the paths the existing leak test does not: register/unregister, native + arrow UDF create/run/remove, and +executemany. (from_arrow/from_df/replacement-scan pinning is already covered by test_relation_dependency_leak.py.) +""" + +import gc +import os + +import pytest + +import numpy as np +import pandas as pd + +try: + import pyarrow as pa + + can_arrow = True +except ImportError: + can_arrow = False + +from duckdb.sqltypes import BIGINT + +psutil = pytest.importorskip("psutil") + +ITERS = 100 +ROWS = 100_000 +_EM_ROWS = [(i, i * 1.5, f"s{i}") for i in range(5_000)] + + +def _rss_gb(): + return psutil.Process(os.getpid()).memory_info().rss / (10**9) + + +def check_flat(fn, cursor, iters=ITERS, obj_slack=20_000): + """Assert RSS and tracked-object count stay flat across `iters` calls of `fn`.""" + fn(cursor) # warm one-time caches so they are not counted as growth + gc.collect() + start_rss = _rss_gb() + start_obj = len(gc.get_objects()) + for _ in range(iters): + fn(cursor) + gc.collect() + end_rss = _rss_gb() + end_obj = len(gc.get_objects()) + # RSS ratio bound mirrors test_relation_dependency_leak.py (growth must stay well under 3x)... + assert end_rss / 3 < start_rss, f"RSS grew {start_rss:.3f} -> {end_rss:.3f} GB over {iters} iters" + # ...plus an object-count bound, which catches a Python-object pin that is too small to move RSS. + assert end_obj - start_obj < obj_slack, f"tracked objects grew by {end_obj - start_obj} over {iters} iters" + + +# --------------------------------------------------------------------------- # +# Pinning paths (one full pin/unpin cycle per call). 
+# --------------------------------------------------------------------------- # + + +def register_unregister_arrow(cursor): + tbl = pa.table({"a": pa.array(np.arange(ROWS), type=pa.int64())}) + cursor.register("t_reg", tbl) + cursor.execute("SELECT sum(a) FROM t_reg").fetchall() + cursor.unregister("t_reg") + + +def register_unregister_pandas(cursor): + df = pd.DataFrame({"a": np.arange(ROWS)}) + cursor.register("t_reg", df) + cursor.execute("SELECT sum(a) FROM t_reg").fetchall() + cursor.unregister("t_reg") + + +def native_udf_cycle(cursor): + cursor.create_function("f_leak", lambda x: x + 1, [BIGINT], BIGINT) + cursor.execute("SELECT sum(f_leak(i::BIGINT)) FROM range(10000) t(i)").fetchall() + cursor.remove_function("f_leak") + + +def arrow_udf_cycle(cursor): + import pyarrow.compute as pc + + cursor.create_function("af_leak", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") + cursor.execute("SELECT sum(af_leak(i::BIGINT)) FROM range(50000) t(i)").fetchall() + cursor.remove_function("af_leak") + + +def executemany_cycle(cursor): + cursor.execute("CREATE OR REPLACE TABLE t_em (a BIGINT, b DOUBLE, c VARCHAR)") + cursor.executemany("INSERT INTO t_em VALUES (?, ?, ?)", _EM_ROWS) + + +class TestBindingPressureLeak: + def test_register_unregister_arrow_leak(self, duckdb_cursor): + if not can_arrow: + pytest.skip("pyarrow not installed") + check_flat(register_unregister_arrow, duckdb_cursor) + + def test_register_unregister_pandas_leak(self, duckdb_cursor): + check_flat(register_unregister_pandas, duckdb_cursor) + + def test_native_udf_cycle_leak(self, duckdb_cursor): + check_flat(native_udf_cycle, duckdb_cursor) + + def test_arrow_udf_cycle_leak(self, duckdb_cursor): + if not can_arrow: + pytest.skip("pyarrow not installed") + check_flat(arrow_udf_cycle, duckdb_cursor) + + def test_executemany_leak(self, duckdb_cursor): + check_flat(executemany_cycle, duckdb_cursor) From 090e02142b1bca4163c526ad75a4dcc84a5ae374 Mon Sep 17 00:00:00 2001 From: Evert Lammerts 
Date: Thu, 2 Jul 2026 07:39:17 +0200 Subject: [PATCH 5/7] less text --- .github/workflows/codspeed.yml | 65 ++-- benchmarks/PLAN.md | 290 +++++------------- benchmarks/README.md | 34 ++ benchmarks/_scale.py | 23 +- benchmarks/compare_baseline.py | 66 ++-- benchmarks/conftest.py | 37 +-- benchmarks/requirements-bench.txt | 12 +- benchmarks/test_arrow_perf.py | 63 +--- benchmarks/test_cardinality_perf.py | 34 +- benchmarks/test_concurrency_perf.py | 63 ++-- benchmarks/test_engine_control_perf.py | 36 +-- benchmarks/test_fetch_perf.py | 53 +--- benchmarks/test_ingest_native_perf.py | 29 +- benchmarks/test_ingest_numpy_perf.py | 62 ++-- benchmarks/test_pandas_perf.py | 67 +--- benchmarks/test_produce_numpy_perf.py | 86 ++---- .../test_relational_construction_perf.py | 22 +- benchmarks/test_types_roundtrip_perf.py | 30 +- benchmarks/test_udf_perf.py | 46 +-- pyproject.toml | 10 +- tests/fast/test_binding_pressure_leak.py | 12 +- 21 files changed, 337 insertions(+), 803 deletions(-) create mode 100644 benchmarks/README.md diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 2b7b1664..b80323fc 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -1,30 +1,18 @@ -# Perf-regression benchmarks: instruction-count (Callgrind) gating against a COMMITTED baseline. +# Instruction-count (Callgrind) perf-regression gate against a COMMITTED baseline. No CodSpeed account/token/runner: +# compare_baseline.py parses raw callgrind dumps and diffs each benchmark against benchmarks/baseline.json. Counts +# are near-deterministic with PYTHONHASHSEED pinned (~0.1% noise), so the 5% gate threshold sits far above it. +# Details + rationale: benchmarks/README.md and benchmarks/PLAN.md. # -# NO CodSpeed account/dashboard/token/runner. 
pytest-codspeed's hooks call callgrind_dump_stats_at() per -# benchmark, so a self-hosted `valgrind --tool=callgrind` run writes one dump per benchmark, headed by -# `desc: Trigger: Client Request: <uri>` with the count on `totals:` (events: Ir). benchmarks/compare_baseline.py -# parses those dumps and diffs each benchmark against benchmarks/baseline.json (the committed instruction-count -# baseline). Counts are near-deterministic under Callgrind with PYTHONHASHSEED pinned (~0.1% noise observed; -# often bit-identical), so a 5% default gate threshold sits far above noise. Validated on a Linux+valgrind box. +# Triggers: nightly schedule + manual workflow_dispatch (no pull_request/push). A dispatch on a feature branch +# compares that branch's counts vs the baseline.json committed on it, answering "did my branch regress vs main". # -# TRIGGERS: nightly `schedule` + manual `workflow_dispatch`. No pull_request/push (and no `paths:` -- neither -# schedule nor dispatch honors it). A dispatch on a feature branch compares that branch's benchmark counts vs the -# baseline.json committed on the branch (i.e. main's baseline), answering "did my branch regress vs main". # -# MODES (workflow_dispatch input `regen`): -# regen=false (default) -> COMPARE: run + diff vs baseline.json, print a report. REPORT-ONLY for now (never -# fails the job); flip compare_baseline.py to --enforce once trusted. -# regen=true -> REGENERATE: run + write a fresh baseline.json (per-bench counts + provenance meta + -# Option-B binding fractions/auto-move) and upload it as an artifact to commit -# deliberately.
Bump benchmarks/requirements-bench.txt in a separate commit FIRST if -# the pins should change, then regen so the baseline matches the committed pins. -# -# The concurrency module is EXCLUDED from the Callgrind sweep: Callgrind serializes threads, so its signal -# (wall-clock GIL contention) is meaningless here; it stays a local walltime tool. -# -# MEMORY MODE (a second Callgrind sweep for O(rows) produce peak-RSS) is DESIGNED but DEFERRED -- see PLAN.md. -# -# Valgrind is slow (~20-50x); timeout-minutes is a conservative guess -- calibrate after the first CI run. +# The concurrency module is excluded from the sweep (Callgrind serializes threads, so its signal is meaningless). +# Memory mode (a second sweep for produce peak-RSS) is deferred (see PLAN.md). name: Benchmarks @@ -45,17 +33,14 @@ concurrency: jobs: benchmarks: runs-on: ubuntu-latest - timeout-minutes: 90 # measured: ~25 min Callgrind sweep at BENCH_SCALE=10 (12-core Linux) + cold build ~10 min; margin for CI + timeout-minutes: 90 # ~25 min sweep at BENCH_SCALE=10 (12-core Linux) + ~10 min cold build; margin for CI permissions: contents: read env: - PYTHONHASHSEED: "0" # pin hash randomization so dict/struct paths give stable instruction counts (INFRA-6) - CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks (the callgrind_dump_stats_at markers) - # env-gated row counts (INFRA-4): shrink the O(rows)/per-row-object benchmarks so the Callgrind sweep fits - # under timeout-minutes. Local runs leave this unset -> full N. Recorded in baseline.json meta.bench_scale; - # a baseline is only comparable to a run at the SAME scale. Calibrated on a 12-core Linux+valgrind box: - # BENCH_SCALE=10 -> ~25 min full sweep, and the Option-B move-list matches full-N (fractions shift slightly - # but stay the same side of the cutoff). Most benches floor at 20k rows (_scale.FLOOR), still row-dominated. 
+ PYTHONHASHSEED: "0" # stable instruction counts for dict/struct paths + CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks + # shrink the O(rows) benches so the sweep fits under timeout-minutes. Local runs leave this unset -> full N. + # Recorded in baseline.json meta.bench_scale; a baseline only compares to a run at the SAME scale. BENCH_SCALE: "10" steps: - uses: actions/checkout@v4 @@ -95,22 +80,19 @@ jobs: run: | # step 1: build deps only (needed for --no-build-isolation), no project uv sync --only-group build --no-install-project -p 3.13 - # step 2: build+install the project (release) + build group, WITHOUT the heavy default `dev` group - # (torch/tensorflow/pyspark). uv.lock is gitignored, so it is deliberately NOT relied on for bench deps. + # step 2: build+install the project (release) + build group, without the heavy default `dev` group uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13 - # step 3: install the FROZEN, committed bench pins (exact ==). Regenerated deliberately with the baseline - # (source list: pyproject [dependency-groups] bench), so the only cross-run delta is the binding. + # step 3: the frozen bench pins (exact ==), so the only cross-run delta is the binding uv pip install -r benchmarks/requirements-bench.txt - name: Collect gate node-ids - # the gate/informational split (conftest markers) classifies which benchmarks are gate-able; regen uses it + # the gate/informational marker split; regen uses it to classify each benchmark run: uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \ | grep '::' > gate_list.txt || true - name: Run benchmarks under Callgrind (per-benchmark instruction counts) - # ONE sweep over all gate+informational benchmarks EXCEPT the concurrency module (Callgrind serializes - # threads -> its wall-clock signal is meaningless and it is expensive). Each benchmark emits a callgrind - # dump keyed by its uri. 
The pytest-codspeed hooks obj-skip libpython, so counts are clean. + # ONE sweep over gate+informational EXCEPT the concurrency module (thread-serialized, expensive). Each + # benchmark emits a callgrind dump keyed by its uri. run: | mkdir -p profiles CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \ @@ -121,8 +103,7 @@ jobs: - name: Compare against committed baseline (report-only) if: ${{ !inputs.regen }} - # report-only for now: prints the per-benchmark delta table and NEVER fails the job. Add --enforce here - # once trusted to fail on a gate regression (informational benches never fail). + # report-only: prints the delta table, never fails the job. Add --enforce once trusted. run: | uv run --no-sync python benchmarks/compare_baseline.py compare \ --profiles profiles --baseline benchmarks/baseline.json \ diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md index 54786083..835aef7d 100644 --- a/benchmarks/PLAN.md +++ b/benchmarks/PLAN.md @@ -1,77 +1,40 @@ -# CodSpeed Benchmark Suite Plan — duckdb-python binding hot paths - -Grounded in the binding source on `perf/codspeed` (`src/`). File:line citations are to this tree. - -## 0. Conventions (from the existing 3 modules, keep these) - -- Function-scoped `con` fixture; module-scoped input-data fixtures. -- READ = `SELECT sum(col) / sum(length(col))` (never `count(*)`, which is answered from metadata). -- WRITE = eager materialize or fully drain the lazy reader. -- Warm the engine once (`con.execute(query).fetchall()`) before `benchmark(...)` so first-call import-cache population is not charged to the measured region. -- Pin numpy/pandas/pyarrow/polars identically across A/B so deltas are pure binding cost. - -Ranking: **P0** = on a known regression path or the cutover-reworked code (narrow-numeric common case); **P1** = high-traffic conversion / per-element Python work; **P2** = correctness-relevant, lower traffic or engine-dominated. 
- -## (a) Prioritized scenarios - -### PRODUCE (duckdb -> external) — highest regression risk - -Row path: `DuckDBPyResult::Fetchone` (`src/pyresult.cpp:126-151`) builds a `PyUtil::TupleBuilder` (`src/include/duckdb_python/pyutil.hpp:101-125`) per row and calls `PythonObject::FromValue` (`src/native/python_objects.cpp:474`) per cell. O(rows x cols). This is the shape of the historical ~15% fetchall regression. - -| # | Scenario | SQL / setup | Measures | Pri | -|---|----------|-------------|----------|-----| -| P0-1 | fetchall int64 1col | `SELECT i::BIGINT a FROM range(1_000_000)` | TupleBuilder + FromValue int (`python_objects.cpp:489`) | P0 | -| P0-2 | fetchall int 2-4col | `SELECT i::BIGINT,(i+1)::BIGINT,(i*2)::INTEGER FROM range(1_000_000)` | TupleBuilder scaling w/ col count | P0 | -| P0-3 | fetchall double | `SELECT (i*1.5)::DOUBLE FROM range(1_000_000)` | FromValue double | P0 | -| P0-4 | fetchall varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | FromValue VARCHAR string copy (`python_objects.cpp:515`) | P1 | -| P0-5 | fetchone loop (overhead) | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(100_000)` | per-call Fetchone + chunk-boundary FetchNext + GIL cycle | P0 | -| P0-6 | fetchmany batched | as P0-5, `fetchmany(10_000)` loop | Fetchmany loop | P1 | -| P1-7 | **df() numeric (reworked)** | `SELECT i::BIGINT,(i*1.5)::DOUBLE FROM range(1_000_000)` | FetchNumpyInternal -> ArrayWrapper ConvertColumnRegular, `HAS_NULLS=false/PANDAS=true` branch (`array_wrapper.cpp:415-425`) | P0 | -| P1-8 | **df() numeric WITH NULLS** | `SELECT CASE WHEN i%10=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | `HAS_NULLS=true` + masked_array build (`array_wrapper.cpp:743-757`) + masked->pd.NA rewrite (`pyresult.cpp:362-393`) | P0 | -| P1-9 | fetchnumpy numeric | as P1-7 | FetchNumpyInternal without the DataFrame wrap | P1 | -| P1-10 | df() varchar | `SELECT ('str_value_'||i) FROM range(500_000)` | StringConvert PyUnicode_FromStringAndSize per row 
(`array_wrapper.cpp:164-181`) | P1 | -| P1-11 | df() timestamp | `SELECT TIMESTAMP '2020-01-01'+(i*INTERVAL 1 SECOND) FROM range(1_000_000)` | TimestampConvertNano + ConvertDateTimeTypes (`pyresult.cpp:299`) | P1 | -| P1-13 | to_record_batch_reader drained | `range(1_000_000)`, `to_record_batch_reader(100_000)` | lazy stream (`pyresult.cpp:573`), iterate + sum num_rows | P1 | -| P2-15 | torch()/tf() numeric | `range(500_000)` | FetchNumpyInternal + per-col from_numpy (`pyresult.cpp:405-421`) | P2 | -| P2-16 | fetch_df_chunk | large query, loop `fetch_df_chunk()` | FetchDFChunk per chunk (`pyresult.cpp:400`) | P2 | -| P1-17 | fetchall LIST | `SELECT [i,i+1,i+2] FROM range(200_000)` | FromValue LIST recursion (`python_objects.cpp:651`) | P1 | -| P1-18 | fetchall STRUCT | `SELECT {'a':i,'b':i+1} FROM range(200_000)` | FromStruct dict build (`python_objects.cpp:390-414`) | P1 | -| P1-20 | fetchall DECIMAL | `SELECT (i::DECIMAL(18,3))/1000 FROM range(200_000)` | Python `Decimal()(val.ToString())` per row (`python_objects.cpp:507`) | P1 | -| P1-21 | fetchall TIMESTAMPTZ | `SELECT (TIMESTAMPTZ '2020-01-01'+(i*INTERVAL 1 SECOND)) FROM range(100_000)` | pytz localize+astimezone per row (`python_objects.cpp:567-573`) | P1 | -| P2-22 | fetchall NULL-heavy | `SELECT CASE WHEN i%2=0 THEN NULL ELSE i::BIGINT END FROM range(1_000_000)` | validity branch + nb::none (`pyresult.cpp:142`) | P2 | -| P2-23 | fetchall BLOB | `SELECT ('blob_'||i)::BLOB FROM range(200_000)` | nb::bytes (`python_objects.cpp:517`) | P2 | - -### INGEST (external -> duckdb) - -| # | Scenario | Setup | Path | Pri | -|---|----------|-------|------|-----| -| I0-1 | **pandas numpy int64/double** | DataFrame 1M | NumpyScan::Scan ScanNumpyMasked zero-copy when stride==sizeof(T); double NaN->NULL loop (`numpy_scan.cpp:76-112,236-246`) reworked | P0 | -| I0-2 | **pandas numpy object-string** | `pd.array(strings,dtype=object)` 500k | NumpyScan STRING/OBJECT: per-row isinstance, PyUnicodeIsCompactASCII zero-copy vs 
DecodePythonUnicode transcode (`numpy_scan.cpp:353-452`) reworked | P0 | -| I1-3 | pandas object bind-time analyzer | object col 100k+ | Pandas::Bind -> PandasAnalyzer::Analyze samples rows GetItemType ladder (`analyzer.cpp:356-460`). Per-BIND overhead, independent of rows (count(*) ok here) | P1 | -| I1-4 | pandas arrow-backed | pd.ArrowDtype 1M | ToArrowTable -> arrow scan (`pyconnection.cpp:1799`) | P1 | -| I0-5 | arrow Table | 1M | CreateArrowScan PythonTableArrowArrayStreamFactory near-zero-copy (`python_replacement_scan.cpp:55-83`) | P1 | -| I1-6 | arrow RecordBatchReader | from_batches | same factory, streaming (distinct from Table) | P1 | -| I1-7 | polars DataFrame | 1M | entry.to_arrow() one-time + arrow scan (`replacement_scan.cpp:150-156`) | P2 | -| I1-8 | numpy ndarray + dict-of-arrays | np.arange | replacement scan -> pandas_scan (`replacement_scan.cpp:163-200`) | P2 | -| I1-9 | **native values() list-of-tuples** | `con.values([(i,i*1.5,'s') for i in range(100_000)])` | Values -> TransformPythonValue per cell, GetPythonObjectType ladder (`python_conversion.cpp:402-454,1075`) | P1 | -| I1-10 | native list-of-dicts | list of dicts | TransformDictionaryToStruct recursion (`python_conversion.cpp:119`) | P2 | -| I1-11 | executemany params | INSERT ?,? 
100k sets | ExecuteMany loop, TransformPythonValue per set (`pyconnection.cpp:500-544`) | P2 | -| I2-12 | read_parquet/csv/json | a file | arg marshal -> TableFunction under GIL-release; engine-dominated | P2 | - -### UDF (`src/python_udf.cpp`) — zero coverage today - -| # | Scenario | Setup | Path | Pri | -|---|----------|-------|------|-----| -| U0-1 | **scalar native 1 int arg** | `def f(x):return x+1`, `SELECT sum(f(i::BIGINT)) FROM range(1_000_000)` | per-row TupleBuilder args + PyObject_CallObject + TransformPythonObject result (`python_udf.cpp:320-384`) | P0 | -| U0-2 | scalar native 2-3 args | `def f(a,b):return a+b` 2 cols 1M | arg-tuple scaling | P1 | -| U1-3 | scalar native string | `def f(s):return s.upper()` 500k | VARCHAR in + string out | P1 | -| U1-4 | scalar native NULL inputs | 50% NULL, DEFAULT handling | SetNull short-circuit (`python_udf.cpp:340-350`) | P1 | -| U1-6 | **vectorized arrow UDF** | `type='arrow'` pc.add 1M | ConvertDataChunkToPyArrowTable + call + ConvertArrowTableToVector cast (`python_udf.cpp:33-144,225`) | P0 | -| U2-7 | vectorized NULL slicing | DEFAULT + nulls | selvec compaction/reconstruction (`python_udf.cpp:197-305`) | P2 | - -## (b) Type x direction matrix - -Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), OUT-arrow. +# Benchmark suite plan + +Design rationale for the binding micro-benchmarks. The suite is implemented in `benchmarks/`; CI lives in +`../.github/workflows/codspeed.yml`; conventions, markers, and the two data-pattern traps are in +[README.md](README.md). + +Priority: **P0** = known-regression or cutover-reworked path (narrow-numeric common case); **P1** = high-traffic +conversion or per-element Python work; **P2** = correctness-relevant, lower-traffic or engine-dominated. 
+ +## Scenarios + +PRODUCE (duckdb to Python) is the highest regression risk: `Fetchone` builds a `TupleBuilder` per row and calls +`FromValue` per cell (O(rows x cols), the shape of the historical ~15% fetchall regression). + +- **OUT-row** (`test_fetch_perf`, `test_types_roundtrip_perf`): fetchall / fetchone / fetchmany per type. P0 + narrow numeric; P1 varchar, list, struct, and the expensive per-row types (decimal `Decimal()`, timestamptz + pytz, hugeint string round-trip, uuid). Small-N `*_gate` probes isolate the compile+fetch fixed cost. +- **OUT-col** (`test_produce_numpy_perf`): df() / fetchnumpy() reworked columnar path. P0 numeric no-null vs + REAL-null (the masked_array branch); plus string, timestamp, and wide-internal (hugeint/uuid/decimal128). +- **OUT-arrow / polars** (`test_arrow_perf`): to_arrow_table / reader / pl(). Informational (engine-parallel, + GIL-released). +- **Cardinality** (`test_cardinality_perf`): a LIMIT-n sweep giving a clean per-row conversion slope. + +INGEST (Python to duckdb): + +- **numpy / pandas** (`test_ingest_numpy_perf`, `test_pandas_perf`): numpy-backed scan (NaN-to-NULL, masked), + object-string transcode ladder, arrow-backed zero-copy, and the per-bind PandasAnalyzer. +- **arrow** (`test_arrow_perf`): Table + RecordBatchReader + dictionary sweep. +- **native** (`test_ingest_native_perf`): values() list/tuple/dict per-cell TransformPythonValue, executemany. + +UDF (`test_udf_perf`, zero coverage before this suite): native scalar per-row (P0, the biggest untested per-call +path) and vectorized arrow per-chunk. + +## Type x direction matrix + +Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), +OUT-arrow. 
| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow | |------|-----------|----------|---------|---------|-----------| @@ -79,138 +42,49 @@ Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (Fro | double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 | | varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 | | bool | P2 | P1 | P2 | P1 | P2 | -| decimal | P2 | n/a | **P1** (Python Decimal) | P1 | P2 | +| decimal64/128 | P2 | n/a | **P1** (Python Decimal) | P1 | P2 | | date | P2 | P1 | P1 | P1 | P2 | -| timestamp | P1 | **P1** | P1 | P1 | P1 | -| timestamptz | P2 | P1 | **P1** (pytz/row) | P1 | P2 | -| time/interval | P2 | P1 | P1 | P1 | P2 | -| LIST/ARRAY | P2 | P2 | P1 (recursive) | P1 | P2 | -| STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 | -| MAP | P2 | P2 | P2 | P2 | P2 | -| blob | P2 | P2 | P2 | P2 | P2 | -| NULL-heavy | - | **P1** | P2 | **P0** (masked_array) | P1 | -| enum/category | - | P1 | P1 | P1 | P2 | - -Minimum viable to ship: int64, double, varchar, timestamp, decimal, LIST, STRUCT, NULL-heavy in OUT-row and OUT-col; int64/double/varchar in IN-numpy. - -## (c) Gaps vs the existing 3 modules - -Covered well: OUT-row narrow numeric, OUT-arrow/polars numeric+string, pandas IN/OUT numpy-vs-arrow numeric+string, fetchone-loop numeric. - -Missing: -1. **PRODUCE columnar reworked path under-covered** — df() only 500k, only numeric/string, never with NULLS (the masked-array branch is exactly what changed). Add df-with-nulls, fetchnumpy, df-timestamp. -2. **UDFs: zero coverage** — whole subsystem (python_udf.cpp), native per-row is the single biggest untested per-call-overhead path. Add U0-1/U0-2/U1-3/4/U1-6. -3. **Native Python ingest: zero coverage** — values()/list-of-tuples/list-of-dicts/executemany via TransformPythonValue. Add I1-9/10/11. -4. **Expensive scalar OUT-row types untested** — decimal, timestamptz, interval, isolated LIST/STRUCT/MAP. Add P1-17..21. -5. 
**Object-column bind-time analyzer untested** — PandasAnalyzer sampling, per-bind cost. Add I1-3. -6. **Size regimes thin** — add 1M throughput AND 1-row overhead variants. -7. **Arrow ingest only pa.table** — add RecordBatchReader, polars, numpy-ndarray ingest. -8. **NULL-heavy IN-numpy untested** (ScanNumpyMasked + ApplyMask). - -## (d) Suite organization + CodSpeed mechanics - -``` -benchmarks/ - test_fetch_perf.py # EXISTING — OUT-row. Add: nested, decimal, timestamptz, null-heavy, 1M+1-row - test_arrow_perf.py # EXISTING — add RecordBatchReader ingest, materialized vs stream - test_pandas_perf.py # EXISTING — add df()-with-nulls, datetime, fetchnumpy, analyzer bind - test_produce_numpy_perf.py # NEW — df()/fetchnumpy/fetch_df_chunk reworked columnar, per-type, null vs no-null - test_ingest_native_perf.py # NEW — values()/list-of-tuples/list-of-dicts/executemany - test_ingest_numpy_perf.py # NEW — numpy ndarray / object-string scan / analyzer bind - test_udf_perf.py # NEW — scalar native + vectorized arrow UDFs - test_types_roundtrip_perf.py # NEW — type x direction matrix sweep, parametrized -``` -One module per binding subsystem so a CodSpeed report points at one src/ area. torch/tf go in produce_numpy (wrap FetchNumpyInternal); polars stays in arrow (wraps FetchArrowTable). - -> **Note (reconciled to the implemented model).** The prose below originally described a per-PR CodSpeed -> commit-diff gate. That is NOT how the suite works now. The implemented model is: **nightly `schedule` + -> manual `workflow_dispatch`** (no per-PR trigger, no CodSpeed account/token/runner), a **self-hosted -> `valgrind --tool=callgrind`** sweep that emits one dump per benchmark, and **`compare_baseline.py`** diffing -> those counts against a **committed `benchmarks/baseline.json`**. See `.github/workflows/codspeed.yml`. - -### Walltime vs instruction-count - -- **Local A/B (macOS arm64): walltime only** (no Valgrind), `--codspeed-mode=walltime`. 
-- **CI: instruction-count via self-hosted Callgrind (Linux)**, near-deterministic (~0.1% noise with - `PYTHONHASHSEED=0`; often bit-identical) — compared against the committed baseline, **report-only** for now - (flip `compare_baseline.py` to `--enforce` when trusted). - -### Marker split + committed-baseline gate (INFRA-1 / Phase-3) - -- Every benchmark carries exactly one of `@pytest.mark.gate` / `@pytest.mark.informational` (registered in - `conftest.py`). **gate** = binding-dominated, instruction-count-meaningful (fetchone loop, fetchall/fetchmany, - df()/fetchnumpy, native UDF per-call, native values()/executemany ingest, analyzer bind, per-element - converters). **informational** = engine/library/streaming-diluted, reported but never gated - (`to_arrow_table`/`pl()`/`to_pandas` GIL-released re-runs; registered-frame `SELECT sum()` reads; - streaming drains; the concurrency module). -- **Engine floors + Option-B (MEAS-1).** `test_engine_control_perf.py` measures `SELECT sum(...) FROM range(N)` - with no Python egress — the engine floor. At baseline **regen**, each mapped numeric-produce gate's binding - fraction `= 1 - floor_Ir/bench_Ir` is computed; a gate below the ~25% cutoff is **auto-moved to - informational** (a threshold on an engine-diluted total is not meaningful) and the fraction is stored in - `baseline.json` for audit. MEAS-1 showed OUT-row fetch and UDFs are ~all binding (stay gate); numeric - produce (`df()`/`fetchnumpy`) is a bulk memcpy of ~engine magnitude (auto-move candidate). -- **Small-N gates are compile+fetch fixed-cost**, not pure fetch (MEAS-1: ~60% compile+engine at `range(2048)`). -- **Engine-bump guard.** `compare_baseline.py` compares the committed submodule SHA against the baseline's; if - they differ, engine-inclusive deltas may reflect the engine bump, so gate deltas are not enforced (regen the - baseline for the new engine). 
-- **Reproducibility.** `benchmarks/requirements-bench.txt` (frozen `==` pins, from the `[dependency-groups] - bench` list) + `benchmarks/baseline.json` are the co-regenerated pair; CI installs the frozen pins (NOT the - gitignored `uv.lock`), so the only cross-run delta is the binding. - -Still **informational / do NOT gate** (engine/parallel/IO/library dominated): -- to_arrow_table / pl() on materialized results (PromoteMaterializedToArrow re-runs GIL-released). -- registered-frame `SELECT sum()` ingest reads (engine aggregate dominates). -- read_csv/parquet/json; GIL-per-chunk streaming drains. - -### New coverage dimensions (beyond the converter surface) - -- **Concurrency/GIL** (`test_concurrency_perf.py`, informational/walltime): threads {1,4,8} over a **multi-batch** - arrow scan / pandas scan / native + arrow UDF. EXCLUDED from the Callgrind sweep (Callgrind serializes threads - → its wall-clock contention signal is meaningless there); it is a local walltime tool. -- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain psutil RSS + object-count - ratio test (not a codspeed benchmark) for the object-pinning paths (register/unregister, UDF create/run/remove, - executemany). Runs in the normal test suite. -- **Memory mode (DEFERRED).** A second Callgrind sweep (`--codspeed-mode=memory`) over the O(rows) produce paths - for peak-RSS, feeding the same baseline model, is DESIGNED but not implemented this round (roughly doubles the - CI cost; nightly-only when added). The `test_mem_df_with_nulls` tracemalloc guard stays as a local signal until - then (convert it to an A/B delta when memory mode lands). - -### Two code-grounded gotchas -- **OUT-col null benchmarks need REAL DuckDB nulls** (`CASE WHEN ... THEN NULL`): the masked-array branch only triggers on an actually-invalid validity bit (`array_wrapper.cpp:396-404,736`); a no-null column silently takes the cheap `std::move` path and measures the wrong thing. 
-- **IN-numpy string benchmarks need mixed ASCII + non-ASCII + a NaN/pd.NA/None sentinel**: the scan zero-copies compact-ASCII (`numpy_scan.cpp:416-418`) but transcodes otherwise (`numpy_scan.cpp:429-446`); ASCII-only misses the transcode + null-detection ladder. - -## (e) Cross-check vs iqmo-org/bareduckdb - -Source read live from `iqmo-org/bareduckdb` `main`, subdir `benchmark/` (GitHub API + raw files). - -### What their suite covers / how it is organized - -A **SQL-file-driven A/B harness comparing two clients** — production `duckdb` vs `bareduckdb` (the C-API / free-threading prototype) — not a binding micro-bench. - -- `benchmark.py` orchestrates: discovers `cases/**/*.sql`, picks the matching `data/DATA*` dir, and runs each `(sql x parquet-file x db_mode)` as a fresh `uv run run_case.py` **subprocess**. `DBMODES=[duckdb, bareduckdb_capsule, bareduckdb_arrow]`; active `READ_MODES=[arrow_table]` (parquet/arrow_reader present but off). -- `run_case.py` per case: fresh `connect()`, `pyarrow.parquet.read_table(file)` + `conn.register(name, table)`, then `conn.sql(query).to_arrow_table()`, timed with `time.perf_counter()` and peak RSS via `resource.getrusage`. **No warmup, single run, result discarded.** Universal ingest = register(arrow table); universal produce = `to_arrow_table()`. -- `data/`: `DATA_RANGE` = single BIGINT `range(N)` at 5M / 100M; `DATA_CATEGORY_DATE_PRICE` = (VARCHAR category, DATE, BIGINT price) cross-join at 36M / 3.6B. -- `cases/`: `types/` (decimal `DECIMAL(28,12)`, hugeint `HUGEINT`, mixed_types `HUGEINT+uuid()+DECIMAL(28,6)+VARCHAR` in one row, timestamp `TIMESTAMP+INTERVAL`, varchar_long ~100-char), `limit/` (LIMIT 100 / 1k / 10k / 100k top-N — a result-cardinality sweep), `filter/`, `groups/`, `window/`, `threading/` (parallel group/window/self-join/registered-arrow-scan), plus a separate `stats/` harness. 
- -Their INGEST is arrow-only and their PRODUCE is arrow-only; they have **no** fetchall/fetchone, df()/numpy, pandas/numpy/native/polars ingest, or UDF coverage — so our binding suite is far broader on binding-specific surfaces. Their genuine deltas are concentrated in the PRODUCE/types dimension and in engine/threading workloads. - -### DELTA — actionable additions/changes - -- **[BINDING] Add HUGEINT to the produce matrix (currently absent).** `types/hugeint.sql`, `mixed_types.sql`. OUT-row `FromValue` HUGEINT does `PyLong_FromString(val.GetValue())` — a per-value string round-trip (`python_objects.cpp:500`), unlike narrow int; OUT-col casts hugeint->double (`array_wrapper.cpp:662`); OUT-arrow is a distinct decimal128/int128 export. Scenario: `SELECT i::HUGEINT FROM range(1_000_000)` through fetchall / df / to_arrow_table. Add a `hugeint` row to the type x direction matrix. -- **[BINDING] Add UUID to the produce matrix (absent).** `mixed_types.sql` selects `uuid()`. OUT-row builds a Python `uuid.UUID` per row (`python_objects.cpp:708-711`); OUT-col uses `UUIDConvert` (`array_wrapper.cpp:230-244`). Scenario: `SELECT gen_random_uuid() FROM range(200_000)` through fetchall / df / to_arrow_table. Add a `uuid` row to the matrix. -- **[BINDING] Add a 128-bit-internal DECIMAL variant.** Our P1-20 uses `DECIMAL(18,3)` (int64 internal); bareduckdb uses `DECIMAL(28,12)` / `(28,6)` (int128 internal), hitting `ConvertDecimalInternal` (`array_wrapper.cpp:571`) and the wider `PyDecimalCastSwitch`/`Decimal()` round-trip. Run both an int64-internal and an int128-internal decimal. -- **[BINDING] Add a heterogeneous mixed-type row (new scenario).** `SELECT i::HUGEINT, gen_random_uuid(), (i*1.5)::DECIMAL(28,6), ('string_'||i) FROM range(200_000)` through fetchall and df. Exercises per-cell type dispatch in the `Fetchone` column loop (`pyresult.cpp:140-148`) — a different branch/cache profile than our homogeneous columns (P0-1..3 are single-type). 
-- **[BINDING] Add a long-varchar (>64 char) variant** alongside the short `'str_value_'||i`. `'...'||repeat('data ',10)||i::VARCHAR` (~100 chars). Short strings are copy-cheap/overhead-bound; long strings shift OUT-row/OUT-col string copy and the IN-numpy `DecodePythonUnicode` transcode (`numpy_scan.cpp:429-446`) toward copy-bound. Apply to OUT-row, OUT-col, IN-numpy varchar scenarios. -- **[BINDING] Adopt their result-cardinality (top-N) sweep as a produce axis.** `SELECT * FROM ORDER BY k DESC LIMIT n` for n in {100, 1k, 10k, 100k}, fetched via fetchall / df / to_arrow_table with the source held constant. Holds engine work ~constant while sweeping rows-materialized-to-Python → a clean per-row conversion slope, and the small-n end is an ideal noise-free instruction-count gate (overhead regime). Cleaner than varying `range()` (which also changes scan cost). -- **[BINDING] Broaden the OUT-arrow column of the matrix.** Their entire produce path is `to_arrow_table`, and they push hugeint / decimal128 / uuid / timestamp / long-varchar / mixed-row through it — exactly the arrow-export converters (ArrowConverter/appender for int128/uuid/decimal128) our OUT-arrow column currently leaves at P1/P2 numeric+string. Add these types to OUT-arrow. -- **[BINDING, hard to gate] registered-arrow-scan under parallelism.** `threading/registered_arrow_scan.sql` pulls batches from `PythonTableArrowArrayStreamFactory::Produce` (binding code in `arrow/arrow_array_stream.cpp`) across engine threads holding/releasing the GIL — a real binding-contention risk. Keep as walltime-informational only; too noisy for an instruction-count gate. -- **[ENGINE] `filter` / `groups` / `window` / `self_join` pure-engine workloads** — out of scope for a binding gate; the binding only wraps them with register + to_arrow_table, and their consume (a small aggregate) is trivial so the measurement is ~pure engine. Note, do not add to the binding suite. 
-- **[ENGINE] 100M / 3.6B-row scale** — too slow / IO+engine-dominated / walltime-noisy for a codspeed gate; keep our regimes <= ~1M. -- **[ENGINE] threading / free-threading category** — the production client does not support free-threading (CLAUDE.md); deprioritize for this suite. - -### Methodology notes for our codspeed mechanics - -- **Adopt: result-cardinality (LIMIT) axis** (above) — a clean per-row conversion-cost slope and a natural small/large pairing for the instruction-count-gate-vs-walltime split already in (d). -- **Consider adopting: a peak-memory guard** for the O(rows) produce paths. bareduckdb tracks `getrusage` max RSS; codspeed walltime tracks neither memory nor allocations. A conversion regression is often memory-shaped (cf. the recorded fetchall +8% list->tuple edge-copy; the df() masked_array branch) — add a separate `getrusage`/memray delta assertion on `fetchall` and `df()`-with-nulls as a secondary signal, since a pure-timing gate can miss it. -- **Do NOT adopt their anti-patterns:** no-warmup + single subprocess run charges one-time import-cache population into the measurement and yields no statistics — bad for steady-state binding isolation. Our warmup + codspeed repeated rounds are correct; keep them. -- **Consistent with us:** their full-consume is eager `to_arrow_table()` and never `count(*)` — matches our discipline. Caveat: for their aggregate cases the arrow output is tiny, so the consume is trivial and the run is engine-only; our produce benchmarks must keep the materialization the heavy part (large output / top-N with large LIMIT). +| timestamp(tz) | P1 | P1 | **P1** (pytz/row) | P1 | P1 | +| LIST/STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 | +| hugeint/uuid | P2 | P2 | **P1** (round-trip) | P1 | P2 | +| blob/map | P2 | P2 | P2 | P2 | P2 | +| NULL-heavy | n/a | **P1** | P2 | **P0** (masked_array) | P1 | + +## Mechanics + +- **Walltime vs instruction-count.** Local A/B is walltime only (no Valgrind on macOS arm64). 
CI is + instruction-count via self-hosted Callgrind (near-deterministic, PYTHONHASHSEED pinned), diffed against a + committed baseline. Report-only until trusted. +- **Marker split + auto-move.** Every benchmark is `gate` or `informational` (see README). At baseline regen, + each numeric-produce gate's binding fraction `= 1 - floor_Ir / bench_Ir` is computed against its engine floor + (`test_engine_control_perf`); a gate below the ~25% cutoff is auto-moved to informational (a threshold on an + engine-diluted total is not meaningful). OUT-row fetch and UDFs are ~all binding; numeric produce is a bulk + memcpy of ~engine magnitude (auto-move candidate). +- **Guards.** compare_baseline.py warns and stops enforcing when BENCH_SCALE, the pin file, or the DuckDB + submodule SHA differ from the baseline's (any of those makes the counts non-comparable). +- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain RSS + object-count test for the + object-pinning paths, since a per-call refcount imbalance is invisible to a steady-state benchmark. +- **Memory mode** (a second Callgrind sweep for O(rows) produce peak-RSS) is designed but deferred; the + `test_mem_df_with_nulls` tracemalloc guard is the local stand-in. + +## Cross-check vs iqmo-org/bareduckdb + +Their suite is a SQL-file-driven A/B comparing two clients (production `duckdb` vs the C-API prototype), arrow-in +/ arrow-out only, no fetchall/df/numpy/native/UDF coverage. So our binding suite is far broader; their genuine +deltas concentrate in PRODUCE/types. Actionable additions they suggest: + +- **hugeint / uuid in the produce matrix** (they select both): OUT-row does a per-value string round-trip, distinct + from narrow int. Now in `test_produce_numpy_perf` / `test_fetch_perf`. +- **int128-internal decimal** (`DECIMAL(28,x)`) alongside the int64-internal one: hits a wider cast path. Added. 
+- **heterogeneous mixed-type row**: exercises per-cell type dispatch in the Fetchone loop, unlike homogeneous + columns. Added as `test_fetchall_mixed_wide`. +- **long varchar (>64 char)** alongside the short string: shifts string copy / transcode toward copy-bound. Added + as `varchar_long` in the matrix. +- **result-cardinality (top-N) sweep**: holds engine work ~constant while sweeping rows-to-Python. Adopted as + `test_cardinality_perf` (plain LIMIT, no ORDER BY; the sort swamped the signal). +- **peak-memory guard** on the O(rows) produce paths: a conversion regression is often memory-shaped. Partially + covered by the tracemalloc guard; full coverage waits on memory mode. + +Out of scope (theirs, not adopted): pure-engine filter/group/window workloads; 100M+ row scale (IO/engine +dominated); the free-threading category (unsupported by this client). Do NOT adopt their no-warmup single-run +methodology (charges import-cache population into the measurement). diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..ca8f8355 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,34 @@ +# Benchmark suite + +CodSpeed micro-benchmarks for the binding hot paths (produce, ingest, UDF). +Design rationale: [PLAN.md](PLAN.md). CI: [../.github/workflows/codspeed.yml](../.github/workflows/codspeed.yml). + +## Markers + +Every benchmark carries exactly one (registered in `conftest.py`): + +- **gate**: binding-dominated, GIL-held, deterministic under Callgrind. A threshold breach is a binding regression. +- **informational**: engine/library/streaming-diluted. Reported, never gated (would false-positive on engine bumps). + +## Local A/B (walltime) + +Only walltime runs locally (no Valgrind on macOS arm64; instruction-count gating is Linux/CI-only, and walltime is +noisy on sub-ms benches). 
Pin the data libs identically across both builds so the delta is pure binding: + +```bash +for P in ../main/.venv-release/bin/python .venv-release/bin/python; do + $P -m pytest benchmarks/<module>.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider +done +``` + +## Conventions + +- READ aggregates real columns (`sum`/`length`), never `count(*)` (answered from metadata). +- WRITE fully materializes the result or drains the lazy reader. +- Warm once before measuring. +- `con` fixture pins `threads=1` (see `conftest.py`). + +Two traps (a benchmark that skips these silently measures the wrong thing): + +- OUT-col null benches need REAL nulls (`CASE WHEN ... THEN NULL`), else the cheap `std::move` path is taken. +- IN-numpy string benches need mixed ASCII + non-ASCII + a null sentinel, else the transcode/null ladder is skipped. diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py index b641662f..a4049aa9 100644 --- a/benchmarks/_scale.py +++ b/benchmarks/_scale.py @@ -1,17 +1,12 @@ -"""Env-gated row-count scaling for the benchmark suite (INFRA-4). - -Callgrind is 20-50x, and the O(rows) / per-row-object benchmarks at full N make the CI sweep too slow. `scaled(n)` -shrinks those row counts ONLY when an explicit `BENCH_SCALE=` env var is set (which the CI Callgrind -sweep sets). Unset -> full N, so LOCAL walltime A/B keeps the large N unchanged. - -CRITICAL: a gate benchmark and the engine-control floor it is compared against (the FLOOR_MAP pairs in -compare_baseline.py) share the same base N, so routing BOTH through `scaled()` keeps them at an identical scaled -N -- the Option-B binding_fraction stays valid. Scaling ONLY reduces row counts; it must never change the data -patterns the benchmarks depend on (real NULLs, mixed ASCII+non-ASCII+null, LIMIT-no-ORDER-BY, warm-before-measure).
- -A floor keeps a scaled benchmark row-dominated (well above the range(2048) fixed-cost probes), so per-element -work still dominates and the fraction/signal stay meaningful. The small-N `*_gate` probes are NOT routed through -this (they are already fast and are the fixed-cost baseline). +"""Env-gated row-count scaling for the benchmark suite. + +Callgrind is 20-50x, so the O(rows) benches at full N make the CI sweep too slow. `scaled(n)` shrinks row counts +ONLY when `BENCH_SCALE=<value>` is set (which the CI sweep sets); unset -> full N, so local walltime A/B is +unchanged. A gate bench and the engine floor it is compared against share a base N, so routing BOTH through +`scaled()` keeps them at an identical scaled N and the binding fraction stays valid. Scaling reduces row counts +only; it must never change the data patterns the benches depend on (real nulls, mixed ASCII, LIMIT-no-ORDER-BY). +A floor keeps a scaled bench row-dominated so per-element work still dominates; the small-N `*_gate` probes are +already fast and are NOT scaled. """ from __future__ import annotations diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py index 85e96c43..ab9773f9 100644 --- a/benchmarks/compare_baseline.py +++ b/benchmarks/compare_baseline.py @@ -1,27 +1,17 @@ #!/usr/bin/env python3 -"""Committed-baseline instruction-count comparison for the CodSpeed benchmark suite. - -WHY / HOW (grounded, verified on a Linux+valgrind box): - The suite runs under `valgrind --tool=callgrind` with pytest-codspeed. pytest-codspeed's hooks call - `callgrind_dump_stats_at()` at the end of each benchmark, so callgrind writes ONE dump file per - benchmark, headed by `desc: Trigger: Client Request: ` with the instruction count on the `totals:` - line (`events: Ir`). The hooks also obj-skip libpython, so counts are clean. NO CodSpeed account, token, or - runner binary is involved -- this parses the raw callgrind dumps directly.
- - Observed run-to-run noise on that box was ~0.1% (callgrind is near-deterministic, not bit-identical), so the - default gate threshold (5%) sits far above noise. PYTHONHASHSEED is pinned in CI to keep dict/struct paths - stable. - -TWO MODES: - regen -- build benchmarks/baseline.json from a fresh valgrind run: per-benchmark instruction counts + - provenance meta + (for the mapped numeric-produce gates) the engine-diluted binding fraction, and - the Option-B auto-move of any gate below the cutoff to `informational`. - compare -- parse a fresh valgrind run, diff each benchmark against baseline.json, and print a report. GATE - benchmarks over their threshold are regressions; `informational` benchmarks are reported only. - REPORT-ONLY by default (always exit 0); `--enforce` exits non-zero on a gate regression. - -Both are CI-only in practice (no valgrind on macOS arm64). baseline.json and benchmarks/requirements-bench.txt -are regenerated together (same job) so the counts always correspond to the frozen data-lib pins. +"""Committed-baseline instruction-count comparison for the benchmark suite. See benchmarks/README.md. + +pytest-codspeed's hooks call `callgrind_dump_stats_at()` per benchmark, so callgrind writes ONE dump each, +headed by `desc: Trigger: Client Request: ` with the count on `totals:` (`events: Ir`). This parses those +raw dumps directly (no CodSpeed account/token/runner). Run-to-run noise is ~0.1%, so the 5% gate threshold sits +far above it (PYTHONHASHSEED pinned in CI). + +Two modes (CI-only; no valgrind on macOS arm64): + regen: write baseline.json from a fresh run: counts + provenance + binding fractions + auto-move. + compare: diff a fresh run against baseline.json. Gate benches over threshold are regressions; informational + are reported only. Report-only by default; `--enforce` exits non-zero on a gate regression. + +baseline.json and benchmarks/requirements-bench.txt are regenerated together so counts match the frozen pins. 
""" from __future__ import annotations @@ -37,14 +27,13 @@ SCHEMA_VERSION = 1 GATE_DEFAULT_THRESHOLD_PCT = 5.0 -BINDING_FRACTION_CUTOFF = 0.25 # Option-B: a gate whose isolable binding fraction is below this is auto-moved -# to informational (a threshold on its engine-diluted total is not meaningful). - -# Option-B floor map: the engine-control benchmark whose instruction count is the "engine floor" of a given -# numeric-produce gate. binding_fraction = 1 - floor_Ir / bench_Ir. ONLY the numeric-produce benches are listed: -# MEAS-1 showed their per-element binding is a bulk memcpy (~engine magnitude); every other gate (OUT-row fetch -# of any type, string/nested/decimal/hugeint/uuid produce, UDFs, native ingest, analyzer bind) is high-binding -# and needs no fraction. Add a mapping (and, if needed, an engine floor) here to evaluate more benches. +BINDING_FRACTION_CUTOFF = 0.25 # a gate whose isolable binding fraction is below this is auto-moved to +# informational (a threshold on its engine-diluted total is not meaningful). + +# Floor map: the engine-control bench that is the "engine floor" of a numeric-produce gate. +# binding_fraction = 1 - floor_Ir / bench_Ir. ONLY numeric-produce benches are listed (their per-element binding +# is a bulk memcpy of ~engine magnitude); every other gate is high-binding and needs no fraction. Add a mapping +# (and, if needed, a floor) to evaluate more benches. _E = "benchmarks/test_engine_control_perf.py" FLOOR_MAP = { "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k", @@ -65,11 +54,7 @@ def _normalize_uri(raw: str) -> str: - """Return a repo-relative benchmark key. - - Inside a git repo pytest-codspeed already emits a git-relative uri (e.g. `benchmarks/x.py::test[p]`); this - defensively strips a leading absolute path if the run happened outside a git repo. 
- """ + """Return a repo-relative benchmark key (strip a leading absolute path if the run was outside a git repo).""" raw = raw.strip() if "::" not in raw: return raw @@ -83,8 +68,8 @@ def _normalize_uri(raw: str) -> str: def parse_profiles(profile_dir: Path) -> dict[str, int]: """Parse every callgrind dump in `profile_dir`; return {benchmark_uri: instruction_count}. - Only dumps whose Trigger is a benchmark Client Request (contains `::`) are kept; the metadata and - program-termination dumps are skipped. If a uri appears more than once (should not happen) the max is kept. + Keeps only dumps whose Trigger is a benchmark Client Request (contains `::`); skips metadata/termination + dumps. If a uri appears more than once (should not happen) the max is kept. """ counts: dict[str, int] = {} files = sorted(profile_dir.rglob("*")) if profile_dir.exists() else [] @@ -236,9 +221,8 @@ def compare(args: argparse.Namespace) -> int: "may not be pure binding. Regenerate the baseline with the current pins." ) - # engine-bump guard: engine-inclusive counts shift when the bundled DuckDB submodule changes, for reasons - # unrelated to the binding. If the current submodule SHA differs from the baseline's, do not treat gate - # deltas as hard failures (they may reflect the engine bump); warn to regenerate the baseline. + # engine-bump guard: engine-inclusive counts shift when the DuckDB submodule changes. If the SHA differs from + # the baseline's, don't treat gate deltas as hard failures (they may reflect the bump); warn to regenerate. engine_changed = bool( args.submodule_sha and meta.get("duckdb_submodule_sha") and args.submodule_sha != meta["duckdb_submodule_sha"] ) diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py index b1ccd604..07cba4e4 100644 --- a/benchmarks/conftest.py +++ b/benchmarks/conftest.py @@ -1,22 +1,8 @@ -"""Shared fixtures + marker registration for the CodSpeed benchmark suite. 
- -Central home (INFRA-6) for the `con` fixture, the `threads=1` isolation default, and the gate/informational -marker registration (INFRA-1). Markers are registered HERE (not via pyproject `markers=`) to keep the suite -self-contained. Registration is REQUIRED: pyproject sets `filterwarnings = ["error"]`, so an unregistered -mark would raise `PytestUnknownMarkWarning` as a collection error. - -Marker semantics - gate Binding-dominated, GIL-held, deterministic under Callgrind (instruction-count). These are the - paths where a threshold breach means a *binding* regression. Gate-able. (Enforcement against a - committed baseline is a later phase; for now they run and report.) - informational Engine/parallel/IO/library-diluted, streaming drains, or arrow-export re-run paths. Reported, - never gated: their instruction count is dominated by non-binding work (engine aggregate, the - bundled DuckDB submodule, pyarrow/polars library code), so gating them would false-positive on - engine/submodule bumps rather than catch binding regressions. - -Every benchmark (a test using the `benchmark` fixture) must carry EXACTLY ONE of these markers so the two CI -steps (`-m gate`, `-m informational`) together cover the suite with no overlap. Non-benchmark guards (e.g. the -tracemalloc assertion in test_produce_numpy_perf.py) are intentionally left unmarked and run in neither step. +"""Shared fixtures + marker registration for the benchmark suite. See benchmarks/README.md. + +Markers are registered here (not via pyproject `markers=`) because pyproject sets `filterwarnings = ["error"]`, +so an unregistered mark would raise as a collection error. Every benchmark must carry EXACTLY ONE of `gate` / +`informational` so the two CI steps (`-m gate`, `-m informational`) cover the suite with no overlap. 
""" from __future__ import annotations @@ -24,7 +10,7 @@ from typing import TYPE_CHECKING import pytest -from _scale import bench_scale, scaled # noqa: F401 (re-exported here as the shared home; used by the modules) +from _scale import bench_scale, scaled # noqa: F401 (re-exported as the shared home; used by the modules) import duckdb @@ -32,13 +18,6 @@ from collections.abc import Iterator -# ENV-GATED ROW COUNTS (INFRA-4): the O(rows) / per-row-object benchmarks route their N through `scaled()` -# (benchmarks/_scale.py). Unset `BENCH_SCALE` -> full N (local walltime A/B is unchanged); the CI Callgrind -# sweep sets `BENCH_SCALE=` to shrink N so the sweep fits under the job timeout. A gate benchmark and -# its engine-control floor (FLOOR_MAP in compare_baseline.py) share a base N, so both scale identically and the -# Option-B binding fraction stays valid. Scaling changes ONLY row counts, never the Do-NOT-regress data patterns. - - def pytest_configure(config: pytest.Config) -> None: """Register the gate/informational markers (required under filterwarnings=error).""" config.addinivalue_line( @@ -55,8 +34,8 @@ def pytest_configure(config: pytest.Config) -> None: def con() -> Iterator[duckdb.DuckDBPyConnection]: """Yield a fresh single-threaded connection, closed on teardown. - `threads=1` pins engine parallelism so per-run instruction counts and walltime do not shift with the CI - runner core count (INFRA-6). The concurrency module (COV-1, a later phase) overrides this deliberately. + `threads=1` pins engine parallelism so counts/walltime don't shift with the runner core count. The + concurrency module overrides this deliberately. 
""" c = duckdb.connect(config={"threads": 1}) yield c diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt index e230d715..8a9f49e6 100644 --- a/benchmarks/requirements-bench.txt +++ b/benchmarks/requirements-bench.txt @@ -1,15 +1,9 @@ -# Frozen, exact pins for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). -# -# WHY a committed pin file (and NOT the gitignored uv.lock, and NOT a re-resolving `>=` group): CodSpeed -# compares instruction counts across runs. If a data lib (numpy/pandas/pyarrow/polars) changed version between -# the baseline run and a later run, that delta would be misattributed to the binding. These pins freeze the data -# libs so the ONLY cross-run delta is the binding. Regenerate this file DELIBERATELY, together with the baseline. -# -# SOURCE OF TRUTH: the human-readable `[dependency-groups] bench` list in pyproject.toml. Regenerate with: +# Frozen pins for the benchmark suite: freezing the data libs means the only cross-run delta is the binding. +# Regenerate DELIBERATELY, together with the baseline. Source of truth: the `[dependency-groups] bench` list in +# pyproject.toml (torch/tensorflow deliberately absent, local-only via importorskip). Regenerate with: # uv pip compile pyproject.toml --group bench \ # --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \ # --no-annotate --no-header -o benchmarks/requirements-bench.txt -# (py3.13 / linux-x86_64 is the CI target.) torch/tensorflow are deliberately absent (local-only via importorskip). iniconfig==2.3.0 markdown-it-py==4.2.0 mdurl==0.1.2 diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py index 0fd8461f..de05f78e 100644 --- a/benchmarks/test_arrow_perf.py +++ b/benchmarks/test_arrow_perf.py @@ -1,14 +1,6 @@ -"""CodSpeed benchmark: Arrow read/write paths. Standalone, not in CI. +"""Arrow read/write: Table + RecordBatchReader + dictionary sweep. See benchmarks/README.md. 
-A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_arrow_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Data must be fully moved or nothing is measured: READ aggregates over real columns (sum/length, not count(*), -which arrow answers from metadata); WRITE materializes the result (to_arrow_reader is lazy, so it is drained). +READ aggregates over real columns (arrow answers count(*) from metadata); WRITE drains the lazy reader. """ from __future__ import annotations @@ -26,22 +18,18 @@ import duckdb -N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) -DICT_UNIQUE = [2, 1_000, 50_000] # cardinality sweep: UNIQUE-value counts (not row counts) -> NOT scaled +N = scaled(500_000) +DICT_UNIQUE = [2, 1_000, 50_000] # UNIQUE-value counts (cardinality sweep), not row counts -> NOT scaled WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" -# informational: every benchmark here is engine-parallel or library/streaming dominated -> reported, not gated. -# READ (sum over registered arrow) -> engine aggregate dominates; the near-zero-copy scan is a small fraction. -# WRITE to_arrow_table/to_arrow_reader/pl() -> PromoteMaterializedToArrow re-runs the query GIL-released -# (engine-parallel), and pl() also runs polars library code. Their counts would trip on engine/submodule -# bumps, not binding regressions. `con` fixture + threads=1 live in conftest.py. +# informational: every bench here is engine-parallel or library/streaming dominated. READ = engine aggregate +# dominates; WRITE (to_arrow/pl) re-runs the query GIL-released. Would trip on engine/submodule bumps, not binding. 
pytestmark = pytest.mark.informational @pytest.fixture(scope="module") def arrow_numeric() -> pa.Table: - """Return a two-column numeric arrow table.""" return pa.table( { "a": pa.array(range(N), type=pa.int64()), @@ -52,20 +40,17 @@ def arrow_numeric() -> pa.Table: @pytest.fixture(scope="module") def arrow_string() -> pa.Table: - """Return a single-column string arrow table.""" return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())}) @pytest.fixture(scope="module") def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]: - """Return the schema and record batches for the numeric table.""" # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000) @pytest.fixture(scope="module") def arrow_dict_tables() -> dict[int, pa.Table]: - """Return dictionary-encoded arrow tables keyed by number of unique values (a cardinality sweep).""" # deterministic indices (i % U) so the instruction count is reproducible (no PRNG) tables = {} for u in DICT_UNIQUE: @@ -75,38 +60,29 @@ def arrow_dict_tables() -> dict[int, pa.Table]: return tables -# --------------------------------------------------------------------------- # -# READ: arrow -> duckdb. The engine must scan every value (sum/length force it). -# --------------------------------------------------------------------------- # +# READ: arrow -> duckdb. sum/length force a full scan. 
def test_read_arrow_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table ) -> None: - """Benchmark scanning a numeric arrow table.""" con.register("t_num", arrow_numeric) - con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall() # warm (MEAS-3) + con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()) def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None: - """Benchmark scanning a string arrow table.""" con.register("t_str", arrow_string) - con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall() # warm (MEAS-3) + con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall() # warm benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()) -# ADDED: RecordBatchReader ingest -- the SAME PythonTableArrowArrayStreamFactory but STREAMING (distinct from -# the materialized Table read above). A fresh reader is built per round (the engine drains it); sum() forces a -# full scan of every value. - - def test_read_arrow_reader_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]], ) -> None: - """Benchmark scanning a streaming record-batch reader.""" + # same factory as the Table read, but STREAMING: a fresh reader per round, drained by the engine schema, batches = arrow_numeric_batches def run() -> list: @@ -118,43 +94,32 @@ def run() -> list: benchmark(run) -# ADDED (COV-4): dictionary-encoded arrow ingest, cardinality sweep (unique in {2, 1k, high}). Mirrors core's -# test_arrow_dictionaries_scan. The engine aggregate dominates (hence informational), but the per-value -# dictionary DECODE in the arrow scan is the binding interest, and its cost slopes with the unique count. 
- - @pytest.mark.parametrize("unique", DICT_UNIQUE) def test_read_arrow_dictionary( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int ) -> None: - """Benchmark scanning a dictionary-encoded arrow column at a given cardinality.""" + # per-value dictionary DECODE cost slopes with the unique count (mirrors core test_arrow_dictionaries_scan) con.register("t_dict", arrow_dict_tables[unique]) con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall() # warm benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()) -# --------------------------------------------------------------------------- # -# WRITE: duckdb -> arrow, consumer fully materializes / fully drains the stream. -# --------------------------------------------------------------------------- # +# WRITE: duckdb -> arrow, consumer fully materializes / drains the stream. def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a numeric result to an arrow table.""" benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table()) def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a string result to an arrow table.""" benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table()) def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark draining a lazy arrow record-batch reader.""" - def run() -> int: reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000) rows = 0 - for batch in reader: # drain the lazy stream so duckdb actually produces every batch + for batch in reader: # drain the lazy stream so duckdb produces every batch rows += batch.num_rows return rows @@ -162,10 +127,8 @@ def run() -> int: def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - 
"""Benchmark materializing a numeric result to a polars frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).pl()) def test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a string result to a polars frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).pl()) diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py index 6e7af136..751c6cf5 100644 --- a/benchmarks/test_cardinality_perf.py +++ b/benchmarks/test_cardinality_perf.py @@ -1,16 +1,8 @@ -"""CodSpeed benchmark: the result-cardinality (rows-to-Python) sweep. Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_cardinality_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Sweeps `SELECT * FROM src LIMIT n` (no ORDER BY) over a pre-materialized 3-column source: a plain LIMIT -early-stops the scan, so the per-row conversion dominates and the slope is monotone in n. A steeper slope on -one build is a per-row conversion regression. n=100 is the overhead regime, n=100_000 is throughput. -(An earlier ORDER BY version was dropped: the top-N sort swamped the signal.) +"""Result-cardinality (rows-to-Python) sweep via LIMIT n, no ORDER BY. See benchmarks/README.md. + +`SELECT * FROM src LIMIT n` early-stops the scan, so per-row conversion dominates and the slope is monotone in n. +A steeper slope on one build is a per-row conversion regression. n=100 is overhead, n=100_000 is throughput. +(An ORDER BY version was dropped: the top-N sort swamped the signal.) 
""" from __future__ import annotations @@ -27,18 +19,15 @@ from pytest_codspeed import BenchmarkFixture -# env-gated (INFRA-4): scale the source rows AND the top-N of the sweep by the same factor, keeping the small-N -# points fixed and SRC_ROWS >= max(LIMITS). Preserves the LIMIT-no-ORDER-BY early-stop pattern (Do-NOT-regress). +# scale the source rows AND the top-N by the same factor, keeping small-N points fixed and SRC_ROWS >= max(LIMITS). SRC_ROWS = scaled(200_000) LIMITS = [100, 1_000, 10_000, scaled(100_000)] @pytest.fixture(scope="module") def con() -> Iterator[duckdb.DuckDBPyConnection]: - """Yield a connection over a once-materialized source table.""" - # Fixed source materialized ONCE (module-scoped): building it per test would add noise, and it must be - # identical across the n sweep. `SELECT * FROM src LIMIT n` then reads only the first n rows. - c = duckdb.connect(config={"threads": 1}) # pin engine parallelism (INFRA-6); module-scoped source table + # source materialized ONCE (module-scoped) and identical across the n sweep; per-test build would add noise + c = duckdb.connect(config={"threads": 1}) c.execute( "CREATE TABLE src AS " f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)" @@ -48,15 +37,12 @@ def con() -> Iterator[duckdb.DuckDBPyConnection]: def _query(n: int) -> str: - # No ORDER BY: a plain LIMIT early-stops the scan at n rows -> engine cost cheap and monotone in n, so the - # per-row binding conversion dominates the n-varying signal (unlike the old ORDER BY top-N sort). 
return f"SELECT a, b, s FROM src LIMIT {n}" -@pytest.mark.gate # fetchall materializes n rows to Python -> binding-dominated; small-n end is the noise-free gate +@pytest.mark.gate # fetchall materializes n rows -> binding-dominated; small-n end is the noise-free gate @pytest.mark.parametrize("n", LIMITS) def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: - """Benchmark fetchall over a LIMIT n sweep.""" q = _query(n) con.execute(q).fetchall() # warm benchmark(lambda: con.execute(q).fetchall()) @@ -65,7 +51,6 @@ def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect @pytest.mark.gate # df() materializes n rows to numpy columns -> binding-dominated @pytest.mark.parametrize("n", LIMITS) def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: - """Benchmark df() over a LIMIT n sweep.""" q = _query(n) con.sql(q).df() # warm benchmark(lambda: con.sql(q).df()) @@ -74,7 +59,6 @@ def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n @pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated @pytest.mark.parametrize("n", LIMITS) def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: - """Benchmark to_arrow_table() over a LIMIT n sweep.""" q = _query(n) con.sql(q).to_arrow_table() # warm benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py index 8be28619..c55b0274 100644 --- a/benchmarks/test_concurrency_perf.py +++ b/benchmarks/test_concurrency_perf.py @@ -1,30 +1,16 @@ -"""CodSpeed benchmark: concurrency / GIL pressure (COV-1). informational / WALLTIME. Standalone, not gated. 
- -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_concurrency_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -This is the ONE dimension the rest of the suite (single-threaded) cannot see: Python objects threading through -PARALLEL core execution. It varies `SET threads` and measures the binding under parallel scan / parallel UDF -invocation. All benchmarks are `informational` and their PRIMARY signal is LOCAL WALLTIME: - * scan benches -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup. - * native UDF -> ~flat scaling = the GIL tax on per-row Python calls (the engine scan is parallel - but the GIL serializes the calls). - * arrow (vectorized) UDF -> observed NEGATIVE scaling (slower with more threads): per-chunk convert + GIL - contention. A regression here would deepen the negative slope. - -Under the CI `-m informational` step these run in `simulation` (Callgrind), which SERIALIZES threads -- so the -wall-clock contention is NOT visible there; instead the deterministic instruction count captures the per-batch -Produce GIL calls and the UDF dispatch overhead. Never gated either way. - -GOTCHA (verified locally, mirrors the suite's other "measure the right thing" traps): a SINGLE-BATCH arrow table -does NOT parallelize (one batch = one serial scan unit; flat across threads). The arrow scan bench MUST use a -MULTI-BATCH table (`from_batches` with a modest chunksize) or it silently measures a serial scan. A CPU-heavy -aggregate is also required: a cheap sum is memory-bandwidth-bound and will not parallelize, so there is nothing -to contend on. +"""Concurrency / GIL pressure across thread counts. Walltime-only, never gated. See benchmarks/README.md. 
+ +The ONE dimension the single-threaded rest of the suite cannot see: Python objects threading through PARALLEL +core execution. Primary signal is LOCAL WALLTIME: + * scan benches -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup. + * native UDF -> ~flat scaling = the GIL tax on per-row Python calls. + * arrow UDF -> observed NEGATIVE scaling (per-chunk convert + GIL contention). + +Under CI Callgrind threads are serialized, so wall-clock contention is invisible there; the deterministic count +still captures per-batch Produce GIL calls + UDF dispatch. Never gated either way. + +GOTCHA: a SINGLE-BATCH arrow table does NOT parallelize (one batch = one serial scan unit). The arrow scan bench +MUST use a MULTI-BATCH table AND a CPU-heavy aggregate (a cheap sum is bandwidth-bound and won't parallelize). """ from __future__ import annotations @@ -44,23 +30,21 @@ import numpy as np # noqa: E402 (after importorskip, matching the suite convention) import pandas as pd # noqa: E402 -# informational: concurrency benchmarks are never gated (walltime-noisy; under Callgrind, thread-serialized). pytestmark = pytest.mark.informational N_SCAN = 1_000_000 -BATCH = 20_000 # -> 50 record batches; MULTI-BATCH is required for the arrow scan to parallelize (see GOTCHA) +BATCH = 20_000 # -> 50 record batches; MULTI-BATCH required for the arrow scan to parallelize (see GOTCHA) N_UDF_NATIVE = 200_000 # native UDF = one Python call per row; keep modest (Callgrind instruments every call) N_UDF_ARROW = 1_000_000 # arrow UDF = one call per chunk (vectorized) THREADS = [1, 4, 8] -# CPU-heavy aggregate so the parallel scan actually engages worker threads (a cheap sum is bandwidth-bound and -# would not parallelize -> no contention to measure). The binding signal is the per-batch Produce GIL handoff. +# CPU-heavy aggregate so the parallel scan engages worker threads. The binding signal is the per-batch Produce +# GIL handoff. 
HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)" @pytest.fixture(scope="module") def arrow_multibatch() -> pa.Table: - """Return a MULTI-batch arrow table (single-batch would scan serially -- see module GOTCHA).""" a = pa.array(np.arange(N_SCAN), type=pa.int64()) b = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64()) return pa.Table.from_batches(pa.table({"a": a, "b": b}).to_batches(max_chunksize=BATCH)) @@ -68,19 +52,15 @@ def arrow_multibatch() -> pa.Table: @pytest.fixture(scope="module") def pandas_frame() -> pd.DataFrame: - """Return a numpy-backed pandas frame (its scan parallelizes across worker threads).""" return pd.DataFrame({"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5}) -# --------------------------------------------------------------------------- # -# Parallel SCAN: Python objects (arrow batches / pandas chunks) pulled through the binding by engine worker -# threads under a CPU-heavy aggregate. The scan Produce acquires/releases the GIL per batch across threads. -# --------------------------------------------------------------------------- # +# Parallel SCAN: arrow batches / pandas chunks pulled through the binding by engine worker threads; the scan +# Produce acquires/releases the GIL per batch across threads. 
@pytest.mark.parametrize("threads", THREADS) def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None: - """Benchmark a parallel aggregate pulling arrow batches across threads.""" con = duckdb.connect(config={"threads": threads}) try: con.register("t", arrow_multibatch) @@ -93,7 +73,6 @@ def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.T @pytest.mark.parametrize("threads", THREADS) def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None: - """Benchmark a parallel aggregate pulling pandas chunks across threads.""" con = duckdb.connect(config={"threads": threads}) try: con.register("t", pandas_frame) @@ -104,15 +83,12 @@ def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.Data con.close() -# --------------------------------------------------------------------------- # # Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF -# from multiple worker threads. Native = per-row Python call under the GIL (GIL tax); arrow = per-chunk convert. -# --------------------------------------------------------------------------- # +# from multiple worker threads. Native = per-row call under the GIL (GIL tax); arrow = per-chunk convert. 
@pytest.mark.parametrize("threads", THREADS) def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None: - """Benchmark a native Python UDF invoked from parallel worker threads (GIL tax).""" con = duckdb.connect(config={"threads": threads}) try: con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)") # materialized -> parallel scan @@ -125,7 +101,6 @@ def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None: @pytest.mark.parametrize("threads", THREADS) def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None: - """Benchmark a vectorized arrow UDF invoked from parallel worker threads.""" con = duckdb.connect(config={"threads": threads}) try: con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)") # materialized -> parallel scan diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py index febd7ba4..faee4de4 100644 --- a/benchmarks/test_engine_control_perf.py +++ b/benchmarks/test_engine_control_perf.py @@ -1,17 +1,8 @@ -"""CodSpeed benchmark: pure-ENGINE control (no Python egress). Standalone, not in CI's binding gate. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_engine_control_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -These `SELECT sum(...) FROM range(N)` queries aggregate to a single scalar, so the fetchall of the result is -negligible: they measure SQL compile + the engine aggregate with (almost) ZERO per-row Python egress. They are -the "engine floor" reference for MEAS-1: comparing a produce/fetch/ingest benchmark against the matching-N floor -here quantifies how much of that benchmark's cost is the binding vs the engine. 
They are `informational` (they -measure the engine, not the binding, so they must never gate). +"""Pure-engine floor (no Python egress): the binding-fraction reference. See benchmarks/README.md. + +`SELECT sum(...) FROM range(N)` aggregates to one scalar, so the fetch is negligible: these measure SQL compile + +the engine aggregate with ~zero per-row egress. Comparing a produce/fetch bench against the matching-N floor here +quantifies how much of its cost is binding vs engine. Informational (they measure the engine), never gated. """ from __future__ import annotations @@ -26,19 +17,14 @@ import duckdb -# informational: pure-engine reference, never gated. `con` fixture + threads=1 live in conftest.py. pytestmark = pytest.mark.informational -# Matched to the N of the fetch/produce/ingest/udf benchmarks so the floors line up for MEAS-1 subtraction and, -# at baseline regen, for the Option-B binding-fraction of the numeric-produce gates (see compare_baseline.py). -# CRITICAL: these floors go through scaled() with the SAME base N as the benchmarks they floor, so under -# BENCH_SCALE the floor and its benchmark stay at an identical N and the fraction stays valid. The 2048 small-N -# floor is NOT scaled (it is the fixed-cost baseline for the *_gate probes). -Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)" # small-N gate floor (compile-dominated), NOT scaled +# N matched to the benches these floor, and routed through scaled() with the SAME base N, so the floor and its +# bench stay at an identical scaled N and the binding fraction stays valid. The 2048 small-N floor is NOT scaled. 
+Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)" # small-N gate floor (compile-dominated) Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)" # types-matrix numeric-df floor Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)" # fetch / native-UDF floor -# produce/ingest floor -Q_2C_500K = ( +Q_2C_500K = ( # produce/ingest floor f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))" ) @@ -49,20 +35,16 @@ def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: s def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Engine floor: compile + sum over range(2048), no egress.""" _bench(benchmark, con, Q_1C_SMALL) def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Engine floor: compile + sum over range(100k), no egress.""" _bench(benchmark, con, Q_1C_100K) def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Engine floor: compile + sum over range(200k), no egress.""" _bench(benchmark, con, Q_1C_200K) def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Engine floor: compile + 2-col sum over range(500k), no egress.""" _bench(benchmark, con, Q_2C_500K) diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py index 9820db6d..1aa5f4fe 100644 --- a/benchmarks/test_fetch_perf.py +++ b/benchmarks/test_fetch_perf.py @@ -1,15 +1,4 @@ -"""CodSpeed benchmark: row fetch paths (fetchall, fetchone iteration, expression construction). Standalone, not in CI. 
- -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_fetch_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Only walltime works locally (no Valgrind on macOS arm64); the deterministic instruction-count mode needs Linux (CI). -Walltime is noisy on sub-ms benchmarks. -""" +"""OUT-row fetch: fetchall, fetchone/fetchmany loops, wide/expensive scalar types. See benchmarks/README.md.""" from __future__ import annotations @@ -23,14 +12,12 @@ import duckdb -# gate: OUT-row fetch fully materializes every row to Python -> binding-dominated, GIL-held; the engine side is -# a cheap range() scan. Deterministic under Callgrind -> instruction-count gate-able. (The small-N *_gate tests -# are the compile+fetch fixed-cost variants; see MEAS-1.) The `con` fixture + threads=1 live in conftest.py. +# gate: OUT-row fetch materializes every row to Python (binding-dominated); the range() scan is cheap. pytestmark = pytest.mark.gate -# env-gated row counts (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. The 2048 -# small-N *_gate probes are intentionally NOT scaled (they are the compile+fetch fixed-cost baseline). -N_ROW = scaled(200_000) # per-row-object numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128) +# scaled() shrinks N under BENCH_SCALE in the CI sweep; full N locally. The range(2048) *_gate probes are the +# compile+fetch fixed-cost baseline and are deliberately NOT scaled. 
+N_ROW = scaled(200_000) # numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128) N_STR = scaled(100_000) # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops N_NEST = scaled(50_000) # heterogeneous scalar/list/struct row @@ -41,32 +28,26 @@ def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a single BIGINT column.""" _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)") def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a single INTEGER column.""" _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)") def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a single DOUBLE column.""" _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)") def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of two BIGINT columns.""" _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)") def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a single VARCHAR column.""" _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)") def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a mixed scalar/list/struct row.""" query = ( "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, " f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)" @@ -75,7 +56,6 @@ def test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnect def test_fetchone_iter(benchmark: BenchmarkFixture, con: 
duckdb.DuckDBPyConnection) -> None: - """Benchmark iterating a result one row at a time with fetchone.""" query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" def run() -> None: @@ -86,52 +66,40 @@ def run() -> None: benchmark(run) -# --------------------------------------------------------------------------- # -# small-N COMPILE+FETCH FIXED-COST variants: at range(2048) the measured region is dominated by SQL front-end -# compilation + the engine, NOT fetch. MEAS-1 walltime split (vs the range(2048) engine floor in -# test_engine_control_perf.py): ~40% fetch fixed-cost, ~60% compile+engine. They still catch a fixed-cost -# regression, but they are compile+fetch fixed-cost gates, not pure-fetch gates. Plus expensive scalar OUT-row -# types (timestamptz pytz-per-row, blob, null-heavy), a heterogeneous per-cell-dispatch row -# (hugeint+uuid+decimal128+varchar, distinct from the homogeneous columns), and the batched fetchmany loop. -# --------------------------------------------------------------------------- # +# small-N *_gate variants: at range(2048) the measured region is ~60% SQL compile + engine, ~40% fetch, so these +# catch a fixed-cost regression (not a pure per-row one). Plus expensive scalar types (timestamptz pytz-per-row, +# blob, null-heavy), a heterogeneous per-cell-dispatch row, and the batched fetchmany loop. 
def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the small-N BIGINT compile+fetch fixed-cost (MEAS-1: ~60% compile+engine, ~40% fetch).""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)") def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the small-N two-BIGINT compile+fetch fixed-cost.""" _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)") def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a half-NULL BIGINT column.""" _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)") def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a TIMESTAMPTZ column.""" _bench_fetchall( benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)" ) def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a 128-bit DECIMAL column.""" _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)") def test_fetchall_blob(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a BLOB column.""" _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)") def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchall of a heterogeneous wide-type row.""" - # heterogeneous row -> per-cell type dispatch in the Fetchone column loop (distinct branch/cache profile - # from the homogeneous single-type columns above) + # heterogeneous row: per-cell type dispatch in the Fetchone loop (distinct branch/cache profile 
from the + # homogeneous single-type columns above) query = ( "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, " f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)" @@ -140,7 +108,6 @@ def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyCo def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark draining a result with batched fetchmany.""" query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" def run() -> None: diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py index c54ddba7..3478ea1c 100644 --- a/benchmarks/test_ingest_native_perf.py +++ b/benchmarks/test_ingest_native_perf.py @@ -1,15 +1,7 @@ -"""CodSpeed benchmark: native Python-object ingest (list/tuple/dict -> duckdb). Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_ingest_native_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done +"""Native Python-object ingest: values() list/tuple/dict, executemany. See benchmarks/README.md. Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells. -executemany writes to a real table (CREATE OR REPLACE each round so it doesn't grow across repeats). """ from __future__ import annotations @@ -24,49 +16,39 @@ import duckdb -# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. 
EXECMANY_N = scaled(20_000) # executemany re-binds + executes per row, keep moderate WIDE_N = scaled(10_000) # values() builds a 1-row x N-col relation; cap N so the binder stays sane -# gate: native ingest eagerly transforms every cell (TransformPythonValue) / re-binds per row (executemany); -# the engine side (a trivial INSERT or a 1-row-wide fetchall drain) is negligible -> binding-dominated, GIL-held, -# deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py. +# gate: native ingest eagerly transforms every cell / re-binds per row; the engine side is negligible. pytestmark = pytest.mark.gate @pytest.fixture(scope="module") def rows_3col() -> list[tuple[int, float, str]]: - """Return parameter rows for a 3-column executemany.""" return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)] @pytest.fixture(scope="module") def scalars_wide() -> list[int]: - """Return a wide row of scalar ints for values().""" return list(range(WIDE_N)) @pytest.fixture(scope="module") def tuples_wide() -> list[tuple[int, int, int]]: - """Return a wide row of tuples for values().""" return [(i, i + 1, i + 2) for i in range(WIDE_N)] @pytest.fixture(scope="module") def dicts_wide() -> list[dict[str, int | str]]: - """Return a wide row of dicts for values().""" return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)] -# --------------------------------------------------------------------------- # -# executemany: bind + execute one parameter set per row, into a real table. -# --------------------------------------------------------------------------- # +# executemany: bind + execute one parameter set per row, into a real table (CREATE OR REPLACE so it doesn't grow). 
def test_ingest_executemany_3col( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]] ) -> None: - """Benchmark executemany INSERT of 3-column rows.""" con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)") con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col) # warm @@ -77,15 +59,12 @@ def run() -> None: benchmark(run) -# --------------------------------------------------------------------------- # # values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip. -# --------------------------------------------------------------------------- # def test_ingest_values_scalars( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int] ) -> None: - """Benchmark values() over a wide row of scalars.""" con.values(scalars_wide).fetchall() # warm benchmark(lambda: con.values(scalars_wide).fetchall()) @@ -93,7 +72,6 @@ def test_ingest_values_scalars( def test_ingest_values_tuples( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]] ) -> None: - """Benchmark values() over a wide row of tuples.""" # each tuple cell -> LIST value (TransformPythonValue recursion) con.values(tuples_wide).fetchall() # warm benchmark(lambda: con.values(tuples_wide).fetchall()) @@ -102,7 +80,6 @@ def test_ingest_values_tuples( def test_ingest_values_dicts( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]] ) -> None: - """Benchmark values() over a wide row of dicts.""" # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion) con.values(dicts_wide).fetchall() # warm benchmark(lambda: con.values(dicts_wide).fetchall()) diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py index 73b99d0d..61244d2c 100644 --- a/benchmarks/test_ingest_numpy_perf.py +++ b/benchmarks/test_ingest_numpy_perf.py @@ -1,16 +1,7 @@ -"""CodSpeed 
benchmark: numpy ingest paths (numpy / numpy-backed pandas -> duckdb). Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_ingest_numpy_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Covers the object-string scan (ASCII zero-copy vs transcode ladder), the NaN->NULL float loop, the masked -scan, and analyzer bind. Gotchas: the object-string benchmark MUST mix ASCII + non-ASCII + a null or it misses -the ladder; analyzer bind is the one place count(*) is correct (cost is at bind, not scan) while every other -READ aggregates over real columns. +"""numpy ingest: object-string scan, NaN-to-NULL, masked scan, analyzer bind. See benchmarks/README.md. + +Gotchas: the object-string bench MUST mix ASCII + non-ASCII + a null or it misses the transcode ladder (see +README traps); analyzer bind is the one place count(*) is correct (cost is at bind, not scan). """ from __future__ import annotations @@ -28,43 +19,37 @@ import duckdb -# env-gated (INFRA-4): scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below. +# scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below. N = scaled(500_000) ANALYZER_N = scaled(200_000) -# Registered explicitly via con.register (MEAS-3) rather than resolved by replacement-scan frame inspection. NPDICT = {"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5} -# Mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only). 
+# mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only) _MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None] _MIXED_STRINGS = [None if _MIXED[i % 5] is None else f"{_MIXED[i % 5]}{i}" for i in range(N)] -# Mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind. +# mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind _MIXED_TYPES = [(i if i % 3 == 0 else (float(i) if i % 3 == 1 else f"s{i}")) for i in range(ANALYZER_N)] - -# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()`/`sum(length())` over a registered -# frame) are engine-aggregate dominated -> informational. The analyzer BIND (count(*), no scan) is a pure -# per-bind binding cost -> gate. +# READ (sum over a registered frame) is engine-aggregate dominated -> informational. The analyzer BIND (count(*), +# no scan) is a pure per-bind binding cost -> gate. 
@pytest.fixture(scope="module") def df_double_with_nan() -> pd.DataFrame: - """Return a numpy-backed double frame with real NaNs.""" a = np.arange(N, dtype="float64") * 1.5 - a[::10] = np.nan # real NaNs -> NaN->NULL conversion loop + a[::10] = np.nan # real NaNs -> NaN-to-NULL conversion loop return pd.DataFrame({"a": a}) @pytest.fixture(scope="module") def df_object_string_mixed() -> pd.DataFrame: - """Return an object-string frame mixing ASCII, non-ASCII, and nulls.""" return pd.DataFrame({"s": pd.array(_MIXED_STRINGS, dtype=object)}) @pytest.fixture(scope="module") def df_masked_int() -> pd.DataFrame: - """Return a nullable-Int64 frame that scans masked.""" # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask arr = pd.array(np.arange(N), dtype="Int64") arr[::10] = pd.NA @@ -73,20 +58,15 @@ def df_masked_int() -> pd.DataFrame: @pytest.fixture(scope="module") def df_object_mixed_types() -> pd.DataFrame: - """Return an object frame of mixed python types for analyzer bind.""" return pd.DataFrame({"v": pd.array(_MIXED_TYPES, dtype=object)}) -# --------------------------------------------------------------------------- # -# READ: numpy -> duckdb. Engine scans every value (sum/length force it). -# --------------------------------------------------------------------------- # +# READ: numpy -> duckdb. sum/length force a full scan. @pytest.mark.informational def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark scanning a registered numpy dict-of-arrays.""" - # MEAS-3: register explicitly (not frame-inspection replacement scan) and warm the query before measuring. 
- con.register("npdict", NPDICT) + con.register("npdict", NPDICT) # register explicitly, not via replacement-scan frame inspection con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM npdict").fetchall()) @@ -95,9 +75,8 @@ def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDB def test_read_numpy_double_with_nan( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame ) -> None: - """Benchmark scanning a numpy double column with NaNs.""" con.register("t", df_double_with_nan) - con.execute("SELECT sum(a) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT sum(a) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) @@ -105,9 +84,8 @@ def test_read_numpy_double_with_nan( def test_read_numpy_masked_int( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame ) -> None: - """Benchmark scanning a masked nullable-int column.""" con.register("t", df_masked_int) - con.execute("SELECT sum(a) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT sum(a) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a) FROM t").fetchall()) @@ -115,23 +93,19 @@ def test_read_numpy_masked_int( def test_read_numpy_object_string_mixed( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame ) -> None: - """Benchmark scanning a mixed object-string column.""" con.register("t", df_object_string_mixed) - con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) -# --------------------------------------------------------------------------- # -# BIND: PandasAnalyzer sampling cost. 
count(*) is correct HERE ONLY -- the cost is at bind, not scan, so we -# must NOT force a scan (that would drown the per-bind analyzer signal). Re-binds the object column each call. -# --------------------------------------------------------------------------- # +# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY: the cost is at bind, so forcing a scan would +# drown the per-bind signal. Re-binds the object column each call. -@pytest.mark.gate # count(*) forces no scan -> the measured cost is the PandasAnalyzer per-bind sampling (binding) +@pytest.mark.gate def test_bind_analyzer_object( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame ) -> None: - """Benchmark the analyzer bind of a mixed-type object column.""" con.register("t", df_object_mixed_types) con.execute("SELECT count(*) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall()) diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py index 168f1a3d..4edc78dc 100644 --- a/benchmarks/test_pandas_perf.py +++ b/benchmarks/test_pandas_perf.py @@ -1,15 +1,6 @@ -"""CodSpeed benchmark: pandas read/write, numpy-backed vs arrow-backed DataFrames. Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_pandas_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -The binding path depends on column backing: numpy-backed columns take the NumpyArray scan path, arrow-backed -(pandas ArrowDtype) take the near-zero-copy arrow path. Full consume: READ aggregates over real columns (not -count(*)), WRITE materializes the whole frame. +"""pandas read/write, numpy-backed vs arrow-backed frames. See benchmarks/README.md. 
+ +Column backing selects the path: numpy-backed -> NumpyArray scan; arrow-backed (ArrowDtype) -> zero-copy arrow. """ from __future__ import annotations @@ -28,32 +19,28 @@ import duckdb -N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +N = scaled(500_000) WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" _STRINGS = [f"str_value_{i}" for i in range(N)] -# `con` fixture + threads=1 live in conftest.py. READ benchmarks (`sum()` over a registered frame) are -# engine-aggregate dominated -> informational. Only the NUMPY-backed df() WRITE is binding-dominated -> gate. -# The arrow-backed WRITE goes through to_arrow_table().to_pandas() (pyarrow library code, MEAS-2) -> informational. +# READ (sum over a registered frame) is engine-aggregate dominated -> informational. Only the NUMPY-backed df() +# WRITE is binding-dominated -> gate; the arrow-backed WRITE goes through pyarrow's to_pandas -> informational. 
@pytest.fixture(scope="module") def df_numpy_numeric() -> pd.DataFrame: - """Return a numpy-backed numeric frame.""" return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}) @pytest.fixture(scope="module") def df_numpy_string() -> pd.DataFrame: - """Return a numpy-backed object-string frame.""" - # explicit object dtype -> classic numpy-backed object-string column (the reworked object/analyzer path) + # explicit object dtype -> the reworked numpy-backed object-string / analyzer path return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)}) @pytest.fixture(scope="module") def df_arrow_numeric() -> pd.DataFrame: - """Return an arrow-backed numeric frame.""" return pd.DataFrame( { "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())), @@ -64,22 +51,18 @@ def df_arrow_numeric() -> pd.DataFrame: @pytest.fixture(scope="module") def df_arrow_string() -> pd.DataFrame: - """Return an arrow-backed string frame.""" return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))}) -# --------------------------------------------------------------------------- # -# READ: pandas -> duckdb. Engine scans every value (sum/length force it). -# --------------------------------------------------------------------------- # +# READ: pandas -> duckdb. sum/length force a full scan. 
@pytest.mark.informational def test_read_pandas_numpy_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame ) -> None: - """Benchmark scanning a numpy-backed numeric frame.""" con.register("t", df_numpy_numeric) - con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) @@ -87,9 +70,8 @@ def test_read_pandas_numpy_numeric( def test_read_pandas_numpy_string( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame ) -> None: - """Benchmark scanning a numpy-backed string frame.""" con.register("t", df_numpy_string) - con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) @@ -97,9 +79,8 @@ def test_read_pandas_numpy_string( def test_read_pandas_arrow_numeric( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame ) -> None: - """Benchmark scanning an arrow-backed numeric frame.""" con.register("t", df_arrow_numeric) - con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) @@ -107,39 +88,28 @@ def test_read_pandas_arrow_numeric( def test_read_pandas_arrow_string( benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame ) -> None: - """Benchmark scanning an arrow-backed string frame.""" con.register("t", df_arrow_string) - con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm (MEAS-3) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) 
FROM t").fetchall()) -# --------------------------------------------------------------------------- # -# WRITE: duckdb -> pandas. df() is NUMPY-backed (the reworked production path); -# the arrow-backed frame goes via duckdb-arrow + pyarrow.to_pandas(ArrowDtype). -# Both eagerly materialize the whole DataFrame. -# --------------------------------------------------------------------------- # +# WRITE: duckdb -> pandas. df() is the reworked numpy-backed path; the arrow-backed frame goes via +# duckdb-arrow + pyarrow.to_pandas(ArrowDtype). Both eagerly materialize the whole frame. @pytest.mark.gate def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a numeric result to a numpy-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).df()) @pytest.mark.gate def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a string result to a numpy-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).df()) -# ADDED: the numpy-backed df() WRITE with REAL nulls -> the masked_array build + masked->pd.NA rewrite that the -# cutover reworked (a no-null column takes the cheap std::move path and would measure the wrong thing), plus a -# datetime column (TimestampConvert + ConvertDateTimeTypes). 
- - @pytest.mark.gate def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a null-heavy numeric result to a numpy-backed frame.""" + # REAL nulls -> the masked_array build + masked-to-pd.NA rewrite the cutover reworked (see README traps) q = ( "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)" @@ -149,18 +119,15 @@ def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: @pytest.mark.gate def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a timestamp result to a numpy-backed frame.""" q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)" benchmark(lambda: con.sql(q).df()) -@pytest.mark.informational # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2) +@pytest.mark.informational # to_pandas() half is pyarrow library code def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a numeric result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) -@pytest.mark.informational # to_arrow_table().to_pandas() -> the to_pandas half is pyarrow library code (MEAS-2) +@pytest.mark.informational # to_pandas() half is pyarrow library code def test_write_pandas_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark materializing a string result to an arrow-backed frame.""" benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py index eb54f91c..faa053ac 100644 --- 
a/benchmarks/test_produce_numpy_perf.py +++ b/benchmarks/test_produce_numpy_perf.py @@ -1,15 +1,6 @@ -"""CodSpeed benchmark: columnar produce paths (df(), fetchnumpy(), fetch_df_chunk()). Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_produce_numpy_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Covers the with-NULLS branch (masked_array build), datetime, and wide-internal types (hugeint/uuid/decimal128). -Gotcha: NULL benchmarks use real DuckDB nulls (CASE WHEN); a no-null column takes the cheap path and measures -the wrong thing. Full consume: df()/fetchnumpy() materialize the columns; fetch_df_chunk is drained in a loop. +"""Columnar produce: df(), fetchnumpy(), fetch_df_chunk(), per type, null vs no-null. See benchmarks/README.md. + +Covers the with-NULLS masked_array branch, datetime, and wide-internal types (hugeint/uuid/decimal128). 
""" from __future__ import annotations @@ -23,12 +14,12 @@ from _scale import scaled import duckdb -import numpy as np # noqa: F401 (pinned identically A/B; imported so the env matches the other modules) +import numpy as np # noqa: F401 (pinned identically A/B so the env matches the other modules) if TYPE_CHECKING: from pytest_codspeed import BenchmarkFixture -N = scaled(500_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +N = scaled(500_000) TYPE_N = scaled(200_000) # wide-internal types (hugeint/uuid/decimal128) are heavier per cell Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" @@ -43,8 +34,7 @@ Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)" -# gate: df()/fetchnumpy() fully materialize numpy-backed columns -> binding-dominated (ArrayWrapper fill), -# GIL-held, deterministic under Callgrind. `con` fixture + threads=1 live in conftest.py. +# gate: df()/fetchnumpy() fully materialize numpy-backed columns (ArrayWrapper fill, binding-dominated). def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: con.sql(query).df() # warm benchmark(lambda: con.sql(query).df()) @@ -55,80 +45,59 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu benchmark(lambda: con.sql(query).fetchnumpy()) -# --------------------------------------------------------------------------- # -# df(): the production NUMPY-backed columnar path. no-null vs REAL-null vs string vs timestamp. -# --------------------------------------------------------------------------- # +# df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types. 
@pytest.mark.gate def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a numeric result.""" _bench_df(benchmark, con, Q_NUM) @pytest.mark.gate def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a null-heavy numeric result.""" - # REAL nulls -> HAS_NULLS=true -> masked_array build + masked->pd.NA rewrite (the reworked branch) - _bench_df(benchmark, con, Q_NUM_NULLS) + _bench_df(benchmark, con, Q_NUM_NULLS) # REAL nulls -> masked_array branch (see README traps) @pytest.mark.gate def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a string result.""" _bench_df(benchmark, con, Q_STR) @pytest.mark.gate def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a timestamp result.""" _bench_df(benchmark, con, Q_TS) @pytest.mark.gate def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a hugeint result.""" _bench_df(benchmark, con, Q_HUGEINT) @pytest.mark.gate def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a uuid result.""" _bench_df(benchmark, con, Q_UUID) @pytest.mark.gate def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark df() of a 128-bit decimal result.""" _bench_df(benchmark, con, Q_DEC128) -# --------------------------------------------------------------------------- # -# fetchnumpy(): same FetchNumpyInternal without the DataFrame wrap. -# --------------------------------------------------------------------------- # +# fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap. 
@pytest.mark.gate def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchnumpy() of a numeric result.""" _bench_numpy(benchmark, con, Q_NUM) @pytest.mark.gate def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark fetchnumpy() of a null-heavy numeric result.""" _bench_numpy(benchmark, con, Q_NUM_NULLS) -# --------------------------------------------------------------------------- # -# fetch_df_chunk(): per-chunk DataFrame production, drained in a loop. -# --------------------------------------------------------------------------- # - - -@pytest.mark.informational # per-chunk streaming drain (GIL-per-chunk) -> walltime-informational, not gated +@pytest.mark.informational # per-chunk streaming drain (GIL-per-chunk), not gated def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark draining a result with fetch_df_chunk().""" - def run() -> int: rel = con.sql(Q_NUM) rows = 0 @@ -143,39 +112,23 @@ def run() -> int: benchmark(run) -# --------------------------------------------------------------------------- # -# torch(): FetchNumpyInternal + per-column from_numpy. SKIPPED cleanly if torch is absent (identical A/B). 
-# --------------------------------------------------------------------------- # - - -@pytest.mark.informational # torch is local-only (importorskip -> skipped in CI); torch lib work dilutes it +@pytest.mark.informational # torch is local-only (importorskip); torch lib work dilutes it def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark torch() of a numeric result (skipped if torch is absent).""" pytest.importorskip("torch") q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)" con.sql(q).torch() # warm benchmark(lambda: con.sql(q).torch()) -# --------------------------------------------------------------------------- # -# MEMORY GUARD (secondary signal, not a codspeed benchmark). codspeed walltime tracks neither memory nor -# allocations, and conversion regressions are often memory-shaped (the recorded fetchall list->tuple edge-copy; -# the df() masked_array branch). We use tracemalloc to capture the PEAK Python-tracked allocation of ONE -# df()-with-nulls call. Correctness notes: -# * reset_peak() is called AFTER the warm (and after freeing the warm result) so the warm does not establish -# a high-water mark that swallows the measured call -- the prior getrusage(ru_maxrss) version was broken -# precisely because ru_maxrss is monotonic and the warm pre-set the peak, making the delta ~0. -# * tracemalloc reports BYTES on every platform (no macOS-bytes / Linux-KiB skew that the getrusage version -# had), so the ceiling is portable to the Linux CI target. -# CAVEAT: tracemalloc only sees Python-level allocations; the raw numpy column buffers are allocated in C and -# are NOT visible here. So this catches a gross PYTHON-object-shaped blowup (the masked->pd.NA rewrite / a -# per-row object materialization regression) but is not a total-RSS gate -- the authoritative CI gate for the -# C-buffer payload is codspeed memory mode (--codspeed-mode=memory). 
-# --------------------------------------------------------------------------- # +# Memory guard (secondary signal, not a codspeed benchmark; codspeed walltime tracks neither memory nor allocs). +# tracemalloc captures the PEAK Python-tracked allocation of ONE df()-with-nulls call. reset_peak() runs AFTER +# the warm so the warm does not set a high-water mark that swallows the measured call. tracemalloc reports bytes +# on every platform (portable to Linux CI). CAVEAT: it only sees Python-level allocs, not the C numpy buffers, so +# it catches a gross Python-object blowup (masked-to-pd.NA gone wrong) but is not a total-RSS gate; that is +# codspeed memory mode's job (deferred, see PLAN.md). def test_mem_df_with_nulls() -> None: - """Guard the Python-tracked peak allocation of a null-heavy df() call.""" con = duckdb.connect(config={"threads": 1}) try: tracemalloc.start() @@ -190,7 +143,6 @@ def test_mem_df_with_nulls() -> None: finally: con.close() print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr) - # Python-tracked allocations for a 500k x 2-col masked df are a few MB; a gross conversion-memory blowup - # (e.g. a per-row Python object list, the masked->pd.NA rewrite gone wrong) is tens+ MB. 100 MB ceiling - # catches that without flaking, and is bytes on all platforms. + # a 500k x 2-col masked df is a few MB of Python-tracked allocs; a gross blowup is tens+ MB. 100 MB ceiling + # catches that without flaking. assert peak < 100_000_000 diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py index 5b386da5..bd494c2e 100644 --- a/benchmarks/test_relational_construction_perf.py +++ b/benchmarks/test_relational_construction_perf.py @@ -1,17 +1,8 @@ -"""CodSpeed benchmark: relational-API expression construction. Standalone, not in CI's binding gate. 
- -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_relational_construction_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -SCOPE: this is relational-API *construction* (ColumnExpression / ConstantExpression / operator overloads), -NOT the binding-pressure surface the rest of the suite targets. It was moved here out of test_fetch_perf.py -(MEAS-5) because it is out of scope for the binding-pressure gate. It is KEPT because it carries a real signal -(a measured ~35% expression-construction delta at the cutover), so it stays visible -- but it is marked -`informational`, so it runs and reports and is NEVER part of the gate. +"""Relational-API expression construction. Informational, out of the binding gate. See benchmarks/README.md. + +This is expression *construction* (ColumnExpression / ConstantExpression / operator overloads), not the +binding-pressure surface the rest of the suite targets. Kept because it carries a real signal (a measured ~35% +construction delta at the cutover), but never part of the gate. """ from __future__ import annotations @@ -25,13 +16,10 @@ if TYPE_CHECKING: from pytest_codspeed import BenchmarkFixture -# informational: relational-API construction, deliberately excluded from the binding-pressure gate (MEAS-5). 
pytestmark = pytest.mark.informational def test_expr_many(benchmark: BenchmarkFixture) -> None: - """Benchmark building many column/constant expressions.""" - def run() -> int: out = [] for i in range(2000): diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py index f0323fea..2d3ae95d 100644 --- a/benchmarks/test_types_roundtrip_perf.py +++ b/benchmarks/test_types_roundtrip_perf.py @@ -1,16 +1,7 @@ -"""CodSpeed benchmark: the type x direction produce matrix (fetchall / df / to_arrow per type). Standalone, not in CI. - -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_types_roundtrip_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -One logical type per column across three directions, so a regression localizes to (type, direction). Includes -the wide types the narrow-numeric benchmarks miss: hugeint, uuid, decimal128, long varchar. Note: to_arrow on a -materialized result re-runs the query with the GIL released, so the arrow column is engine-parallel and -walltime-noisy: informational, not a hard gate. +"""type x direction produce matrix: fetchall / df / to_arrow per logical type. See benchmarks/README.md. + +One logical type per column across three directions, so a regression localizes to (type, direction). Includes the +wide types the narrow-numeric benches miss: hugeint, uuid, decimal128, long varchar. """ from __future__ import annotations @@ -25,7 +16,7 @@ import duckdb -N = scaled(100_000) # env-gated: full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep (INFRA-4) +N = scaled(100_000) # one logical type per column; long-varchar is intentionally > 64 chars TYPE_EXPR = { @@ -46,34 +37,29 @@ TYPES = list(TYPE_EXPR) -# `con` fixture + threads=1 live in conftest.py. 
def _query(type_name: str) -> str: return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)" -@pytest.mark.gate # OUT-row fetchall -> binding-dominated per-type dispatch +@pytest.mark.gate # OUT-row: binding-dominated per-type dispatch @pytest.mark.parametrize("type_name", TYPES) def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: - """Benchmark fetchall of one logical type per column.""" q = _query(type_name) con.execute(q).fetchall() # warm benchmark(lambda: con.execute(q).fetchall()) -@pytest.mark.gate # OUT-col df() -> binding-dominated ArrayWrapper fill per type +@pytest.mark.gate # OUT-col: binding-dominated ArrayWrapper fill per type @pytest.mark.parametrize("type_name", TYPES) def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: - """Benchmark df() of one logical type per column.""" q = _query(type_name) con.sql(q).df() # warm benchmark(lambda: con.sql(q).df()) -@pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated +@pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel, noisy) -> not gated @pytest.mark.parametrize("type_name", TYPES) def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: - """Benchmark to_arrow_table() of one logical type per column (informational only).""" - # informational only: PromoteMaterializedToArrow re-runs the query with the GIL released (noisy) q = _query(type_name) con.sql(q).to_arrow_table() # warm benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py index a62be815..0f381ca7 100644 --- a/benchmarks/test_udf_perf.py +++ b/benchmarks/test_udf_perf.py @@ -1,14 +1,6 @@ -"""CodSpeed benchmark: Python UDF paths (native scalar + vectorized arrow). Standalone, not in CI. 
+"""Python UDFs: native scalar (one call per row) and vectorized arrow (one call per chunk). See benchmarks/README.md. -A/B: run under each build, compare (data libs pinned identically, so the delta is the binding): - cd /Users/evert/projects/duckdb-python/wt-codspeed - for P in ../main/.venv-release/bin/python .venv-release/bin/python; do \ - $P -m pytest benchmarks/test_udf_perf.py \ - --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider; \ - done - -Native scalar = one Python call per row (arg build + PyObject_CallObject + result transform); arrow = one call -per chunk. Full consume: each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row. +Each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row. """ from __future__ import annotations @@ -28,46 +20,39 @@ pa = pytest.importorskip("pyarrow") pc = pytest.importorskip("pyarrow.compute") -# env-gated (INFRA-4): full N locally, shrunk under BENCH_SCALE in the CI Callgrind sweep. NATIVE_N = scaled(200_000) # native = one Python call per row, keep moderate ARROW_N = scaled(1_000_000) # arrow = one Python call per chunk (vectorized), can be large -# `con` fixture + threads=1 live in conftest.py. def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: - con.execute(query).fetchall() # warm the engine + import caches before measuring + con.execute(query).fetchall() # warm the engine + import caches benchmark(lambda: con.execute(query).fetchall()) -# --------------------------------------------------------------------------- # -# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). -# --------------------------------------------------------------------------- # +# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). The Python +# call dominates; the sum() consume is negligible -> gate. 
-@pytest.mark.gate # native scalar UDF: one Python call per row dominates; the sum() consume is negligible +@pytest.mark.gate def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a 1-arg native int scalar UDF.""" con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)") @pytest.mark.gate def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a 2-arg native int scalar UDF.""" con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT) _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)") @pytest.mark.gate def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a 1-arg native double scalar UDF.""" con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE) _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)") @pytest.mark.gate def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a native string scalar UDF.""" con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR) _bench( benchmark, @@ -78,9 +63,8 @@ def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConn @pytest.mark.gate def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the validity short-circuit for NULL inputs to a native UDF.""" - # DEFAULT null handling: NULL inputs short-circuit (SetNull) WITHOUT calling the UDF -- this measures the - # validity short-circuit, not the Python call, so the UDF only ever sees non-NULL rows. 
+ # DEFAULT null handling short-circuits NULL inputs (SetNull) WITHOUT calling the UDF: measures the validity + # short-circuit, so the UDF only ever sees non-NULL rows. con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT) _bench( benchmark, @@ -90,30 +74,26 @@ def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBP ) -# --------------------------------------------------------------------------- # -# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. -# --------------------------------------------------------------------------- # +# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. pyarrow lib +# work + per-chunk conversion + 1M engine -> informational. -@pytest.mark.informational # vectorized arrow UDF: pyarrow.compute lib work + per-chunk conversion + 1M engine +@pytest.mark.informational def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a vectorized arrow int UDF.""" con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)") @pytest.mark.informational def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark a vectorized arrow double UDF.""" con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow") _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)") @pytest.mark.informational def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: - """Benchmark the selvec compaction for NULL inputs to a vectorized arrow UDF.""" - # DEFAULT null handling on the vectorized path: the binding compacts the validity (selvec) before the call - # and reconstructs the result vector afterwards -- 
this is the selvec compaction/reconstruction cost. + # DEFAULT null handling on the vectorized path compacts the validity (selvec) before the call and reconstructs + # the result vector after: this measures the selvec compaction/reconstruction cost. con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") _bench( benchmark, diff --git a/pyproject.toml b/pyproject.toml index 90218094..12cad096 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -281,10 +281,8 @@ test = [ # dependencies used for running tests "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'", "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'", ] -bench = [ # minimal, pinned deps for the CodSpeed benchmark suite (.github/workflows/codspeed.yml). Deliberately - # NOT the heavy `test` group (no torch/tensorflow/pyspark/adbc). Pinned via uv.lock and kept in lockstep - # with any baseline compared against, so the only cross-run delta is the binding. Constraints mirror the - # `test` group so the lockfile resolves identically. torch/tf produce paths are local-only (importorskip). +bench = [ # Pinned deps for the benchmark suite (see benchmarks/README.md). Minimal, not the heavy `test` group. + # Constraints mirror `test` so the lockfile resolves identically; torch/tf are local-only (importorskip). 
"pytest", "pytest_codspeed", "polars>=1.33.0", @@ -457,6 +455,10 @@ strict = true # No need for type hinting in tests 'ANN001', 'ANN201', 'ANN202' ] +"benchmarks/**.py" = [ + # benchmarks are test-like: docstrings optional (shared context lives in benchmarks/README.md) + 'D100', 'D101', 'D102', 'D103', 'D104', 'D105', 'D107', +] "tests/fast/spark/**.py" = [ "E402" ] diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py index 22de87b2..1ffd596c 100644 --- a/tests/fast/test_binding_pressure_leak.py +++ b/tests/fast/test_binding_pressure_leak.py @@ -1,12 +1,8 @@ -"""Sustained-iteration leak guards for the binding object-pinning paths (COV-3). +"""Sustained-iteration leak guards for the binding object-pinning paths. -Sibling of test_relation_dependency_leak.py. CodSpeed measures steady-state PER-CALL cost and structurally cannot -see a per-call refcount imbalance in the object-pinning graph (ExternalDependency / registered_objects / a UDF's -retained Python callable) until it OOMs. This is a plain assertion test (NOT a codspeed benchmark, no marker): it -runs each pinning path N times and asserts RSS and Python-object growth stay flat. - -Covers the paths the existing leak test does not: register/unregister, native + arrow UDF create/run/remove, and -executemany. (from_arrow/from_df/replacement-scan pinning is already covered by test_relation_dependency_leak.py.) +CodSpeed measures per-call cost and can't see a refcount imbalance in the object-pinning graph until it OOMs, so +this plain assertion test runs each pinning path N times and asserts RSS and object growth stay flat. Covers what +test_relation_dependency_leak.py does not: register/unregister, native + arrow UDF create/run/remove, executemany. 
""" import gc From 527755617ac4a843679b0101ae5c949442cb032f Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Thu, 2 Jul 2026 09:13:29 +0200 Subject: [PATCH 6/7] fix bugs, add baseline --- .github/workflows/codspeed.yml | 9 +- benchmarks/baseline.json | 972 ++++++++++++++++++++++++ benchmarks/test_produce_numpy_perf.py | 4 +- benchmarks/test_types_roundtrip_perf.py | 11 +- 4 files changed, 989 insertions(+), 7 deletions(-) create mode 100644 benchmarks/baseline.json diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index b80323fc..2e14ebf5 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -80,10 +80,13 @@ jobs: run: | # step 1: build deps only (needed for --no-build-isolation), no project uv sync --only-group build --no-install-project -p 3.13 - # step 2: build+install the project (release) + build group, without the heavy default `dev` group - uv sync --no-build-isolation --no-editable --reinstall --no-default-groups --group build -p 3.13 - # step 3: the frozen bench pins (exact ==), so the only cross-run delta is the binding + # step 2: the frozen bench pins (exact ==), so the only cross-run delta is the binding. MUST precede the + # build: numpy>=2.0 is a [build-system].requires (numpy C API headers), which --no-build-isolation does + # not auto-install and which is not in the `build` group, so CMake's find_package(... NumPy) fails first. uv pip install -r benchmarks/requirements-bench.txt + # step 3: build+install the project (release), no default `dev` group (torch/tensorflow/pyspark). uv pip + # install is additive; uv sync here would prune numpy back out before the build and re-break the config. + uv pip install --no-build-isolation --no-deps --reinstall -C cmake.build-type=Release . 
- name: Collect gate node-ids # the gate/informational marker split; regen uses it to classify each benchmark diff --git a/benchmarks/baseline.json b/benchmarks/baseline.json new file mode 100644 index 00000000..fe809300 --- /dev/null +++ b/benchmarks/baseline.json @@ -0,0 +1,972 @@ +{ + "meta": { + "schema_version": 1, + "generated_at_utc": "2026-07-02T06:26:46+00:00", + "git_commit": "090e02142b1bca4163c526ad75a4dcc84a5ae374", + "duckdb_submodule_sha": "d9a775e4c03b23ecb3784f879196aa81adf0ac1c", + "requirements_bench_sha256": "2bdfd6a766947a61559afb2799c54f0ea173b9325f55082ad809bf7b97b2c659", + "measurement": { + "tool": "valgrind callgrind", + "event": "Ir", + "pythonhashseed": "0" + }, + "bench_scale": "10", + "gate_default_threshold_pct": 5.0, + "binding_fraction_cutoff": 0.25, + "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above." + }, + "benchmarks": { + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[1000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 13968509, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[2]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 13117509, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[50000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 17445483, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 7507078, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_reader_numeric": { + "marker": "informational", + "source_marker": 
"informational", + "auto_moved": false, + "instructions": 8566385, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 16952462, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_reader_consumed": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29404937, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_table_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29199115, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_table_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 25884569, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_polars_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29363771, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_polars_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29278882, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[10000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 49732326, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[1000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 32634030, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_cardinality_perf.py::test_limit_df[100]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 31130130, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[20000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 68677642, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[10000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 61656223, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[1000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 32870219, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[100]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30241645, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[20000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 93837059, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[10000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 41073162, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[1000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31192384, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[100]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 
30319144, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[20000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 51996785, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_100k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3255412, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_200k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3253716, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_small": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 2855767, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_2col_500k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31312283, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_fetch_perf.py::test_fetchall_2int": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30527833, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_2int_gate": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 5144687, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_blob": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 46799205, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_decimal128": { + 
"marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 234831861, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_double": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28100940, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_int": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18885980, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_int_gate": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 3207318, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_mixed": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 298310717, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_mixed_wide": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 629847376, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_null_heavy": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18497920, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_smallint": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18158437, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_str": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 36630015, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_timestamptz": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + 
"instructions": 442013591, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchmany_batched": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 44376635, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchone_iter": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 56082286, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_executemany_3col": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 20508999651, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_values_dicts": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6300053057, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_values_scalars": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 4364660696, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_values_tuples": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 5224666337, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_numpy_perf.py::test_bind_analyzer_object": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21109327, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_dict_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5698722, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_double_with_nan": { + "marker": 
"informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 4441652, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_masked_int": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 4427922, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_object_string_mixed": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 71135312, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5978439, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 16958452, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6253482, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31577228, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31316827, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_string": { + "marker": "informational", + "source_marker": 
"informational", + "auto_moved": false, + "instructions": 27977539, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 29474196, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40398312, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_string": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 69326603, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_timestamp": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21747493, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_decimal128": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12498891, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_hugeint": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 7060301, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_numeric": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 29464799, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_df_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40357060, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_string": { + "marker": 
"gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 69304377, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_timestamp": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21738267, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_uuid": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 215063593, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_fetch_df_chunk_loop": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 43497043, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 28165468, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 35144943, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_relational_construction_perf.py::test_expr_many": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 64025731, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[bool]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3639613, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[date]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5143666, + "binding_fraction": null, + 
"threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal128]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11654375, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal64]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6088232, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[double]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 12398027, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[hugeint]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6319959, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[int64]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 2512782, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[list]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11014392, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[struct]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5119483, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[timestamp]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 10337048, + "binding_fraction": 
null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[uuid]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11291045, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_long]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 20944198, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_short]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11322686, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 3638394, + "binding_fraction": 0.1053, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6410855, + "binding_fraction": 0.4922, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal128]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12496882, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6410024, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12732237, + "binding_fraction": 0.7443, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_types_roundtrip_perf.py::test_out_col_df[hugeint]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 7054469, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 2718974, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[list]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 91324470, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[struct]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 110991217, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[timestamp]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 10647333, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[uuid]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 215166204, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_long]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40038336, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_short]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28326808, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[bool]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 17981967, 
+ "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[date]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 23701642, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal128]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 234148728, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21656881, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[double]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28070587, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[hugeint]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 159982348, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[int64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18836658, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[list]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 150499447, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[struct]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 119062526, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[timestamp]": { 
+ "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30750748, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[uuid]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 226484384, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_long]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 49637213, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_short]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 33743613, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_double": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 102838074, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_int": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 56453572, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_null_inputs": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 72729269, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_native_double_1arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40772497, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_int_1arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 35374345, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_udf_perf.py::test_udf_native_int_2arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 48207658, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_null_inputs": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 26901535, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_string": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 93255939, + "binding_fraction": null, + "threshold_pct": 5.0 + } + } +} diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py index faa053ac..f7a103da 100644 --- a/benchmarks/test_produce_numpy_perf.py +++ b/benchmarks/test_produce_numpy_perf.py @@ -48,7 +48,7 @@ def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, qu # df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types. -@pytest.mark.gate +@pytest.mark.informational def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: _bench_df(benchmark, con, Q_NUM) @@ -86,7 +86,7 @@ def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnecti # fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap. 
-@pytest.mark.gate +@pytest.mark.informational def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: _bench_numpy(benchmark, con, Q_NUM) diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py index 2d3ae95d..9cc8d6b3 100644 --- a/benchmarks/test_types_roundtrip_perf.py +++ b/benchmarks/test_types_roundtrip_perf.py @@ -36,6 +36,14 @@ } TYPES = list(TYPE_EXPR) +# OUT-col bool/int64 are engine-diluted below the Option-B cutoff (binding_fraction < 0.25, see baseline.json): the +# numpy column fill is trivial next to the engine scan, so they are informational while the other types stay gate. +# OUT-row is unaffected (fetchall builds a Python object per cell, binding-dominated for every type). +_OUT_COL_DILUTED = {"bool", "int64"} +_OUT_COL_PARAMS = [ + pytest.param(t, marks=pytest.mark.informational if t in _OUT_COL_DILUTED else pytest.mark.gate) for t in TYPES +] + def _query(type_name: str) -> str: return f"SELECT {TYPE_EXPR[type_name]} AS c FROM range({N}) t(i)" @@ -49,8 +57,7 @@ def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConne benchmark(lambda: con.execute(q).fetchall()) -@pytest.mark.gate # OUT-col: binding-dominated ArrayWrapper fill per type -@pytest.mark.parametrize("type_name", TYPES) +@pytest.mark.parametrize("type_name", _OUT_COL_PARAMS) # OUT-col: ArrayWrapper fill; gate per type except diluted ones def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None: q = _query(type_name) con.sql(q).df() # warm From 79b5e6d8763413d298d8596672501937a14337f1 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Thu, 2 Jul 2026 09:23:24 +0200 Subject: [PATCH 7/7] remove cp311 from seeds --- .github/workflows/packaging_wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml index 7a3bb74a..96da6227 100644 --- 
a/.github/workflows/packaging_wheels.yml +++ b/.github/workflows/packaging_wheels.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - python: [ cp311, cp314 ] + python: [ cp314 ] platform: - { os: windows-2022, arch: amd64, cibw_system: win } - { os: windows-11-arm, arch: ARM64, cibw_system: win }