diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml new file mode 100644 index 00000000..2e14ebf5 --- /dev/null +++ b/.github/workflows/codspeed.yml @@ -0,0 +1,129 @@ +# Instruction-count (Callgrind) perf-regression gate against a COMMITTED baseline. No CodSpeed account/token/runner: +# compare_baseline.py parses raw callgrind dumps and diffs each benchmark against benchmarks/baseline.json. Counts +# are near-deterministic with PYTHONHASHSEED pinned (~0.1% noise), so the 5% gate threshold sits far above it. +# Details + rationale: benchmarks/README.md and benchmarks/PLAN.md. +# +# Triggers: nightly schedule + manual workflow_dispatch (no pull_request/push). A dispatch on a feature branch +# compares that branch's counts vs the baseline.json committed on it, answering "did my branch regress vs main". +# +# Modes (workflow_dispatch input `regen`): +# regen=false (default) -> COMPARE + report. Report-only for now (never fails); flip to --enforce once trusted. +# regen=true -> write a fresh baseline.json + upload as an artifact to commit deliberately. Bump +# requirements-bench.txt FIRST (separate commit) if the pins should change. +# +# The concurrency module is excluded from the sweep (Callgrind serializes threads, so its signal is meaningless). +# Memory mode (a second sweep for produce peak-RSS) is deferred (see PLAN.md). 
+ +name: Benchmarks + +on: + schedule: + - cron: "0 3 * * *" # nightly at 03:00 UTC + workflow_dispatch: + inputs: + regen: + description: "Regenerate benchmarks/baseline.json (upload as artifact) instead of comparing" + type: boolean + default: false + +# one sweep per ref: a newer run on the same ref cancels the one still in flight +concurrency: + group: codspeed-${{ github.ref }} + cancel-in-progress: true + +jobs: + benchmarks: + runs-on: ubuntu-latest + timeout-minutes: 90 # ~25 min sweep at BENCH_SCALE=10 (12-core Linux) + ~10 min cold build; margin for CI + permissions: + contents: read + env: + PYTHONHASHSEED: "0" # stable instruction counts for dict/struct paths + CODSPEED_ENV: "1" # activates pytest-codspeed's instrument hooks + # shrink the O(rows) benches so the sweep fits under timeout-minutes. Local runs leave this unset -> full N. + # Recorded in baseline.json meta.bench_scale; a baseline only compares to a run at the SAME scale. + BENCH_SCALE: "10" + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive # the DuckDB engine submodule is needed to build + fetch-depth: 0 # setuptools_scm needs history for version detection + + - name: Resolve DuckDB submodule SHA + id: duckdb_sha + # used for the sccache key AND passed to compare_baseline.py for the engine-bump guard + run: echo "sha=$(git rev-parse HEAD:external/duckdb)" >> "$GITHUB_OUTPUT" + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.13" + + - name: Install valgrind + run: sudo apt-get update && sudo apt-get install -y valgrind + + - name: Cache sccache + uses: actions/cache@v4 + with: + path: ~/.cache/sccache + # exact hit only when the DuckDB submodule SHA is unchanged; otherwise restore-keys falls back to a cache + # saved under any other sccache-codspeed-* key + key: sccache-codspeed-${{ steps.duckdb_sha.outputs.sha }} + restore-keys: sccache-codspeed- + + - name: Install sccache + # pinned v0.8.2 x86_64 musl release; only the `sccache` binary is extracted from the tarball. + # NOTE(review): this writes to /usr/local/bin without sudo -- confirm the runner image permits it. + run: | + curl -fsSL https://github.com/mozilla/sccache/releases/download/v0.8.2/sccache-v0.8.2-x86_64-unknown-linux-musl.tar.gz \ + | tar -xz --strip-components=1 -C /usr/local/bin sccache-v0.8.2-x86_64-unknown-linux-musl/sccache + + - name: Build the extension (release) + pinned benchmark deps + env: 
+ CMAKE_C_COMPILER_LAUNCHER: sccache + CMAKE_CXX_COMPILER_LAUNCHER: sccache + run: | + # step 1: build deps only (needed for --no-build-isolation), no project + uv sync --only-group build --no-install-project -p 3.13 + # step 2: the frozen bench pins (exact ==), so the only cross-run delta is the binding. MUST precede the + # build: numpy>=2.0 is a [build-system].requires (numpy C API headers), which --no-build-isolation does + # not auto-install and which is not in the `build` group, so CMake's find_package(... NumPy) fails first. + uv pip install -r benchmarks/requirements-bench.txt + # step 3: build+install the project (release), no default `dev` group (torch/tensorflow/pyspark). uv pip + # install is additive; uv sync here would prune numpy back out before the build and re-break the config. + uv pip install --no-build-isolation --no-deps --reinstall -C cmake.build-type=Release . + + - name: Collect gate node-ids + # the gate/informational marker split; regen uses it to classify each benchmark. + # Block scalar on purpose: a plain multi-line `run:` value is line-folded by YAML, so the trailing backslash + # would reach the shell as an escaped space (a bogus " " path argument to pytest) and `|| true` would hide + # the resulting empty gate_list.txt. + run: | + uv run --no-sync pytest benchmarks/ -m gate --collect-only -q -o addopts= -p no:cacheprovider \ + | grep '::' > gate_list.txt || true + + - name: Run benchmarks under Callgrind (per-benchmark instruction counts) + # ONE sweep over gate+informational EXCEPT the concurrency module (thread-serialized, expensive). Each + # benchmark emits a callgrind dump keyed by its uri. + # --trace-children=yes: valgrind does not follow exec by default, and `uv run` starts pytest as a separate + # program, so without it only the `uv` launcher itself would run under Callgrind. + run: | + mkdir -p profiles + CODSPEED_PROFILE_FOLDER="$PWD/profiles" valgrind --tool=callgrind --instr-atstart=no \ + --trace-children=yes \ + --callgrind-out-file="$PWD/profiles/cg.%p.%n" \ + uv run --no-sync pytest benchmarks/ \ + --ignore=benchmarks/test_concurrency_perf.py \ + -m "gate or informational" --codspeed -o addopts= -p no:cacheprovider + + - name: Compare against committed baseline (report-only) + if: ${{ !inputs.regen }} + # report-only: prints the delta table, never fails the job. Add --enforce once trusted. 
+ run: | + uv run --no-sync python benchmarks/compare_baseline.py compare \ + --profiles profiles --baseline benchmarks/baseline.json \ + --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \ + --pins benchmarks/requirements-bench.txt + + - name: Regenerate baseline (upload artifact to commit deliberately) + if: ${{ inputs.regen }} + # regen only: rewrites benchmarks/baseline.json in the workspace from this run's dumps and gate_list.txt + run: | + uv run --no-sync python benchmarks/compare_baseline.py regen \ + --profiles profiles --out benchmarks/baseline.json --gate-list gate_list.txt \ + --git-commit "${{ github.sha }}" --submodule-sha "${{ steps.duckdb_sha.outputs.sha }}" \ + --pins benchmarks/requirements-bench.txt + + - name: Upload regenerated baseline + if: ${{ inputs.regen }} + # the job only has `contents: read`, so the new baseline leaves the runner as an artifact, not as a commit + uses: actions/upload-artifact@v4 + with: + name: baseline-update + path: benchmarks/baseline.json diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml index 7a3bb74a..96da6227 100644 --- a/.github/workflows/packaging_wheels.yml +++ b/.github/workflows/packaging_wheels.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - python: [ cp311, cp314 ] + python: [ cp314 ] platform: - { os: windows-2022, arch: amd64, cibw_system: win } - { os: windows-11-arm, arch: ARM64, cibw_system: win } diff --git a/benchmarks/PLAN.md b/benchmarks/PLAN.md new file mode 100644 index 00000000..835aef7d --- /dev/null +++ b/benchmarks/PLAN.md @@ -0,0 +1,90 @@ +# Benchmark suite plan + +Design rationale for the binding micro-benchmarks. The suite is implemented in `benchmarks/`; CI lives in +`../.github/workflows/codspeed.yml`; conventions, markers, and the two data-pattern traps are in +[README.md](README.md). + +Priority: **P0** = known-regression or cutover-reworked path (narrow-numeric common case); **P1** = high-traffic +conversion or per-element Python work; **P2** = correctness-relevant, lower-traffic or engine-dominated. 
+ +## Scenarios + +PRODUCE (duckdb to Python) is the highest regression risk: `Fetchone` builds a `TupleBuilder` per row and calls +`FromValue` per cell (O(rows x cols), the shape of the historical ~15% fetchall regression). + +- **OUT-row** (`test_fetch_perf`, `test_types_roundtrip_perf`): fetchall / fetchone / fetchmany per type. P0 + narrow numeric; P1 varchar, list, struct, and the expensive per-row types (decimal `Decimal()`, timestamptz + pytz, hugeint string round-trip, uuid). Small-N `*_gate` probes isolate the compile+fetch fixed cost. +- **OUT-col** (`test_produce_numpy_perf`): df() / fetchnumpy() reworked columnar path. P0 numeric no-null vs + REAL-null (the masked_array branch); plus string, timestamp, and wide-internal (hugeint/uuid/decimal128). +- **OUT-arrow / polars** (`test_arrow_perf`): to_arrow_table / reader / pl(). Informational (engine-parallel, + GIL-released). +- **Cardinality** (`test_cardinality_perf`): a LIMIT-n sweep giving a clean per-row conversion slope. + +INGEST (Python to duckdb): + +- **numpy / pandas** (`test_ingest_numpy_perf`, `test_pandas_perf`): numpy-backed scan (NaN-to-NULL, masked), + object-string transcode ladder, arrow-backed zero-copy, and the per-bind PandasAnalyzer. +- **arrow** (`test_arrow_perf`): Table + RecordBatchReader + dictionary sweep. +- **native** (`test_ingest_native_perf`): values() list/tuple/dict per-cell TransformPythonValue, executemany. + +UDF (`test_udf_perf`, zero coverage before this suite): native scalar per-row (P0, the biggest untested per-call +path) and vectorized arrow per-chunk. + +## Type x direction matrix + +Directions: IN-native (TransformPythonValue), IN-numpy (NumpyScan), OUT-row (FromValue), OUT-col (ArrayWrapper), +OUT-arrow. 
+ +| Type | IN-native | IN-numpy | OUT-row | OUT-col | OUT-arrow | +|------|-----------|----------|---------|---------|-----------| +| int32/int64 | P1 | **P0** | **P0** | **P0** | P1 | +| double | P1 | **P0** (NaN->NULL) | P0 | P0 | P1 | +| varchar | P1 | **P0** (PyUnicode) | P1 | P1 | P1 | +| bool | P2 | P1 | P2 | P1 | P2 | +| decimal64/128 | P2 | n/a | **P1** (Python Decimal) | P1 | P2 | +| date | P2 | P1 | P1 | P1 | P2 | +| timestamp(tz) | P1 | P1 | **P1** (pytz/row) | P1 | P1 | +| LIST/STRUCT | P2 | P2 | P1 (recursive) | P1 | P2 | +| hugeint/uuid | P2 | P2 | **P1** (round-trip) | P1 | P2 | +| blob/map | P2 | P2 | P2 | P2 | P2 | +| NULL-heavy | n/a | **P1** | P2 | **P0** (masked_array) | P1 | + +## Mechanics + +- **Walltime vs instruction-count.** Local A/B is walltime only (no Valgrind on macOS arm64). CI is + instruction-count via Callgrind invoked directly by the workflow (no CodSpeed service or runner; + near-deterministic, PYTHONHASHSEED pinned), diffed against a committed baseline. Report-only until trusted. +- **Marker split + auto-move.** Every benchmark is `gate` or `informational` (see README). At baseline regen, + each numeric-produce gate's binding fraction `= 1 - floor_Ir / bench_Ir` is computed against its engine floor + (`test_engine_control_perf`); a gate below the ~25% cutoff is auto-moved to informational (a threshold on an + engine-diluted total is not meaningful). OUT-row fetch and UDFs are ~all binding; numeric produce is a bulk + memcpy of ~engine magnitude (auto-move candidate). +- **Guards.** compare_baseline.py warns and stops enforcing when BENCH_SCALE, the pin file, or the DuckDB + submodule SHA differ from the baseline's (any of those makes the counts non-comparable). +- **Sustained-leak guard** (`tests/fast/test_binding_pressure_leak.py`): a plain RSS + object-count test for the + object-pinning paths, since a per-call refcount imbalance is invisible to a steady-state benchmark. 
+- **Memory mode** (a second Callgrind sweep for O(rows) produce peak-RSS) is designed but deferred; the + `test_mem_df_with_nulls` tracemalloc guard is the local stand-in. + +## Cross-check vs iqmo-org/bareduckdb + +Their suite is a SQL-file-driven A/B comparing two clients (production `duckdb` vs the C-API prototype), arrow-in +/ arrow-out only, no fetchall/df/numpy/native/UDF coverage. So our binding suite is far broader; their genuine +deltas concentrate in PRODUCE/types. Actionable additions they suggest: + +- **hugeint / uuid in the produce matrix** (they select both): OUT-row does a per-value string round-trip, distinct + from narrow int. Now in `test_produce_numpy_perf` / `test_fetch_perf`. +- **int128-internal decimal** (`DECIMAL(28,x)`) alongside the int64-internal one: hits a wider cast path. Added. +- **heterogeneous mixed-type row**: exercises per-cell type dispatch in the Fetchone loop, unlike homogeneous + columns. Added as `test_fetchall_mixed_wide`. +- **long varchar (>64 char)** alongside the short string: shifts string copy / transcode toward copy-bound. Added + as `varchar_long` in the matrix. +- **result-cardinality (top-N) sweep**: holds engine work ~constant while sweeping rows-to-Python. Adopted as + `test_cardinality_perf` (plain LIMIT, no ORDER BY; the sort swamped the signal). +- **peak-memory guard** on the O(rows) produce paths: a conversion regression is often memory-shaped. Partially + covered by the tracemalloc guard; full coverage waits on memory mode. + +Out of scope (theirs, not adopted): pure-engine filter/group/window workloads; 100M+ row scale (IO/engine +dominated); the free-threading category (unsupported by this client). Do NOT adopt their no-warmup single-run +methodology (charges import-cache population into the measurement). 
diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..ca8f8355 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,34 @@ +# Benchmark suite + +CodSpeed micro-benchmarks for the binding hot paths (produce, ingest, UDF). +Design rationale: [PLAN.md](PLAN.md). CI: [../.github/workflows/codspeed.yml](../.github/workflows/codspeed.yml). + +## Markers + +Every benchmark carries exactly one (registered in `conftest.py`): + +- **gate**: binding-dominated, GIL-held, deterministic under Callgrind. A threshold breach is a binding regression. +- **informational**: engine/library/streaming-diluted. Reported, never gated (would false-positive on engine bumps). + +## Local A/B (walltime) + +Only walltime runs locally (no Valgrind on macOS arm64; instruction-count gating is Linux/CI-only, and walltime is +noisy on sub-ms benches). Pin the data libs identically across both builds so the delta is pure binding: + +```bash +for P in ../main/.venv-release/bin/python .venv-release/bin/python; do + $P -m pytest benchmarks/<test_module>.py --codspeed --codspeed-mode=walltime -o addopts= -p no:cacheprovider +done +``` + +## Conventions + +- READ aggregates real columns (`sum`/`length`), never `count(*)` (answered from metadata). +- WRITE fully materializes the result or drains the lazy reader. +- Warm once before measuring. +- `con` fixture pins `threads=1` (see `conftest.py`). + +Two traps (a benchmark that skips these silently measures the wrong thing): + +- OUT-col null benches need REAL nulls (`CASE WHEN ... THEN NULL`), else the cheap `std::move` path is taken. +- IN-numpy string benches need mixed ASCII + non-ASCII + a null sentinel, else the transcode/null ladder is skipped. diff --git a/benchmarks/_scale.py b/benchmarks/_scale.py new file mode 100644 index 00000000..a4049aa9 --- /dev/null +++ b/benchmarks/_scale.py @@ -0,0 +1,35 @@ +"""Env-gated row-count scaling for the benchmark suite. 
+ +Callgrind is 20-50x slower than native execution, so the O(rows) benches at full N make the CI sweep too slow. +`scaled(n)` shrinks row counts ONLY when `BENCH_SCALE=<divisor>` is set (which the CI sweep sets); unset -> full N, +so local walltime A/B is unchanged. A gate bench and the engine floor it is compared against share a base N, so +routing BOTH through `scaled()` keeps them at an identical scaled N and the binding fraction stays valid. Scaling +reduces row counts only; it must never change the data patterns the benches depend on (real nulls, mixed ASCII, +LIMIT-no-ORDER-BY). A floor keeps a scaled bench row-dominated so per-element work still dominates; the small-N +`*_gate` probes are already fast and are NOT scaled. +""" + +from __future__ import annotations + +import os + +FLOOR = 20_000 # a scaled bench with n >= FLOOR never drops below this (stays row-dominated, ~10x the range(2048) probes) + + +def bench_scale() -> int: + """Return the divisor from `BENCH_SCALE` (>=1); 1 (no scaling) if unset/invalid. + + Unset, empty, and non-integer values all yield 1; integer values below 1 are clamped up to 1. + """ + v = os.environ.get("BENCH_SCALE") + if not v: + return 1 + try: + return max(int(v), 1) + except ValueError: + return 1 + + +def scaled(n: int) -> int: + """Return `n` at full scale, or `max(n // BENCH_SCALE, min(n, FLOOR))` when scaling is enabled.""" + d = bench_scale() + if d <= 1: + return n + # for non-negative n: n <= FLOOR comes back unchanged (min(n, FLOOR) == n >= n // d); larger n is clamped at FLOOR + return max(n // d, min(n, FLOOR)) diff --git a/benchmarks/baseline.json b/benchmarks/baseline.json new file mode 100644 index 00000000..fe809300 --- /dev/null +++ b/benchmarks/baseline.json @@ -0,0 +1,972 @@ +{ + "meta": { + "schema_version": 1, + "generated_at_utc": "2026-07-02T06:26:46+00:00", + "git_commit": "090e02142b1bca4163c526ad75a4dcc84a5ae374", + "duckdb_submodule_sha": "d9a775e4c03b23ecb3784f879196aa81adf0ac1c", + "requirements_bench_sha256": "2bdfd6a766947a61559afb2799c54f0ea173b9325f55082ad809bf7b97b2c659", + "measurement": { + "tool": "valgrind callgrind", + "event": "Ir", + "pythonhashseed": "0" + }, + "bench_scale": "10", + "gate_default_threshold_pct": 5.0, + "binding_fraction_cutoff": 
0.25, + "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above." + }, + "benchmarks": { + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[1000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 13968509, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[2]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 13117509, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_dictionary[50000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 17445483, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 7507078, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_reader_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 8566385, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_read_arrow_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 16952462, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_reader_consumed": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29404937, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_table_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29199115, + 
"binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_arrow_table_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 25884569, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_polars_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29363771, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_arrow_perf.py::test_write_polars_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 29278882, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[10000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 49732326, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[1000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 32634030, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[100]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 31130130, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_df[20000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 68677642, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[10000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 61656223, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[1000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": 
false, + "instructions": 32870219, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[100]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30241645, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_fetchall[20000]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 93837059, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[10000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 41073162, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[1000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31192384, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[100]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 30319144, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_cardinality_perf.py::test_limit_to_arrow[20000]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 51996785, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_100k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3255412, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_200k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3253716, + "binding_fraction": null, + "threshold_pct": null + 
}, + "benchmarks/test_engine_control_perf.py::test_engine_sum_1col_small": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 2855767, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_engine_control_perf.py::test_engine_sum_2col_500k": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31312283, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_fetch_perf.py::test_fetchall_2int": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30527833, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_2int_gate": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 5144687, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_blob": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 46799205, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_decimal128": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 234831861, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_double": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28100940, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_int": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18885980, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_int_gate": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 3207318, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_fetch_perf.py::test_fetchall_mixed": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 298310717, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_mixed_wide": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 629847376, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_null_heavy": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18497920, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_smallint": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18158437, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_str": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 36630015, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchall_timestamptz": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 442013591, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchmany_batched": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 44376635, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_fetch_perf.py::test_fetchone_iter": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 56082286, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_executemany_3col": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 20508999651, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_ingest_native_perf.py::test_ingest_values_dicts": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6300053057, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_values_scalars": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 4364660696, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_native_perf.py::test_ingest_values_tuples": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 5224666337, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_numpy_perf.py::test_bind_analyzer_object": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21109327, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_dict_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5698722, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_double_with_nan": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 4441652, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_masked_int": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 4427922, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_ingest_numpy_perf.py::test_read_numpy_object_string_mixed": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 71135312, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_numeric": { + "marker": "informational", 
+ "source_marker": "informational", + "auto_moved": false, + "instructions": 5978439, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_arrow_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 16958452, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6253482, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_read_pandas_numpy_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31577228, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_numeric": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 31316827, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_arrow_string": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 27977539, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 29474196, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40398312, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_string": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 69326603, + 
"binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_pandas_perf.py::test_write_pandas_numpy_timestamp": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21747493, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_decimal128": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12498891, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_hugeint": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 7060301, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_numeric": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 29464799, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_df_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40357060, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_string": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 69304377, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_timestamp": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21738267, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_df_uuid": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 215063593, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_produce_numpy_perf.py::test_fetch_df_chunk_loop": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 43497043, 
+ "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 28165468, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric_with_nulls": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 35144943, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_relational_construction_perf.py::test_expr_many": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 64025731, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[bool]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 3639613, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[date]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5143666, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal128]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11654375, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[decimal64]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6088232, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[double]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 12398027, + "binding_fraction": null, + 
"threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[hugeint]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 6319959, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[int64]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 2512782, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[list]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11014392, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[struct]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 5119483, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[timestamp]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 10337048, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[uuid]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11291045, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_long]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 20944198, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_arrow_table[varchar_short]": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 11322686, + 
"binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 3638394, + "binding_fraction": 0.1053, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6410855, + "binding_fraction": 0.4922, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal128]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12496882, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[decimal64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 6410024, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 12732237, + "binding_fraction": 0.7443, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[hugeint]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 7054469, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": { + "marker": "informational", + "source_marker": "gate", + "auto_moved": true, + "instructions": 2718974, + "binding_fraction": 0.0, + "threshold_pct": null + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[list]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 91324470, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[struct]": { + "marker": "gate", + "source_marker": 
"gate", + "auto_moved": false, + "instructions": 110991217, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[timestamp]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 10647333, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[uuid]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 215166204, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_long]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40038336, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[varchar_short]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28326808, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[bool]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 17981967, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[date]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 23701642, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal128]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 234148728, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[decimal64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 21656881, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + 
"benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[double]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 28070587, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[hugeint]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 159982348, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[int64]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 18836658, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[list]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 150499447, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[struct]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 119062526, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[timestamp]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 30750748, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[uuid]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 226484384, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_long]": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 49637213, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_types_roundtrip_perf.py::test_out_row_fetchall[varchar_short]": { + "marker": "gate", + "source_marker": "gate", + 
"auto_moved": false, + "instructions": 33743613, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_double": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 102838074, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_int": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 56453572, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_arrow_null_inputs": { + "marker": "informational", + "source_marker": "informational", + "auto_moved": false, + "instructions": 72729269, + "binding_fraction": null, + "threshold_pct": null + }, + "benchmarks/test_udf_perf.py::test_udf_native_double_1arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 40772497, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_int_1arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 35374345, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_int_2arg": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 48207658, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_null_inputs": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 26901535, + "binding_fraction": null, + "threshold_pct": 5.0 + }, + "benchmarks/test_udf_perf.py::test_udf_native_string": { + "marker": "gate", + "source_marker": "gate", + "auto_moved": false, + "instructions": 93255939, + "binding_fraction": null, + "threshold_pct": 5.0 + } + } +} diff --git a/benchmarks/compare_baseline.py b/benchmarks/compare_baseline.py new file mode 100644 index 
00000000..ab9773f9 --- /dev/null +++ b/benchmarks/compare_baseline.py @@ -0,0 +1,322 @@ +#!/usr/bin/env python3 +"""Committed-baseline instruction-count comparison for the benchmark suite. See benchmarks/README.md. + +pytest-codspeed's hooks call `callgrind_dump_stats_at()` per benchmark, so callgrind writes ONE dump each, +headed by `desc: Trigger: Client Request: ` with the count on `totals:` (`events: Ir`). This parses those +raw dumps directly (no CodSpeed account/token/runner). Run-to-run noise is ~0.1%, so the 5% gate threshold sits +far above it (PYTHONHASHSEED pinned in CI). + +Two modes (CI-only; no valgrind on macOS arm64): + regen: write baseline.json from a fresh run: counts + provenance + binding fractions + auto-move. + compare: diff a fresh run against baseline.json. Gate benches over threshold are regressions; informational + are reported only. Report-only by default; `--enforce` exits non-zero on a gate regression. + +baseline.json and benchmarks/requirements-bench.txt are regenerated together so counts match the frozen pins. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import re +import sys +from datetime import datetime, timezone +from pathlib import Path + +SCHEMA_VERSION = 1 +GATE_DEFAULT_THRESHOLD_PCT = 5.0 +BINDING_FRACTION_CUTOFF = 0.25 # a gate whose isolable binding fraction is below this is auto-moved to +# informational (a threshold on its engine-diluted total is not meaningful). + +# Floor map: the engine-control bench that is the "engine floor" of a numeric-produce gate. +# binding_fraction = 1 - floor_Ir / bench_Ir. ONLY numeric-produce benches are listed (their per-element binding +# is a bulk memcpy of ~engine magnitude); every other gate is high-binding and needs no fraction. Add a mapping +# (and, if needed, a floor) to evaluate more benches. 
+_E = "benchmarks/test_engine_control_perf.py" +FLOOR_MAP = { + "benchmarks/test_produce_numpy_perf.py::test_df_numeric": f"{_E}::test_engine_sum_2col_500k", + "benchmarks/test_produce_numpy_perf.py::test_fetchnumpy_numeric": f"{_E}::test_engine_sum_2col_500k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[int64]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[double]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[bool]": f"{_E}::test_engine_sum_1col_100k", + "benchmarks/test_types_roundtrip_perf.py::test_out_col_df[date]": f"{_E}::test_engine_sum_1col_100k", +} + +_TRIGGER_RE = re.compile(r"^desc:\s*Trigger:\s*Client Request:\s*(?P.+?)\s*$") +_TOTALS_RE = re.compile(r"^totals:\s*(?P\d+)\s*$") + + +# --------------------------------------------------------------------------- # +# callgrind parsing +# --------------------------------------------------------------------------- # + + +def _normalize_uri(raw: str) -> str: + """Return a repo-relative benchmark key (strip a leading absolute path if the run was outside a git repo).""" + raw = raw.strip() + if "::" not in raw: + return raw + path, _, rest = raw.partition("::") + idx = path.find("benchmarks/") + if idx > 0: + path = path[idx:] + return f"{path}::{rest}" + + +def parse_profiles(profile_dir: Path) -> dict[str, int]: + """Parse every callgrind dump in `profile_dir`; return {benchmark_uri: instruction_count}. + + Keeps only dumps whose Trigger is a benchmark Client Request (contains `::`); skips metadata/termination + dumps. If a uri appears more than once (should not happen) the max is kept. 
+ """ + counts: dict[str, int] = {} + files = sorted(profile_dir.rglob("*")) if profile_dir.exists() else [] + for f in files: + if not f.is_file(): + continue + uri: str | None = None + ir: int | None = None + try: + text = f.read_text(errors="replace") + except (OSError, UnicodeError): + continue + for line in text.splitlines(): + m = _TRIGGER_RE.match(line) + if m: + uri = _normalize_uri(m.group("uri")) + continue + m = _TOTALS_RE.match(line) + if m: + ir = int(m.group("ir")) + if uri and "::" in uri and ir is not None: + counts[uri] = max(counts.get(uri, 0), ir) + return counts + + +# --------------------------------------------------------------------------- # +# helpers +# --------------------------------------------------------------------------- # + + +def _sha256(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() if path.exists() else "" + + +def _load_gate_set(gate_list: Path | None) -> set[str]: + """Load the set of gate benchmark uris from a `pytest -m gate --collect-only -q` node-id list.""" + if not gate_list or not gate_list.exists(): + return set() + out = set() + for raw in gate_list.read_text().splitlines(): + line = raw.strip() + if "::" in line: # a pytest node-id (the workflow pre-filters the collect-only output to '::' lines) + out.add(_normalize_uri(line)) + return out + + +def _pct(base: int, new: int) -> float: + return 0.0 if base == 0 else (new - base) / base * 100.0 + + +# --------------------------------------------------------------------------- # +# regen +# --------------------------------------------------------------------------- # + + +def regen(args: argparse.Namespace) -> int: + """Write baseline.json from a valgrind run: counts + provenance + Option-B binding fractions/auto-move.""" + counts = parse_profiles(Path(args.profiles)) + if not counts: + print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr) + return 2 + gate_set = _load_gate_set(Path(args.gate_list) if args.gate_list 
else None) + + benches: dict[str, dict] = {} + auto_moved: list[str] = [] + for uri, ir in sorted(counts.items()): + source_marker = "gate" if uri in gate_set else "informational" + marker = source_marker + binding_fraction = None + floor_uri = FLOOR_MAP.get(uri) + if source_marker == "gate" and floor_uri and floor_uri in counts and ir > 0: + binding_fraction = round(max(0.0, 1.0 - counts[floor_uri] / ir), 4) + if binding_fraction < args.cutoff: + marker = "informational" # Option-B auto-move: engine-diluted, threshold not meaningful + auto_moved.append(uri) + benches[uri] = { + "marker": marker, + "source_marker": source_marker, + "auto_moved": marker != source_marker, + "instructions": ir, + "binding_fraction": binding_fraction, + "threshold_pct": GATE_DEFAULT_THRESHOLD_PCT if marker == "gate" else None, + } + + baseline = { + "meta": { + "schema_version": SCHEMA_VERSION, + "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "git_commit": args.git_commit, + "duckdb_submodule_sha": args.submodule_sha, + "requirements_bench_sha256": _sha256(Path(args.pins)) if args.pins else "", + "measurement": {"tool": "valgrind callgrind", "event": "Ir", "pythonhashseed": "0"}, + "bench_scale": os.environ.get("BENCH_SCALE", ""), # counts are only comparable at the same scale + "gate_default_threshold_pct": GATE_DEFAULT_THRESHOLD_PCT, + "binding_fraction_cutoff": args.cutoff, + "noise_note": "callgrind Ir observed ~0.1% run-to-run; gate threshold set well above.", + }, + "benchmarks": benches, + } + Path(args.out).write_text(json.dumps(baseline, indent=2) + "\n") + n_gate = sum(1 for b in benches.values() if b["marker"] == "gate") + n_info = len(benches) - n_gate + print(f"Wrote {args.out}: {len(benches)} benchmarks ({n_gate} gate, {n_info} informational).") + if auto_moved: + print(f"Option-B auto-moved {len(auto_moved)} engine-diluted gate(s) to informational:") + for uri in auto_moved: + print(f" {uri} 
def compare(args: argparse.Namespace) -> int:
    """Diff a fresh valgrind run against baseline.json and print a report (report-only unless --enforce).

    Reads `args.profiles`, `args.baseline`, `args.pins`, `args.submodule_sha` and `args.enforce`.

    Returns:
        2 when no benchmark dumps are found under `args.profiles`;
        0 when there is no baseline file yet (bootstrap), when not enforcing, when the DuckDB submodule SHA
        differs from the baseline's (deltas are not enforced), or when no gate benchmark regressed;
        1 when enforcing and at least one gate benchmark exceeded its threshold.
    """
    new_counts = parse_profiles(Path(args.profiles))
    if not new_counts:
        print(f"ERROR: no benchmark dumps found under {args.profiles}", file=sys.stderr)
        return 2
    baseline_path = Path(args.baseline)
    if not baseline_path.exists():
        # Bootstrap state: no committed baseline yet. Report the run and instruct to regenerate; never fail.
        print(f"No baseline at {baseline_path} yet -- run the workflow with regen=true to create it.")
        print(f"This run produced {len(new_counts)} benchmark instruction counts.")
        return 0
    baseline = json.loads(baseline_path.read_text())
    meta = baseline.get("meta", {})
    base_benches = baseline.get("benchmarks", {})

    # scale guard: a baseline built at BENCH_SCALE=X is only comparable to a run at the same scale.
    run_scale = os.environ.get("BENCH_SCALE", "")
    base_scale = meta.get("bench_scale", "")
    if run_scale != base_scale:
        print(
            f"WARNING: BENCH_SCALE differs (run={run_scale!r}, baseline={base_scale!r}) -> instruction counts are "
            "not comparable. Regenerate the baseline at this scale."
        )

    # pin-drift guard: the baseline's counts only compare cleanly against the pinned data libs it was built with.
    if args.pins:
        cur = _sha256(Path(args.pins))
        base_pins = meta.get("requirements_bench_sha256", "")
        if cur and base_pins and cur != base_pins:
            print(
                "WARNING: benchmarks/requirements-bench.txt differs from the baseline's pins -> data-lib deltas "
                "may not be pure binding. Regenerate the baseline with the current pins."
            )

    # engine-bump guard: engine-inclusive counts shift when the DuckDB submodule changes. If the SHA differs from
    # the baseline's, don't treat gate deltas as hard failures (they may reflect the bump); warn to regenerate.
    engine_changed = bool(
        args.submodule_sha and meta.get("duckdb_submodule_sha") and args.submodule_sha != meta["duckdb_submodule_sha"]
    )

    regressions: list[str] = []
    rows: list[tuple[str, str, str]] = []  # (status, uri, detail)
    for uri, ir in sorted(new_counts.items()):
        b = base_benches.get(uri)
        if b is None:
            rows.append(("NEW", uri, f"{ir} Ir (no baseline)"))
            continue
        base_ir = b["instructions"]
        delta = _pct(base_ir, ir)
        marker = b.get("marker", "informational")
        gated = marker == "gate"
        thr = b.get("threshold_pct")
        if thr is None:
            # Only a missing/null threshold falls back to the default. The previous `or` fallback also
            # replaced an explicit 0.0 (falsy) threshold with the default.
            thr = GATE_DEFAULT_THRESHOLD_PCT
        # Informational entries carry no threshold in the baseline, so don't print one for them.
        thr_note = f"thr {thr:.1f}%" if gated else "no thr"
        detail = f"{base_ir} -> {ir} Ir ({delta:+.2f}%, {thr_note}, {marker})"
        if gated and delta > thr:
            if engine_changed:
                rows.append(("ENGINE?", uri, detail + " [submodule changed -> not enforced]"))
            else:
                rows.append(("REGRESSION", uri, detail))
                regressions.append(uri)
        else:
            rows.append(("ok" if gated else "info", uri, detail))
    # Baseline entries with no dump in this run are surfaced rather than silently dropped.
    rows.extend(
        ("MISSING", uri, "in baseline, absent from run (rename/removal?)")
        for uri in sorted(set(base_benches) - set(new_counts))
    )

    _print_report(meta, rows, engine_changed=engine_changed, enforce=args.enforce)

    if not args.enforce:
        return 0
    if engine_changed:
        print("\nNOT ENFORCING: DuckDB submodule differs from the baseline; regenerate the baseline.")
        return 0
    return 1 if regressions else 0
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the chosen subcommand (`regen` or `compare`) and return its exit status."""
    parser = argparse.ArgumentParser(description=__doc__)
    subcommands = parser.add_subparsers(dest="cmd", required=True)

    # regen: build a baseline from a fresh callgrind run
    regen_cmd = subcommands.add_parser("regen", help="write baseline.json from a valgrind run")
    regen_cmd.set_defaults(func=regen)
    regen_cmd.add_argument("--profiles", required=True, help="CODSPEED_PROFILE_FOLDER with callgrind dumps")
    regen_cmd.add_argument("--out", default="benchmarks/baseline.json")
    regen_cmd.add_argument("--gate-list", help="file of gate node-ids (pytest -m gate --collect-only -q)")
    regen_cmd.add_argument("--git-commit", default="")
    regen_cmd.add_argument("--submodule-sha", default="")
    regen_cmd.add_argument("--pins", default="benchmarks/requirements-bench.txt")
    regen_cmd.add_argument("--cutoff", type=float, default=BINDING_FRACTION_CUTOFF)

    # compare: diff a fresh run against the committed baseline
    compare_cmd = subcommands.add_parser("compare", help="compare a valgrind run against baseline.json")
    compare_cmd.set_defaults(func=compare)
    compare_cmd.add_argument("--profiles", required=True)
    compare_cmd.add_argument("--baseline", default="benchmarks/baseline.json")
    compare_cmd.add_argument("--submodule-sha", default="")
    compare_cmd.add_argument(
        "--pins",
        default="benchmarks/requirements-bench.txt",
        help="warn if pins differ from the baseline's",
    )
    compare_cmd.add_argument(
        "--enforce",
        action="store_true",
        help="exit non-zero on a gate regression (default: report-only)",
    )

    parsed = parser.parse_args(argv)
    handler = parsed.func  # bound by set_defaults on the selected subparser
    return handler(parsed)


if __name__ == "__main__":
    raise SystemExit(main())
+ """ + c = duckdb.connect(config={"threads": 1}) + yield c + c.close() diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt new file mode 100644 index 00000000..8a9f49e6 --- /dev/null +++ b/benchmarks/requirements-bench.txt @@ -0,0 +1,23 @@ +# Frozen pins for the benchmark suite: freezing the data libs means the only cross-run delta is the binding. +# Regenerate DELIBERATELY, together with the baseline. Source of truth: the `[dependency-groups] bench` list in +# pyproject.toml (torch/tensorflow deliberately absent, local-only via importorskip). Regenerate with: +# uv pip compile pyproject.toml --group bench \ +# --python-version 3.13 --python-platform x86_64-unknown-linux-gnu \ +# --no-annotate --no-header -o benchmarks/requirements-bench.txt +iniconfig==2.3.0 +markdown-it-py==4.2.0 +mdurl==0.1.2 +numpy==2.5.0 +packaging==26.2 +pandas==3.0.3 +pluggy==1.6.0 +polars==1.42.1 +polars-runtime-32==1.42.1 +pyarrow==24.0.0 +pygments==2.20.0 +pytest==9.1.1 +pytest-codspeed==5.0.3 +python-dateutil==2.9.0.post0 +pytz==2026.2 +rich==15.0.0 +six==1.17.0 diff --git a/benchmarks/test_arrow_perf.py b/benchmarks/test_arrow_perf.py new file mode 100644 index 00000000..de05f78e --- /dev/null +++ b/benchmarks/test_arrow_perf.py @@ -0,0 +1,134 @@ +"""Arrow read/write: Table + RecordBatchReader + dictionary sweep. See benchmarks/README.md. + +READ aggregates over real columns (arrow answers count(*) from metadata); WRITE drains the lazy reader. 
from __future__ import annotations

from typing import TYPE_CHECKING

import pyarrow as pa
import pytest
from _scale import scaled

import numpy as np

if TYPE_CHECKING:
    from pytest_codspeed import BenchmarkFixture

    import duckdb

# Row count shared by every fixture and WRITE query in this module (passed through `scaled`).
N = scaled(500_000)
DICT_UNIQUE = [2, 1_000, 50_000]  # UNIQUE-value counts (cardinality sweep), not row counts -> NOT scaled
# WRITE-side source queries: the same two numeric columns / one string column the READ fixtures build in Python.
WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)"
WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)"

# informational: every bench here is engine-parallel or library/streaming dominated. READ = engine aggregate
# dominates; WRITE (to_arrow/pl) re-runs the query GIL-released. Would trip on engine/submodule bumps, not binding.
pytestmark = pytest.mark.informational


@pytest.fixture(scope="module")
def arrow_numeric() -> pa.Table:
    """Module-scoped N-row table: int64 column `a` = 0..N-1 and float64 column `b` = a * 1.5."""
    return pa.table(
        {
            "a": pa.array(range(N), type=pa.int64()),
            "b": pa.array([i * 1.5 for i in range(N)], type=pa.float64()),
        }
    )


@pytest.fixture(scope="module")
def arrow_string() -> pa.Table:
    """Module-scoped N-row table with one string column `s` holding `str_value_<i>`."""
    return pa.table({"s": pa.array([f"str_value_{i}" for i in range(N)], type=pa.string())})


@pytest.fixture(scope="module")
def arrow_numeric_batches(arrow_numeric: pa.Table) -> tuple[pa.Schema, list[pa.RecordBatch]]:
    """Return the numeric table's schema plus its record batches, split at a max chunk size of 50_000 rows."""
    # RecordBatches are immutable/re-readable, so a fresh reader can be built from them every round
    return arrow_numeric.schema, arrow_numeric.to_batches(max_chunksize=50_000)


@pytest.fixture(scope="module")
def arrow_dict_tables() -> dict[int, pa.Table]:
    """Build one dictionary-encoded single-column table (`c`) per entry of DICT_UNIQUE, keyed by that count."""
    # deterministic indices (i % U) so the instruction count is reproducible (no PRNG)
    tables = {}
    for u in DICT_UNIQUE:
        uniques = pa.array([f"category_value_{i}" for i in range(u)], type=pa.string())
        idx = pa.array(np.arange(N, dtype="int32") % u, type=pa.int32())
        tables[u] = pa.table({"c": pa.DictionaryArray.from_arrays(idx, uniques)})
    return tables


# READ: arrow -> duckdb. sum/length force a full scan.
def test_read_arrow_numeric(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_numeric: pa.Table
) -> None:
    """Benchmark summing both columns of the numeric Arrow table registered as `t_num`."""
    con.register("t_num", arrow_numeric)
    # one un-measured run of the exact query before handing it to `benchmark`
    con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall()  # warm
    benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t_num").fetchall())


def test_read_arrow_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_string: pa.Table) -> None:
    """Benchmark count + total string length over the string Arrow table registered as `t_str`."""
    con.register("t_str", arrow_string)
    con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall()  # warm
    benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t_str").fetchall())


def test_read_arrow_reader_numeric(
    benchmark: BenchmarkFixture,
    con: duckdb.DuckDBPyConnection,
    arrow_numeric_batches: tuple[pa.Schema, list[pa.RecordBatch]],
) -> None:
    """Benchmark the numeric aggregate over a RecordBatchReader rebuilt and re-registered on every round."""
    # same factory as the Table read, but STREAMING: a fresh reader per round, drained by the engine
    schema, batches = arrow_numeric_batches

    def run() -> list:
        # building the reader and registering it are inside the measured callable, not just the query
        reader = pa.RecordBatchReader.from_batches(schema, iter(batches))
        con.register("t_rdr", reader)
        return con.execute("SELECT sum(a), sum(b) FROM t_rdr").fetchall()

    run()  # warm
    benchmark(run)


@pytest.mark.parametrize("unique", DICT_UNIQUE)
def test_read_arrow_dictionary(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, arrow_dict_tables: dict[int, pa.Table], unique: int
) -> None:
    """Benchmark count + total length over the dictionary-encoded table for one `unique` value count."""
    # per-value dictionary DECODE cost slopes with the unique count (mirrors core test_arrow_dictionaries_scan)
    con.register("t_dict", arrow_dict_tables[unique])
    con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall()  # warm
    benchmark(lambda: con.execute("SELECT count(c), sum(length(c)) FROM t_dict").fetchall())


# WRITE: duckdb -> arrow, consumer fully materializes / drains the stream.
+ + +def test_write_arrow_table_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table()) + + +def test_write_arrow_table_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table()) + + +def test_write_arrow_reader_consumed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + def run() -> int: + reader = con.sql(WRITE_Q_NUM).to_arrow_reader(100_000) + rows = 0 + for batch in reader: # drain the lazy stream so duckdb produces every batch + rows += batch.num_rows + return rows + + benchmark(run) + + +def test_write_polars_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_NUM).pl()) + + +def test_write_polars_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_STR).pl()) diff --git a/benchmarks/test_cardinality_perf.py b/benchmarks/test_cardinality_perf.py new file mode 100644 index 00000000..751c6cf5 --- /dev/null +++ b/benchmarks/test_cardinality_perf.py @@ -0,0 +1,64 @@ +"""Result-cardinality (rows-to-Python) sweep via LIMIT n, no ORDER BY. See benchmarks/README.md. + +`SELECT * FROM src LIMIT n` early-stops the scan, so per-row conversion dominates and the slope is monotone in n. +A steeper slope on one build is a per-row conversion regression. n=100 is overhead, n=100_000 is throughput. +(An ORDER BY version was dropped: the top-N sort swamped the signal.) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +import duckdb + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pytest_codspeed import BenchmarkFixture + +# scale the source rows AND the top-N by the same factor, keeping small-N points fixed and SRC_ROWS >= max(LIMITS). 
+SRC_ROWS = scaled(200_000) +LIMITS = [100, 1_000, 10_000, scaled(100_000)] + + +@pytest.fixture(scope="module") +def con() -> Iterator[duckdb.DuckDBPyConnection]: + # source materialized ONCE (module-scoped) and identical across the n sweep; per-test build would add noise + c = duckdb.connect(config={"threads": 1}) + c.execute( + "CREATE TABLE src AS " + f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b, ('s_' || i) AS s FROM range({SRC_ROWS}) t(i)" + ) + yield c + c.close() + + +def _query(n: int) -> str: + return f"SELECT a, b, s FROM src LIMIT {n}" + + +@pytest.mark.gate # fetchall materializes n rows -> binding-dominated; small-n end is the noise-free gate +@pytest.mark.parametrize("n", LIMITS) +def test_limit_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + q = _query(n) + con.execute(q).fetchall() # warm + benchmark(lambda: con.execute(q).fetchall()) + + +@pytest.mark.gate # df() materializes n rows to numpy columns -> binding-dominated +@pytest.mark.parametrize("n", LIMITS) +def test_limit_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + q = _query(n) + con.sql(q).df() # warm + benchmark(lambda: con.sql(q).df()) + + +@pytest.mark.informational # to_arrow_table re-runs the query GIL-released (engine-parallel) -> not gated +@pytest.mark.parametrize("n", LIMITS) +def test_limit_to_arrow(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, n: int) -> None: + q = _query(n) + con.sql(q).to_arrow_table() # warm + benchmark(lambda: con.sql(q).to_arrow_table()) diff --git a/benchmarks/test_concurrency_perf.py b/benchmarks/test_concurrency_perf.py new file mode 100644 index 00000000..c55b0274 --- /dev/null +++ b/benchmarks/test_concurrency_perf.py @@ -0,0 +1,111 @@ +"""Concurrency / GIL pressure across thread counts. Walltime-only, never gated. See benchmarks/README.md. 
+ +The ONE dimension the single-threaded rest of the suite cannot see: Python objects threading through PARALLEL +core execution. Primary signal is LOCAL WALLTIME: + * scan benches -> parallel speedup; a per-batch Produce GIL regression shows as reduced speedup. + * native UDF -> ~flat scaling = the GIL tax on per-row Python calls. + * arrow UDF -> observed NEGATIVE scaling (per-chunk convert + GIL contention). + +Under CI Callgrind threads are serialized, so wall-clock contention is invisible there; the deterministic count +still captures per-batch Produce GIL calls + UDF dispatch. Never gated either way. + +GOTCHA: a SINGLE-BATCH arrow table does NOT parallelize (one batch = one serial scan unit). The arrow scan bench +MUST use a MULTI-BATCH table AND a CPU-heavy aggregate (a cheap sum is bandwidth-bound and won't parallelize). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import duckdb +from duckdb.sqltypes import BIGINT + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + +pa = pytest.importorskip("pyarrow") +pc = pytest.importorskip("pyarrow.compute") +import numpy as np # noqa: E402 (after importorskip, matching the suite convention) +import pandas as pd # noqa: E402 + +pytestmark = pytest.mark.informational + +N_SCAN = 1_000_000 +BATCH = 20_000 # -> 50 record batches; MULTI-BATCH required for the arrow scan to parallelize (see GOTCHA) +N_UDF_NATIVE = 200_000 # native UDF = one Python call per row; keep modest (Callgrind instruments every call) +N_UDF_ARROW = 1_000_000 # arrow UDF = one call per chunk (vectorized) +THREADS = [1, 4, 8] + +# CPU-heavy aggregate so the parallel scan engages worker threads. The binding signal is the per-batch Produce +# GIL handoff. 
HEAVY = "sin(a) * cos(b) + sqrt(abs(a)) + ln(abs(a) + 1)"


@pytest.fixture(scope="module")
def arrow_multibatch() -> pa.Table:
    """Two-column Arrow table (``a`` int64, ``b`` float64) reassembled from BATCH-row record batches."""
    ints = pa.array(np.arange(N_SCAN), type=pa.int64())
    floats = pa.array(np.arange(N_SCAN, dtype="float64") * 1.5, type=pa.float64())
    contiguous = pa.table({"a": ints, "b": floats})
    # re-chunk to BATCH rows per batch: the module docstring's GOTCHA requires a MULTI-BATCH table
    return pa.Table.from_batches(contiguous.to_batches(max_chunksize=BATCH))


@pytest.fixture(scope="module")
def pandas_frame() -> pd.DataFrame:
    """pandas frame with the same ``a`` / ``b`` value pattern over N_SCAN rows."""
    columns = {"a": np.arange(N_SCAN), "b": np.arange(N_SCAN, dtype="float64") * 1.5}
    return pd.DataFrame(columns)


# Parallel SCAN: arrow batches / pandas chunks pulled through the binding by engine worker threads; the scan
# Produce acquires/releases the GIL per batch across threads.


def _bench_heavy_scan(benchmark: BenchmarkFixture, source: pa.Table | pd.DataFrame, threads: int) -> None:
    """Register ``source`` as ``t`` on a fresh ``threads``-thread connection and benchmark ``sum(HEAVY)``."""
    query = f"SELECT sum({HEAVY}) FROM t"
    con = duckdb.connect(config={"threads": threads})
    try:
        con.register("t", source)

        def scan() -> list:
            return con.execute(query).fetchall()

        scan()  # warm
        benchmark(scan)
    finally:
        # the connection is per-test, so it is closed even when the benchmark raises
        con.close()


@pytest.mark.parametrize("threads", THREADS)
def test_scan_arrow_parallel(benchmark: BenchmarkFixture, arrow_multibatch: pa.Table, threads: int) -> None:
    """Heavy-aggregate scan of the multi-batch Arrow table at each thread count in THREADS."""
    _bench_heavy_scan(benchmark, arrow_multibatch, threads)


@pytest.mark.parametrize("threads", THREADS)
def test_scan_pandas_parallel(benchmark: BenchmarkFixture, pandas_frame: pd.DataFrame, threads: int) -> None:
    """Heavy-aggregate scan of the pandas frame at each thread count in THREADS."""
    _bench_heavy_scan(benchmark, pandas_frame, threads)


# Parallel UDF: the engine scans a MATERIALIZED table (range() does not parallelize) and invokes a Python UDF
# from multiple worker threads. Native = per-row call under the GIL (GIL tax); arrow = per-chunk convert.
+ + +@pytest.mark.parametrize("threads", THREADS) +def test_udf_native_parallel(benchmark: BenchmarkFixture, threads: int) -> None: + con = duckdb.connect(config={"threads": threads}) + try: + con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_NATIVE}) s(i)") # materialized -> parallel scan + con.create_function("pyf", lambda x: (x * 2 + 1) % 97, [BIGINT], BIGINT) + con.execute("SELECT sum(pyf(a)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(pyf(a)) FROM t").fetchall()) + finally: + con.close() + + +@pytest.mark.parametrize("threads", THREADS) +def test_udf_arrow_parallel(benchmark: BenchmarkFixture, threads: int) -> None: + con = duckdb.connect(config={"threads": threads}) + try: + con.execute(f"CREATE TABLE t AS SELECT i AS a FROM range({N_UDF_ARROW}) s(i)") # materialized -> parallel scan + con.create_function("af", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow") + con.execute("SELECT sum(af(a)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(af(a)) FROM t").fetchall()) + finally: + con.close() diff --git a/benchmarks/test_engine_control_perf.py b/benchmarks/test_engine_control_perf.py new file mode 100644 index 00000000..faee4de4 --- /dev/null +++ b/benchmarks/test_engine_control_perf.py @@ -0,0 +1,50 @@ +"""Pure-engine floor (no Python egress): the binding-fraction reference. See benchmarks/README.md. + +`SELECT sum(...) FROM range(N)` aggregates to one scalar, so the fetch is negligible: these measure SQL compile + +the engine aggregate with ~zero per-row egress. Comparing a produce/fetch bench against the matching-N floor here +quantifies how much of its cost is binding vs engine. Informational (they measure the engine), never gated. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +pytestmark = pytest.mark.informational + +# N matched to the benches these floor, and routed through scaled() with the SAME base N, so the floor and its +# bench stay at an identical scaled N and the binding fraction stays valid. The 2048 small-N floor is NOT scaled. +Q_1C_SMALL = "SELECT sum(i::BIGINT) FROM range(2048) t(i)" # small-N gate floor (compile-dominated) +Q_1C_100K = f"SELECT sum(i::BIGINT) FROM range({scaled(100_000)}) t(i)" # types-matrix numeric-df floor +Q_1C_200K = f"SELECT sum(i::BIGINT) FROM range({scaled(200_000)}) t(i)" # fetch / native-UDF floor +Q_2C_500K = ( # produce/ingest floor + f"SELECT sum(a), sum(b) FROM (SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({scaled(500_000)}) t(i))" +) + + +def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: + con.execute(query).fetchall() # warm + benchmark(lambda: con.execute(query).fetchall()) + + +def test_engine_sum_1col_small(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench(benchmark, con, Q_1C_SMALL) + + +def test_engine_sum_1col_100k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench(benchmark, con, Q_1C_100K) + + +def test_engine_sum_1col_200k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench(benchmark, con, Q_1C_200K) + + +def test_engine_sum_2col_500k(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench(benchmark, con, Q_2C_500K) diff --git a/benchmarks/test_fetch_perf.py b/benchmarks/test_fetch_perf.py new file mode 100644 index 00000000..1aa5f4fe --- /dev/null +++ b/benchmarks/test_fetch_perf.py @@ -0,0 +1,120 @@ +"""OUT-row fetch: fetchall, fetchone/fetchmany loops, wide/expensive scalar types. 
See benchmarks/README.md.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +# gate: OUT-row fetch materializes every row to Python (binding-dominated); the range() scan is cheap. +pytestmark = pytest.mark.gate + +# scaled() shrinks N under BENCH_SCALE in the CI sweep; full N locally. The range(2048) *_gate probes are the +# compile+fetch fixed-cost baseline and are deliberately NOT scaled. +N_ROW = scaled(200_000) # numeric fetch (BIGINT/INTEGER/DOUBLE/2col/null/decimal128) +N_STR = scaled(100_000) # varchar/blob/mixed-wide/timestamptz + fetchone/fetchmany loops +N_NEST = scaled(50_000) # heterogeneous scalar/list/struct row + + +def _bench_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None: + con.execute(query).fetchall() # warm the engine before measuring + benchmark(lambda: con.execute(query).fetchall()) + + +def test_fetchall_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a FROM range({N_ROW}) t(i)") + + +def test_fetchall_smallint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT (i % 100)::INTEGER AS a FROM range({N_ROW}) t(i)") + + +def test_fetchall_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT (i * 1.5)::DOUBLE AS a FROM range({N_ROW}) t(i)") + + +def test_fetchall_2int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range({N_ROW}) t(i)") + + +def test_fetchall_str(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT ('str_value_' || i) AS s FROM range({N_STR}) t(i)") + + +def 
test_fetchall_mixed(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + query = ( + "SELECT i::BIGINT AS bi, ('str_' || i) AS s, [i, i + 1, i + 2] AS lst, " + f"{{'a': i, 'b': i + 1}} AS st FROM range({N_NEST}) t(i)" + ) + _bench_fetchall(benchmark, con, query) + + +def test_fetchone_iter(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" + + def run() -> None: + rel = con.execute(query) + while rel.fetchone() is not None: + pass + + benchmark(run) + + +# small-N *_gate variants: at range(2048) the measured region is ~60% SQL compile + engine, ~40% fetch, so these +# catch a fixed-cost regression (not a pure per-row one). Plus expensive scalar types (timestamptz pytz-per-row, +# blob, null-heavy), a heterogeneous per-cell-dispatch row, and the batched fetchmany loop. + + +def test_fetchall_int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a FROM range(2048) t(i)") + + +def test_fetchall_2int_gate(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, "SELECT i::BIGINT AS a, (i + 1)::BIGINT AS b FROM range(2048) t(i)") + + +def test_fetchall_null_heavy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END FROM range({N_ROW}) t(i)") + + +def test_fetchall_timestamptz(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall( + benchmark, con, f"SELECT (TIMESTAMPTZ '2020-01-01' + (i * INTERVAL 1 SECOND)) FROM range({N_STR}) t(i)" + ) + + +def test_fetchall_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT ((i * 1.5)::DECIMAL(28, 6)) FROM range({N_ROW}) t(i)") + + +def test_fetchall_blob(benchmark: 
BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + _bench_fetchall(benchmark, con, f"SELECT ('blob_value_' || i)::BLOB FROM range({N_STR}) t(i)") + + +def test_fetchall_mixed_wide(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + # heterogeneous row: per-cell type dispatch in the Fetchone loop (distinct branch/cache profile from the + # homogeneous single-type columns above) + query = ( + "SELECT (i::HUGEINT * 1000000000000) AS h, gen_random_uuid() AS u, " + f"((i * 1.5)::DECIMAL(28, 6)) AS d, ('string_' || i) AS s FROM range({N_STR}) t(i)" + ) + _bench_fetchall(benchmark, con, query) + + +def test_fetchmany_batched(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + query = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N_STR}) t(i)" + + def run() -> None: + rel = con.execute(query) + while True: + rows = rel.fetchmany(10_000) + if not rows: + break + + benchmark(run) diff --git a/benchmarks/test_ingest_native_perf.py b/benchmarks/test_ingest_native_perf.py new file mode 100644 index 00000000..3478ea1c --- /dev/null +++ b/benchmarks/test_ingest_native_perf.py @@ -0,0 +1,85 @@ +"""Native Python-object ingest: values() list/tuple/dict, executemany. See benchmarks/README.md. + +Every cell goes through TransformPythonValue; dicts recurse to STRUCT; executemany re-binds per row. Note: one +list arg to values() is ONE row whose columns are the list items, so a list of N items transforms N cells. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +EXECMANY_N = scaled(20_000) # executemany re-binds + executes per row, keep moderate +WIDE_N = scaled(10_000) # values() builds a 1-row x N-col relation; cap N so the binder stays sane + +# gate: native ingest eagerly transforms every cell / re-binds per row; the engine side is negligible. 
pytestmark = pytest.mark.gate


@pytest.fixture(scope="module")
def rows_3col() -> list[tuple[int, float, str]]:
    """EXECMANY_N parameter tuples of (int, float, str), built once per module."""
    return [(i, i * 1.5, f"str_value_{i}") for i in range(EXECMANY_N)]


@pytest.fixture(scope="module")
def scalars_wide() -> list[int]:
    """Flat list of WIDE_N ints; passed as the single list argument to ``values()``."""
    return list(range(WIDE_N))


@pytest.fixture(scope="module")
def tuples_wide() -> list[tuple[int, int, int]]:
    """WIDE_N three-int tuples; each tuple is one cell of the ``values()`` argument list."""
    return [(i, i + 1, i + 2) for i in range(WIDE_N)]


@pytest.fixture(scope="module")
def dicts_wide() -> list[dict[str, int | str]]:
    """WIDE_N dicts with keys ``a``/``b`` (int) and ``c`` (str); each dict is one cell for ``values()``."""
    return [{"a": i, "b": i + 1, "c": f"s{i}"} for i in range(WIDE_N)]


# executemany: bind + execute one parameter set per row, into a real table (CREATE OR REPLACE so it doesn't grow).


def test_ingest_executemany_3col(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, rows_3col: list[tuple[int, float, str]]
) -> None:
    """Benchmark recreating table ``t`` and inserting every ``rows_3col`` tuple via ``executemany``."""
    con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
    con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)  # warm

    def run() -> None:
        # the CREATE OR REPLACE is inside the measured callable, so every round starts from an empty table
        con.execute("CREATE OR REPLACE TABLE t (a BIGINT, b DOUBLE, c VARCHAR)")
        con.executemany("INSERT INTO t VALUES (?, ?, ?)", rows_3col)

    benchmark(run)


# values(): EAGER per-cell TransformPythonValue. Drain with fetchall to complete the round-trip.
+ + +def test_ingest_values_scalars( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, scalars_wide: list[int] +) -> None: + con.values(scalars_wide).fetchall() # warm + benchmark(lambda: con.values(scalars_wide).fetchall()) + + +def test_ingest_values_tuples( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, tuples_wide: list[tuple[int, int, int]] +) -> None: + # each tuple cell -> LIST value (TransformPythonValue recursion) + con.values(tuples_wide).fetchall() # warm + benchmark(lambda: con.values(tuples_wide).fetchall()) + + +def test_ingest_values_dicts( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, dicts_wide: list[dict[str, int | str]] +) -> None: + # each dict cell -> STRUCT value (TransformDictionaryToStruct recursion) + con.values(dicts_wide).fetchall() # warm + benchmark(lambda: con.values(dicts_wide).fetchall()) diff --git a/benchmarks/test_ingest_numpy_perf.py b/benchmarks/test_ingest_numpy_perf.py new file mode 100644 index 00000000..61244d2c --- /dev/null +++ b/benchmarks/test_ingest_numpy_perf.py @@ -0,0 +1,111 @@ +"""numpy ingest: object-string scan, NaN-to-NULL, masked scan, analyzer bind. See benchmarks/README.md. + +Gotchas: the object-string bench MUST mix ASCII + non-ASCII + a null or it misses the transcode ladder (see +README traps); analyzer bind is the one place count(*) is correct (cost is at bind, not scan). +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +import numpy as np +import pandas as pd + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +# scaling changes ONLY the row count, never the mixed ASCII+non-ASCII+null pattern below. 
N = scaled(500_000)
ANALYZER_N = scaled(200_000)

_NP_INT_COL = np.arange(N, dtype="int64")
_NP_FLOAT_COL = np.arange(N, dtype="float64") * 1.5
NPDICT = {"a": _NP_INT_COL, "b": _NP_FLOAT_COL}

# mixed ASCII + non-ASCII + null sentinel -> forces the transcode + null-detection ladder (NOT ASCII-only)
_MIXED = ["ascii_value_", "café_", "naïve_", "日本語_", None]


def _mixed_string(i: int) -> str | None:
    """Return the i-th mixed string: the cycling ``_MIXED`` prefix followed by ``i``, or None for the null slot."""
    prefix = _MIXED[i % 5]
    if prefix is None:
        return None
    return f"{prefix}{i}"


_MIXED_STRINGS = [_mixed_string(i) for i in range(N)]


# mixed python types in an object column -> the analyzer must sample/widen through the type ladder at bind
def _mixed_value(i: int) -> int | float | str:
    """Return the i-th mixed-type value: int, float or str depending on ``i % 3``."""
    slot = i % 3
    if slot == 0:
        return i
    if slot == 1:
        return float(i)
    return f"s{i}"


_MIXED_TYPES = [_mixed_value(i) for i in range(ANALYZER_N)]

# READ (sum over a registered frame) is engine-aggregate dominated -> informational. The analyzer BIND (count(*),
# no scan) is a pure per-bind binding cost -> gate.


@pytest.fixture(scope="module")
def df_double_with_nan() -> pd.DataFrame:
    """Single float64 column ``a`` of N values with every 10th entry set to NaN."""
    values = np.arange(N, dtype="float64") * 1.5
    values[::10] = np.nan  # real NaNs -> NaN-to-NULL conversion loop
    return pd.DataFrame({"a": values})


@pytest.fixture(scope="module")
def df_object_string_mixed() -> pd.DataFrame:
    """Single object-dtype column ``s`` holding the ASCII / non-ASCII / None string mix."""
    strings = pd.array(_MIXED_STRINGS, dtype=object)
    return pd.DataFrame({"s": strings})


@pytest.fixture(scope="module")
def df_masked_int() -> pd.DataFrame:
    """Single nullable ``Int64`` column ``a`` of N values with every 10th entry set to ``pd.NA``."""
    # pandas nullable Int64 -> numpy values + validity mask -> ScanNumpyMasked + ApplyMask
    nullable = pd.array(np.arange(N), dtype="Int64")
    nullable[::10] = pd.NA
    return pd.DataFrame({"a": nullable})


@pytest.fixture(scope="module")
def df_object_mixed_types() -> pd.DataFrame:
    """Single object-dtype column ``v`` cycling through int, float and str values."""
    mixed = pd.array(_MIXED_TYPES, dtype=object)
    return pd.DataFrame({"v": mixed})


# READ: numpy -> duckdb. sum/length force a full scan.
def _bench_scan(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
    """Run ``query`` once untimed, then benchmark the same execute + fetchall."""

    def scan() -> list:
        return con.execute(query).fetchall()

    scan()  # warm
    benchmark(scan)


@pytest.mark.informational
def test_read_numpy_dict_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """Sum both columns of the module-level ``NPDICT`` mapping of numpy arrays."""
    con.register("npdict", NPDICT)  # register explicitly, not via replacement-scan frame inspection
    _bench_scan(benchmark, con, "SELECT sum(a), sum(b) FROM npdict")


@pytest.mark.informational
def test_read_numpy_double_with_nan(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_double_with_nan: pd.DataFrame
) -> None:
    """Sum the float64 column that carries NaN entries."""
    con.register("t", df_double_with_nan)
    _bench_scan(benchmark, con, "SELECT sum(a) FROM t")


@pytest.mark.informational
def test_read_numpy_masked_int(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_masked_int: pd.DataFrame
) -> None:
    """Sum the nullable ``Int64`` column that carries ``pd.NA`` entries."""
    con.register("t", df_masked_int)
    _bench_scan(benchmark, con, "SELECT sum(a) FROM t")


@pytest.mark.informational
def test_read_numpy_object_string_mixed(
    benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_string_mixed: pd.DataFrame
) -> None:
    """Count and sum the lengths of the mixed ASCII / non-ASCII / null object-string column."""
    con.register("t", df_object_string_mixed)
    _bench_scan(benchmark, con, "SELECT count(s), sum(length(s)) FROM t")


# BIND: PandasAnalyzer sampling cost. count(*) is correct HERE ONLY: the cost is at bind, so forcing a scan would
# drown the per-bind signal. Re-binds the object column each call.
+ + +@pytest.mark.gate +def test_bind_analyzer_object( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_object_mixed_types: pd.DataFrame +) -> None: + con.register("t", df_object_mixed_types) + con.execute("SELECT count(*) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT count(*) FROM t").fetchall()) diff --git a/benchmarks/test_pandas_perf.py b/benchmarks/test_pandas_perf.py new file mode 100644 index 00000000..4edc78dc --- /dev/null +++ b/benchmarks/test_pandas_perf.py @@ -0,0 +1,133 @@ +"""pandas read/write, numpy-backed vs arrow-backed frames. See benchmarks/README.md. + +Column backing selects the path: numpy-backed -> NumpyArray scan; arrow-backed (ArrowDtype) -> zero-copy arrow. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pytest +from _scale import scaled + +import numpy as np +import pandas as pd + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + + import duckdb + +N = scaled(500_000) +WRITE_Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" +WRITE_Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" +_STRINGS = [f"str_value_{i}" for i in range(N)] + +# READ (sum over a registered frame) is engine-aggregate dominated -> informational. Only the NUMPY-backed df() +# WRITE is binding-dominated -> gate; the arrow-backed WRITE goes through pyarrow's to_pandas -> informational. 
+ + +@pytest.fixture(scope="module") +def df_numpy_numeric() -> pd.DataFrame: + return pd.DataFrame({"a": np.arange(N, dtype="int64"), "b": np.arange(N, dtype="float64") * 1.5}) + + +@pytest.fixture(scope="module") +def df_numpy_string() -> pd.DataFrame: + # explicit object dtype -> the reworked numpy-backed object-string / analyzer path + return pd.DataFrame({"s": pd.array(_STRINGS, dtype=object)}) + + +@pytest.fixture(scope="module") +def df_arrow_numeric() -> pd.DataFrame: + return pd.DataFrame( + { + "a": pd.array(np.arange(N), dtype=pd.ArrowDtype(pa.int64())), + "b": pd.array(np.arange(N) * 1.5, dtype=pd.ArrowDtype(pa.float64())), + } + ) + + +@pytest.fixture(scope="module") +def df_arrow_string() -> pd.DataFrame: + return pd.DataFrame({"s": pd.array(_STRINGS, dtype=pd.ArrowDtype(pa.string()))}) + + +# READ: pandas -> duckdb. sum/length force a full scan. + + +@pytest.mark.informational +def test_read_pandas_numpy_numeric( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_numeric: pd.DataFrame +) -> None: + con.register("t", df_numpy_numeric) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) + + +@pytest.mark.informational +def test_read_pandas_numpy_string( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_numpy_string: pd.DataFrame +) -> None: + con.register("t", df_numpy_string) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) + + +@pytest.mark.informational +def test_read_pandas_arrow_numeric( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_numeric: pd.DataFrame +) -> None: + con.register("t", df_arrow_numeric) + con.execute("SELECT sum(a), sum(b) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT sum(a), sum(b) FROM t").fetchall()) + + +@pytest.mark.informational +def 
test_read_pandas_arrow_string( + benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, df_arrow_string: pd.DataFrame +) -> None: + con.register("t", df_arrow_string) + con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall() # warm + benchmark(lambda: con.execute("SELECT count(s), sum(length(s)) FROM t").fetchall()) + + +# WRITE: duckdb -> pandas. df() is the reworked numpy-backed path; the arrow-backed frame goes via +# duckdb-arrow + pyarrow.to_pandas(ArrowDtype). Both eagerly materialize the whole frame. + + +@pytest.mark.gate +def test_write_pandas_numpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_NUM).df()) + + +@pytest.mark.gate +def test_write_pandas_numpy_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_STR).df()) + + +@pytest.mark.gate +def test_write_pandas_numpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + # REAL nulls -> the masked_array build + masked-to-pd.NA rewrite the cutover reworked (see README traps) + q = ( + "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " + f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)" + ) + benchmark(lambda: con.sql(q).df()) + + +@pytest.mark.gate +def test_write_pandas_numpy_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + q = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)" + benchmark(lambda: con.sql(q).df()) + + +@pytest.mark.informational # to_pandas() half is pyarrow library code +def test_write_pandas_arrow_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_NUM).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) + + +@pytest.mark.informational # to_pandas() half is pyarrow library code +def test_write_pandas_arrow_string(benchmark: 
BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None: + benchmark(lambda: con.sql(WRITE_Q_STR).to_arrow_table().to_pandas(types_mapper=pd.ArrowDtype)) diff --git a/benchmarks/test_produce_numpy_perf.py b/benchmarks/test_produce_numpy_perf.py new file mode 100644 index 00000000..f7a103da --- /dev/null +++ b/benchmarks/test_produce_numpy_perf.py @@ -0,0 +1,148 @@ +"""Columnar produce: df(), fetchnumpy(), fetch_df_chunk(), per type, null vs no-null. See benchmarks/README.md. + +Covers the with-NULLS masked_array branch, datetime, and wide-internal types (hugeint/uuid/decimal128). +""" + +from __future__ import annotations + +import gc +import sys +import tracemalloc +from typing import TYPE_CHECKING + +import pytest +from _scale import scaled + +import duckdb +import numpy as np # noqa: F401 (pinned identically A/B so the env matches the other modules) + +if TYPE_CHECKING: + from pytest_codspeed import BenchmarkFixture + +N = scaled(500_000) +TYPE_N = scaled(200_000) # wide-internal types (hugeint/uuid/decimal128) are heavier per cell + +Q_NUM = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({N}) t(i)" +Q_NUM_NULLS = ( + "SELECT CASE WHEN i % 10 = 0 THEN NULL ELSE i::BIGINT END AS a, " + f"CASE WHEN i % 10 = 0 THEN NULL ELSE (i * 1.5)::DOUBLE END AS b FROM range({N}) t(i)" +) +Q_STR = f"SELECT ('str_value_' || i) AS s FROM range({N}) t(i)" +Q_TS = f"SELECT TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND) AS t FROM range({N}) t(i)" +Q_HUGEINT = f"SELECT (i::HUGEINT * 1000000000000) AS h FROM range({TYPE_N}) t(i)" +Q_UUID = f"SELECT gen_random_uuid() AS u FROM range({TYPE_N}) t(i)" +Q_DEC128 = f"SELECT ((i * 1.5)::DECIMAL(28, 6)) AS d FROM range({TYPE_N}) t(i)" + + +# gate: df()/fetchnumpy() fully materialize numpy-backed columns (ArrayWrapper fill, binding-dominated). 
def _bench_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
    """Run ``query`` through ``df()`` once untimed, then hand the same call to ``benchmark``."""
    con.sql(query).df()  # warm
    benchmark(lambda: con.sql(query).df())


def _bench_numpy(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
    """Run ``query`` through ``fetchnumpy()`` once untimed, then hand the same call to ``benchmark``."""
    con.sql(query).fetchnumpy()  # warm
    benchmark(lambda: con.sql(query).fetchnumpy())


# df(): the production numpy-backed columnar path. no-null vs REAL-null vs string vs timestamp vs wide types.


# NOTE(review): marked informational while the other df() cases in this section are gate -- confirm intended.
@pytest.mark.informational
def test_df_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the two-column BIGINT/DOUBLE query without nulls (Q_NUM)."""
    _bench_df(benchmark, con, Q_NUM)


@pytest.mark.gate
def test_df_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the same two columns with every 10th row NULL (Q_NUM_NULLS)."""
    _bench_df(benchmark, con, Q_NUM_NULLS)  # REAL nulls -> masked_array branch (see README traps)


@pytest.mark.gate
def test_df_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the single generated-string column (Q_STR)."""
    _bench_df(benchmark, con, Q_STR)


@pytest.mark.gate
def test_df_timestamp(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the single TIMESTAMP column stepping one second per row (Q_TS)."""
    _bench_df(benchmark, con, Q_TS)


@pytest.mark.gate
def test_df_hugeint(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the single HUGEINT column at TYPE_N rows (Q_HUGEINT)."""
    _bench_df(benchmark, con, Q_HUGEINT)


@pytest.mark.gate
def test_df_uuid(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the single ``gen_random_uuid()`` column at TYPE_N rows (Q_UUID)."""
    _bench_df(benchmark, con, Q_UUID)


@pytest.mark.gate
def test_df_decimal128(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
    """df() over the single DECIMAL(28, 6) column at TYPE_N rows (Q_DEC128)."""
    _bench_df(benchmark, con, Q_DEC128)


# fetchnumpy(): same FetchNumpyInternal, without the DataFrame wrap.
+
+
+@pytest.mark.informational
+def test_fetchnumpy_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_numpy(benchmark, con, Q_NUM)
+
+
+@pytest.mark.gate
+def test_fetchnumpy_numeric_with_nulls(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    _bench_numpy(benchmark, con, Q_NUM_NULLS)
+
+
+@pytest.mark.informational  # per-chunk streaming drain (GIL-per-chunk), not gated
+def test_fetch_df_chunk_loop(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark draining Q_NUM through repeated ``fetch_df_chunk()`` calls on a single relation."""
+
+    def run() -> int:
+        # One relation per measured call; keep fetching until an empty chunk comes back, and return the
+        # total number of rows seen.
+        rel = con.sql(Q_NUM)
+        rows = 0
+        while True:
+            chunk = rel.fetch_df_chunk()
+            if len(chunk) == 0:
+                break
+            rows += len(chunk)
+        return rows
+
+    con.sql(Q_NUM).fetch_df_chunk()  # warm (fetches a single chunk only, not a full drain)
+    benchmark(run)
+
+
+@pytest.mark.informational  # torch is local-only (importorskip); torch lib work dilutes it
+def test_torch_numeric(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    """Benchmark ``.torch()`` on the Q_NUM projection; skipped when torch is not importable."""
+    pytest.importorskip("torch")
+    # same two columns as Q_NUM, but over TYPE_N rows instead of N
+    q = f"SELECT i::BIGINT AS a, (i * 1.5)::DOUBLE AS b FROM range({TYPE_N}) t(i)"
+    con.sql(q).torch()  # warm
+    benchmark(lambda: con.sql(q).torch())
+
+
+# Memory guard (secondary signal, not a codspeed benchmark; codspeed walltime tracks neither memory nor allocs).
+# tracemalloc captures the PEAK Python-tracked allocation of ONE df()-with-nulls call. reset_peak() runs AFTER
+# the warm so the warm does not set a high-water mark that swallows the measured call. tracemalloc reports bytes
+# on every platform (portable to Linux CI). CAVEAT: it only sees Python-level allocs, not the C numpy buffers, so
+# it catches a gross Python-object blowup (masked-to-pd.NA gone wrong) but is not a total-RSS gate; that is
+# codspeed memory mode's job (deferred, see PLAN.md).
+
+
+def test_mem_df_with_nulls() -> None:
+    """Guard the peak tracemalloc-tracked allocation of one ``df()`` call on the with-NULLs query.
+
+    Runs on its own connection configured with ``threads=1`` (no fixtures). The query is executed once as a
+    warm-up, the peak counter is reset, and only the second call's peak is compared against a fixed 100 MB
+    ceiling. Tracing is always switched off again, even when one of the ``df()`` calls raises.
+    """
+    con = duckdb.connect(config={"threads": 1})
+    try:
+        tracemalloc.start()
+        try:
+            warm = con.sql(Q_NUM_NULLS).df()  # populate one-time import / type caches
+            del warm
+            gc.collect()
+            tracemalloc.reset_peak()  # discount the warm's transient peak BEFORE the measured call
+            out = con.sql(Q_NUM_NULLS).df()
+            # read the peak while `out` is still alive, i.e. before anything from the measured call is freed
+            _current, peak = tracemalloc.get_traced_memory()
+            del out
+        finally:
+            # a failing df() must not leave tracemalloc enabled (and slowing down) the rest of the session
+            tracemalloc.stop()
+    finally:
+        con.close()
+    print(f"\n[mem] df()-with-nulls tracemalloc peak = {peak / 1e6:.1f} MB", file=sys.stderr)
+    # a 500k x 2-col masked df is a few MB of Python-tracked allocs; a gross blowup is tens+ MB. 100 MB ceiling
+    # catches that without flaking.
+    assert peak < 100_000_000
diff --git a/benchmarks/test_relational_construction_perf.py b/benchmarks/test_relational_construction_perf.py
new file mode 100644
index 00000000..bd494c2e
--- /dev/null
+++ b/benchmarks/test_relational_construction_perf.py
@@ -0,0 +1,31 @@
+"""Relational-API expression construction. Informational, out of the binding gate. See benchmarks/README.md.
+
+This is expression *construction* (ColumnExpression / ConstantExpression / operator overloads), not the
+binding-pressure surface the rest of the suite targets. Kept because it carries a real signal (a measured ~35%
+construction delta at the cutover), but never part of the gate.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+
+import duckdb
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+# every test in this module is informational (module-wide mark)
+pytestmark = pytest.mark.informational
+
+
+def test_expr_many(benchmark: BenchmarkFixture) -> None:
+    """Benchmark building 2000 aliased arithmetic expressions with the relational expression classes.
+
+    No connection fixture is involved: the measured callable only constructs expression objects and returns how
+    many it built.
+    """
+
+    def run() -> int:
+        out = []
+        for i in range(2000):
+            # per iteration: one column ref, two constants, one `+`, one `*`, one alias
+            col = duckdb.ColumnExpression(f"col_{i}")
+            const = duckdb.ConstantExpression(i)
+            out.append(((col + const) * duckdb.ConstantExpression(2)).alias(f"a{i}"))
+        return len(out)
+
+    benchmark(run)
diff --git a/benchmarks/test_types_roundtrip_perf.py b/benchmarks/test_types_roundtrip_perf.py
new file mode 100644
index 00000000..9cc8d6b3
--- /dev/null
+++ b/benchmarks/test_types_roundtrip_perf.py
@@ -0,0 +1,72 @@
+"""type x direction produce matrix: fetchall / df / to_arrow per logical type. See benchmarks/README.md.
+
+One logical type per column across three directions, so a regression localizes to (type, direction). Includes the
+wide types the narrow-numeric benches miss: hugeint, uuid, decimal128, long varchar.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+# rows produced by every query in this module: 100_000 passed through scaled() (imported from _scale)
+N = scaled(100_000)
+
+# one logical type per column; long-varchar is intentionally > 64 chars
+# Keys are the parametrize values used by the tests; values are SQL expressions over the range() column `i`.
+# NOTE(review): gen_random_uuid() yields different values on every run -- confirm that does not add noise to the
+# instruction counts of the uuid cases.
+TYPE_EXPR = {
+    "int64": "i::BIGINT",
+    "double": "(i * 1.5)::DOUBLE",
+    "varchar_short": "('str_' || i)",
+    "varchar_long": "('row_' || i || '_' || repeat('payload ', 9))",
+    "date": "DATE '2020-01-01' + (i % 3650)::INTEGER",
+    "bool": "(i % 2 = 0)",
+    "timestamp": "TIMESTAMP '2020-01-01' + (i * INTERVAL 1 SECOND)",
+    "decimal64": "((i::DECIMAL(18, 3)) / 1000)",
+    "decimal128": "((i * 1.5)::DECIMAL(28, 6))",
+    "hugeint": "(i::HUGEINT * 1000000000000)",
+    "uuid": "gen_random_uuid()",
+    "struct": "{'a': i, 'b': i + 1}",
+    "list": "[i, i + 1, i + 2]",
+}
+# the TYPE_EXPR keys, in dict insertion order
+TYPES = list(TYPE_EXPR)
+
+# OUT-col bool/int64 are engine-diluted below the Option-B cutoff (binding_fraction < 0.25, see baseline.json): the
+# numpy column fill is trivial next to the engine scan, so they are informational while the other types stay gate.
+# OUT-row is unaffected (fetchall builds a Python object per cell, binding-dominated for every type).
+_OUT_COL_DILUTED = {"bool", "int64"}
+
+
+def _out_col_mark(type_name: str) -> pytest.MarkDecorator:
+    """Pick the mark for one OUT-col case: informational for the diluted types, gate for every other type."""
+    if type_name in _OUT_COL_DILUTED:
+        return pytest.mark.informational
+    return pytest.mark.gate
+
+
+# one pytest.param per type, in TYPES order, each carrying exactly one mark
+_OUT_COL_PARAMS = [pytest.param(name, marks=_out_col_mark(name)) for name in TYPES]
+
+
+def _query(type_name: str) -> str:
+    """Build the single-column query (column alias ``c``, N rows) for the given TYPE_EXPR key."""
+    expr = TYPE_EXPR[type_name]
+    return f"SELECT {expr} AS c FROM range({N}) t(i)"
+
+
+@pytest.mark.gate  # OUT-row: binding-dominated per-type dispatch
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_row_fetchall(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark ``execute(...).fetchall()`` for one logical type."""
+    sql = _query(type_name)
+    # warm-up: same call once, outside the measured callable
+    con.execute(sql).fetchall()
+    benchmark(lambda: con.execute(sql).fetchall())
+
+
+@pytest.mark.parametrize("type_name", _OUT_COL_PARAMS)  # OUT-col: ArrayWrapper fill; gate per type except diluted ones
+def test_out_col_df(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark ``sql(...).df()`` for one logical type."""
+    sql = _query(type_name)
+    # warm-up: same call once, outside the measured callable
+    con.sql(sql).df()
+    benchmark(lambda: con.sql(sql).df())
+
+
+@pytest.mark.informational  # to_arrow_table re-runs the query GIL-released (engine-parallel, noisy) -> not gated
+@pytest.mark.parametrize("type_name", TYPES)
+def test_out_arrow_table(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, type_name: str) -> None:
+    """Benchmark ``sql(...).to_arrow_table()`` for one logical type."""
+    sql = _query(type_name)
+    # warm-up: same call once, outside the measured callable
+    con.sql(sql).to_arrow_table()
+    benchmark(lambda: con.sql(sql).to_arrow_table())
diff --git a/benchmarks/test_udf_perf.py b/benchmarks/test_udf_perf.py
new file mode 100644
index 00000000..0f381ca7
--- /dev/null
+++ b/benchmarks/test_udf_perf.py
@@ -0,0 +1,103 @@
+"""Python UDFs: native scalar (one call per row) and vectorized arrow (one call per chunk). See benchmarks/README.md.
+
+Each UDF is wrapped in a sum()/length() aggregate so the engine runs it on every row.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pytest
+from _scale import scaled
+
+from duckdb.sqltypes import BIGINT, DOUBLE, VARCHAR
+
+if TYPE_CHECKING:
+    from pytest_codspeed import BenchmarkFixture
+
+    import duckdb
+
+# importorskip at module level: the whole module is skipped when pyarrow is not importable
+pa = pytest.importorskip("pyarrow")
+pc = pytest.importorskip("pyarrow.compute")
+
+NATIVE_N = scaled(200_000)  # native = one Python call per row, keep moderate
+ARROW_N = scaled(1_000_000)  # arrow = one Python call per chunk (vectorized), can be large
+
+
+def _bench(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection, query: str) -> None:
+    """Benchmark ``con.execute(query).fetchall()`` after one warm-up run of the same call.
+
+    Callers register the UDF on ``con`` first; ``query`` is expected to reference it.
+    """
+    con.execute(query).fetchall()  # warm the engine + import caches
+    benchmark(lambda: con.execute(query).fetchall())
+
+
+# NATIVE scalar UDF: per-row TupleBuilder(args) + PyObject_CallObject + TransformPythonObject(result). The Python
+# call dominates; the sum() consume is negligible -> gate.
+
+
+@pytest.mark.gate
+def test_udf_native_int_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # NOTE(review): "add_one" is also registered by test_udf_native_null_inputs; this assumes `con` is a fresh
+    # connection per test -- confirm the fixture scope.
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add_one(i::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_int_2arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("add2", lambda a, b: a + b, [BIGINT, BIGINT], BIGINT)
+    _bench(benchmark, con, f"SELECT sum(add2(i::BIGINT, (i + 1)::BIGINT)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_double_1arg(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("scale", lambda x: x * 1.5, [DOUBLE], DOUBLE)
+    _bench(benchmark, con, f"SELECT sum(scale((i * 1.0)::DOUBLE)) FROM range({NATIVE_N}) t(i)")
+
+
+@pytest.mark.gate
+def test_udf_native_string(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("up", lambda s: s.upper(), [VARCHAR], VARCHAR)
+    # length() turns the VARCHAR result into a number so it can be folded by sum()
+    _bench(
+        benchmark,
+        con,
+        f"SELECT sum(length(up(s))) FROM (SELECT ('str_value_' || i) AS s FROM range({NATIVE_N}) t(i))",
+    )
+
+
+@pytest.mark.gate
+def test_udf_native_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # DEFAULT null handling short-circuits NULL inputs (SetNull) WITHOUT calling the UDF: measures the validity
+    # short-circuit, so the UDF only ever sees non-NULL rows.
+    con.create_function("add_one", lambda x: x + 1, [BIGINT], BIGINT)
+    # every even i is NULL, so half of the input rows are NULL
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({NATIVE_N}) t(i))",
+    )
+
+
+# ARROW (vectorized) UDF: ConvertDataChunkToPyArrowTable -> pc op -> ConvertArrowTableToVector cast. pyarrow lib
+# work + per-chunk conversion + 1M engine -> informational.
+
+
+@pytest.mark.informational
+def test_udf_arrow_int(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # NOTE(review): "arrow_add_one" is also registered by test_udf_arrow_null_inputs; this assumes `con` is a
+    # fresh connection per test -- confirm the fixture scope.
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_add_one(i::BIGINT)) FROM range({ARROW_N}) t(i)")
+
+
+@pytest.mark.informational
+def test_udf_arrow_double(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    con.create_function("arrow_scale", lambda x: pc.multiply(x, 1.5), [DOUBLE], DOUBLE, type="arrow")
+    _bench(benchmark, con, f"SELECT sum(arrow_scale((i * 1.0)::DOUBLE)) FROM range({ARROW_N}) t(i)")
+
+
+@pytest.mark.informational
+def test_udf_arrow_null_inputs(benchmark: BenchmarkFixture, con: duckdb.DuckDBPyConnection) -> None:
+    # DEFAULT null handling on the vectorized path compacts the validity (selvec) before the call and reconstructs
+    # the result vector after: this measures the selvec compaction/reconstruction cost.
+    con.create_function("arrow_add_one", lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    # every even i is NULL, so half of the input rows are NULL
+    _bench(
+        benchmark,
+        con,
+        "SELECT sum(arrow_add_one(v)) FROM "
+        f"(SELECT CASE WHEN i % 2 = 0 THEN NULL ELSE i::BIGINT END AS v FROM range({ARROW_N}) t(i))",
+    )
diff --git a/pyproject.toml b/pyproject.toml
index 53cfa616..12cad096 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -250,6 +250,7 @@ test = [ # dependencies used for running tests
     "pytest-timeout",
     "pytest-timestamper",
     "pytest-xdist", # parallel test execution (-n auto); without this `uv sync --reinstall` prunes a manual install
+    "pytest_codspeed",
     "coverage",
     "gcovr; sys_platform != 'win32' or platform_machine != 'ARM64'",
     "gcsfs; sys_platform != 'win32' or platform_machine != 'ARM64'",
@@ -280,6 +281,20 @@ test = [ # dependencies used for running tests
     "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
     "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
 ]
+bench = [ # Pinned deps for the benchmark suite (see benchmarks/README.md). Minimal, not the heavy `test` group.
+    # Constraints mirror `test` so the lockfile resolves identically; torch/tf are local-only (importorskip).
+    # NOTE(review): the sibling pytest-* entries are spelled with hyphens; "pytest_codspeed" resolves to the same
+    # project after name normalization, but consider "pytest-codspeed" (here and in `test`) for consistency.
+    "pytest",
+    "pytest_codspeed",
+    "polars>=1.33.0",
+    "pytz",
+    "numpy<2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version < '3.12'",
+    "numpy>=2; ( sys_platform != 'win32' or platform_machine != 'ARM64' ) and python_version >= '3.12'",
+    "numpy>=2.3; sys_platform == 'win32' and platform_machine == 'ARM64' and python_version >= '3.11'",
+    "pandas>=3.0.0; python_version > '3.10'",
+    "pandas<3.0.0; python_version < '3.11'",
+    "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+    "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')",
+]
 scripts = [ # dependencies used for running scripts
     "cxxheaderparser",
     "ipython",
@@ -440,6 +455,10 @@ strict = true
     # No need for type hinting in tests
     'ANN001', 'ANN201', 'ANN202'
 ]
+"benchmarks/**.py" = [
+    # benchmarks are test-like: docstrings optional (shared context lives in benchmarks/README.md)
+    'D100', 'D101', 'D102', 'D103', 'D104', 'D105', 'D107',
+]
 "tests/fast/spark/**.py" = [
     "E402"
 ]
diff --git a/tests/fast/test_binding_pressure_leak.py b/tests/fast/test_binding_pressure_leak.py
new file mode 100644
index 00000000..1ffd596c
--- /dev/null
+++ b/tests/fast/test_binding_pressure_leak.py
@@ -0,0 +1,109 @@
+"""Sustained-iteration leak guards for the binding object-pinning paths.
+
+CodSpeed measures per-call cost and can't see a refcount imbalance in the object-pinning graph until it OOMs, so
+this plain assertion test runs each pinning path N times and asserts RSS and object growth stay flat. Covers what
+test_relation_dependency_leak.py does not: register/unregister, native + arrow UDF create/run/remove, executemany.
+"""
+
+import gc
+import os
+
+import pytest
+
+import numpy as np
+import pandas as pd
+
+try:
+    import pyarrow as pa
+
+    can_arrow = True
+except ImportError:
+    can_arrow = False
+
+from duckdb.sqltypes import BIGINT
+
+psutil = pytest.importorskip("psutil")
+
+ITERS = 100
+ROWS = 100_000
+_EM_ROWS = [(i, i * 1.5, f"s{i}") for i in range(5_000)]
+
+
+def _rss_gb():
+    """Return the resident set size of the current process, in units of 10**9 bytes."""
+    this_process = psutil.Process(os.getpid())
+    rss_bytes = this_process.memory_info().rss
+    return rss_bytes / (10**9)
+
+
+def check_flat(fn, cursor, iters=ITERS, obj_slack=20_000):
+    """Run `fn(cursor)` `iters` times and assert that memory did not grow.
+
+    Two samples are taken, each right after a `gc.collect()`: one after a single warm-up call and one after the
+    loop. The check fails when the final RSS is three or more times the starting RSS, or when the number of
+    gc-tracked objects grew by `obj_slack` or more.
+    """
+    # Warm-up call first, so one-time caches already exist when the starting sample is taken.
+    fn(cursor)
+    gc.collect()
+    start_rss = _rss_gb()
+    start_obj = len(gc.get_objects())
+
+    for _iteration in range(iters):
+        fn(cursor)
+
+    gc.collect()
+    end_rss = _rss_gb()
+    end_obj = len(gc.get_objects())
+
+    # Same RSS ratio bound as test_relation_dependency_leak.py: the end value must stay well under 3x the start.
+    assert start_rss > end_rss / 3, f"RSS grew {start_rss:.3f} -> {end_rss:.3f} GB over {iters} iters"
+    # The object-count bound catches a pinned Python object that is too small to show up in RSS.
+    assert obj_slack > end_obj - start_obj, f"tracked objects grew by {end_obj - start_obj} over {iters} iters"
+
+
+# --------------------------------------------------------------------------- #
+# Pinning paths (one full pin/unpin cycle per call).
+# --------------------------------------------------------------------------- #
+
+
+def register_unregister_arrow(cursor):
+    """Register a one-column int64 Arrow table as ``t_reg``, aggregate over it, then unregister it."""
+    column = pa.array(np.arange(ROWS), type=pa.int64())
+    table = pa.table({"a": column})
+    cursor.register("t_reg", table)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def register_unregister_pandas(cursor):
+    """Register a one-column pandas DataFrame as ``t_reg``, aggregate over it, then unregister it."""
+    frame = pd.DataFrame({"a": np.arange(ROWS)})
+    cursor.register("t_reg", frame)
+    cursor.execute("SELECT sum(a) FROM t_reg").fetchall()
+    cursor.unregister("t_reg")
+
+
+def native_udf_cycle(cursor):
+    """Create the native scalar UDF ``f_leak``, run it over 10000 rows, then remove it."""
+    udf_name = "f_leak"
+    cursor.create_function(udf_name, lambda x: x + 1, [BIGINT], BIGINT)
+    cursor.execute("SELECT sum(f_leak(i::BIGINT)) FROM range(10000) t(i)").fetchall()
+    cursor.remove_function(udf_name)
+
+
+def arrow_udf_cycle(cursor):
+    """Create the arrow-type UDF ``af_leak``, run it over 50000 rows, then remove it."""
+    import pyarrow.compute as pc
+
+    udf_name = "af_leak"
+    cursor.create_function(udf_name, lambda x: pc.add(x, 1), [BIGINT], BIGINT, type="arrow")
+    cursor.execute("SELECT sum(af_leak(i::BIGINT)) FROM range(50000) t(i)").fetchall()
+    cursor.remove_function(udf_name)
+
+
+def executemany_cycle(cursor):
+    """Recreate table ``t_em`` and bulk-insert the prebuilt `_EM_ROWS` tuples with one ``executemany`` call."""
+    create_sql = "CREATE OR REPLACE TABLE t_em (a BIGINT, b DOUBLE, c VARCHAR)"
+    insert_sql = "INSERT INTO t_em VALUES (?, ?, ?)"
+    cursor.execute(create_sql)
+    cursor.executemany(insert_sql, _EM_ROWS)
+
+
+def _skip_without_arrow():
+    """Skip the calling test when the module-level pyarrow import failed."""
+    if not can_arrow:
+        pytest.skip("pyarrow not installed")
+
+
+class TestBindingPressureLeak:
+    """One flatness check per pinning cycle; each test passes its cycle function to `check_flat`."""
+
+    def test_register_unregister_arrow_leak(self, duckdb_cursor):
+        _skip_without_arrow()
+        check_flat(register_unregister_arrow, duckdb_cursor)
+
+    def test_register_unregister_pandas_leak(self, duckdb_cursor):
+        check_flat(register_unregister_pandas, duckdb_cursor)
+
+    def test_native_udf_cycle_leak(self, duckdb_cursor):
+        check_flat(native_udf_cycle, duckdb_cursor)
+
+    def test_arrow_udf_cycle_leak(self, duckdb_cursor):
+        _skip_without_arrow()
+        check_flat(arrow_udf_cycle, duckdb_cursor)
+
+    def test_executemany_leak(self, duckdb_cursor):
+        check_flat(executemany_cycle, duckdb_cursor)