From c46bfe6f73349a7064a5c12d967dc45926ce9372 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 11:35:55 +0900
Subject: [PATCH 01/19] Add benchmark comparison gates

Implement repeated benchmark runs and the `fedify bench compare` command so
CI can compare a head ref against a base ref on the same runner.  The
comparison path checks out temporary worktrees, starts the base and head
targets one after the other, and fails when performance regressions exceed
the configured tolerance plus the measured noise band.

Publish benchmark report schema version 3 and comparison report schema
version 1, including fixtures and schema drift guards for both formats.

https://github.com/fedify-dev/fedify/issues/744
https://github.com/fedify-dev/fedify/issues/786

Assisted-by: Codex:gpt-5.5
---
 .../__fixtures__/compare-reports/basic.json   |  83 +++
 .../__fixtures__/reports/inbox-report.json    |   7 +-
 packages/cli/src/bench/action.test.ts         |  51 +-
 packages/cli/src/bench/action.ts              |  51 +-
 packages/cli/src/bench/command.test.ts        |  65 ++
 packages/cli/src/bench/command.ts             | 137 +++-
 packages/cli/src/bench/compare.test.ts        | 235 +++++++
 packages/cli/src/bench/compare.ts             | 623 ++++++++++++++++++
 packages/cli/src/bench/compare/schema.ts      | 146 ++++
 packages/cli/src/bench/mod.ts                 |  11 +-
 packages/cli/src/bench/render/render.test.ts  |   4 +-
 packages/cli/src/bench/result/build.test.ts   |  63 +-
 packages/cli/src/bench/result/build.ts        | 224 ++++++-
 packages/cli/src/bench/result/model.ts        |  18 +-
 packages/cli/src/bench/result/schema.ts       |  62 +-
 .../cli/src/bench/scenario/normalize.test.ts  |  24 +-
 packages/cli/src/bench/scenario/normalize.ts  |   8 +-
 packages/cli/src/bench/schema.test.ts         |   1 +
 packages/cli/src/bench/schemas.ts             |  17 +-
 schema/bench/compare-report-v1.json           | 231 +++++++
 schema/bench/report-v3.json                   | 584 ++++++++++++++++
 21 files changed, 2551 insertions(+), 94 deletions(-)
 create mode 100644 packages/cli/src/bench/__fixtures__/compare-reports/basic.json
 create mode 100644 packages/cli/src/bench/compare.test.ts
 create mode 100644 packages/cli/src/bench/compare.ts
 create mode 100644 packages/cli/src/bench/compare/schema.ts
 create mode 100644 schema/bench/compare-report-v1.json
 create mode 100644 schema/bench/report-v3.json

diff --git a/packages/cli/src/bench/__fixtures__/compare-reports/basic.json b/packages/cli/src/bench/__fixtures__/compare-reports/basic.json
new file mode 100644
index 000000000..75f712a3f
--- /dev/null
+++ b/packages/cli/src/bench/__fixtures__/compare-reports/basic.json
@@ -0,0 +1,83 @@
+{
+  "$schema": "https://json-schema.fedify.dev/bench/compare-report-v1.json",
+  "schemaVersion": 1,
+  "tool": { "name": "@fedify/cli", "version": "2.3.0" },
+  "environment": {
+    "runtime": "deno",
+    "runtimeVersion": "2.5.0",
+    "os": "linux",
+    "cpuCount": 16
+  },
+  "startedAt": "2026-06-04T12:00:00.000Z",
+  "finishedAt": "2026-06-04T12:03:00.000Z",
+  "suite": { "name": "Inbox regression suite", "configHash": "sha256:abc123" },
+  "maxRegression": 0.15,
+  "base": {
+    "ref": "origin/main",
+    "report": {
+      "$schema": "https://json-schema.fedify.dev/bench/report-v3.json",
+      "schemaVersion": 3,
+      "tool": { "name": "@fedify/cli", "version": "2.3.0" },
+      "environment": {
+        "runtime": "deno",
+        "runtimeVersion": "2.5.0",
+        "os": "linux",
+        "cpuCount": 16
+      },
+      "target": {
+        "url": "http://localhost:3000",
+        "fedifyVersion": "2.3.0",
+        "statsAvailable": true
+      },
+      "startedAt": "2026-06-04T12:00:00.000Z",
+      "finishedAt": "2026-06-04T12:01:00.000Z",
+      "suite": {
+        "name": "Inbox regression suite",
+        "configHash": "sha256:abc123"
+      },
+      "passed": true,
+      "scenarios": []
+    }
+  },
+  "head": {
+    "ref": "HEAD",
+    "report": {
+      "$schema": "https://json-schema.fedify.dev/bench/report-v3.json",
+      "schemaVersion": 3,
+      "tool": { "name": "@fedify/cli", "version": "2.3.0" },
+      "environment": {
+        "runtime": "deno",
+        "runtimeVersion": "2.5.0",
+        "os": "linux",
+        "cpuCount": 16
+      },
+      "target": {
+        "url": "http://localhost:3000",
+        "fedifyVersion": "2.3.0",
+        "statsAvailable": true
+      },
+      "startedAt": "2026-06-04T12:02:00.000Z",
+      "finishedAt": "2026-06-04T12:03:00.000Z",
+      "suite": {
+        "name": "Inbox regression suite",
+        "configHash": "sha256:abc123"
+      },
+      "passed": true,
+      "scenarios": []
+    }
+  },
+  "comparisons": [
+    {
+      "scenario": "inbox-shared",
+      "metric": "latency.p95",
+      "direction": "lower-is-better",
+      "base": 91,
+      "head": 94,
+      "regression": 0.03296703296703297,
+      "noiseBand": 0.02,
+      "allowedRegression": 0.16999999999999998,
+      "pass": true
+    }
+  ],
+  "passed": true
+}
diff --git a/packages/cli/src/bench/__fixtures__/reports/inbox-report.json b/packages/cli/src/bench/__fixtures__/reports/inbox-report.json
index b7ca535f8..aea7c800c 100644
--- a/packages/cli/src/bench/__fixtures__/reports/inbox-report.json
+++ b/packages/cli/src/bench/__fixtures__/reports/inbox-report.json
@@ -1,6 +1,6 @@
 {
-  "$schema": "https://json-schema.fedify.dev/bench/report-v2.json",
-  "schemaVersion": 2,
+  "$schema": "https://json-schema.fedify.dev/bench/report-v3.json",
+  "schemaVersion": 3,
   "tool": { "name": "@fedify/cli", "version": "2.3.0" },
   "environment": {
     "runtime": "deno",
@@ -86,7 +86,8 @@
           "pass": true
         }
       ],
-      "passed": true
+      "passed": true,
+      "runCount": 1
     }
   ]
 }
diff --git a/packages/cli/src/bench/action.test.ts b/packages/cli/src/bench/action.test.ts
index b58dd9733..da4be911a 100644
--- a/packages/cli/src/bench/action.test.ts
+++ b/packages/cli/src/bench/action.test.ts
@@ -6,11 +6,12 @@ import test from "node:test";
 import { serve } from "srvx";
 import { spawnBenchmarkTarget } from "../../test/bench/fixture.ts";
 import runBench, { withUserAgent } from "./action.ts";
-import type { BenchCommand } from "./command.ts";
+import type { BenchRunCommand } from "./command.ts";
 
-function command(overrides: Partial<BenchCommand>): BenchCommand {
+function command(overrides: Partial<BenchRunCommand>): BenchRunCommand {
   return {
     command: "bench",
+    mode: "run",
     scenario: "",
     target: undefined,
     format: "json",
@@ -19,7 +20,7 @@ function command(overrides: Partial<BenchCommand>): BenchCommand {
     allowUnsafeTarget: false,
     userAgent: "Fedify-bench-test/1.0",
     ...overrides,
-  } as BenchCommand;
+  } as BenchRunCommand;
 }
 
 async function writeSuite(content: string): Promise<string> {
@@ -170,6 +171,49 @@ test("runBench - dry run prints a plan and sends nothing", async () => {
   }
 });
 
+test("runBench - repeats a scenario according to runs", async () => {
+  const file = await writeSuite(`version: 1
+target: http://127.0.0.1:3000
+scenarios:
+  - name: wf
+    type: webfinger
+    recipient: "acct:alice@example.com"
+    runs: 2
+    load: { concurrency: 1 }
+    duration: 5ms
+`);
+  let code = -1;
+  let output = "";
+  await runBench(command({ scenario: file }), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: (c) => {
+      output = c;
+      return Promise.resolve();
+    },
+    log: () => {},
+    fetch: (input) => {
+      const url = new URL(input instanceof Request ? input.url : input);
+      if (url.pathname === "/.well-known/fedify/bench/stats") {
+        return Promise.resolve(new Response("not found", { status: 404 }));
+      }
+      if (url.pathname === "/.well-known/webfinger") {
+        return Promise.resolve(
+          new Response(JSON.stringify({ subject: "acct:alice@example.com" }), {
+            headers: { "content-type": "application/jrd+json" },
+          }),
+        );
+      }
+      return Promise.resolve(new Response("not found", { status: 404 }));
+    },
+  });
+  const report = JSON.parse(output);
+  assert.strictEqual(code, 0);
+  assert.strictEqual(report.scenarios[0].runCount, 2);
+  assert.strictEqual(report.scenarios[0].runs.length, 2);
+});
+
 test("runBench - dry run reports inbox discovery failures and continues", async () => {
   const target = await spawnBenchmarkTarget();
   try {
@@ -729,6 +773,7 @@ scenarios:
     type: inbox
     recipient: "${new URL("/users/alice", target.url).href}"
     inbox: "https://shared.staging.example/inbox"
+    runs: 1
     load: { concurrency: 2 }
     duration: 250ms
 `);
diff --git a/packages/cli/src/bench/action.ts b/packages/cli/src/bench/action.ts
index afc1f6f01..936ff1c75 100644
--- a/packages/cli/src/bench/action.ts
+++ b/packages/cli/src/bench/action.ts
@@ -4,7 +4,7 @@ import process from "node:process";
 import { getContextLoader, getDocumentLoader } from "../docloader.ts";
 import { describeError } from "../utils.ts";
 import { buildFleet } from "./actor/fleet.ts";
-import type { BenchCommand } from "./command.ts";
+import type { BenchRunCommand } from "./command.ts";
 import {
   type DiscoveredInbox,
   discoverInbox,
@@ -19,6 +19,7 @@ import {
   buildScenarioResult,
   configHash,
   detectEnvironment,
+  type ScenarioMeasurement,
 } from "./result/build.ts";
 import { probeBenchmarkMode } from "./discovery/probe.ts";
 import { renderReport, type ReportFormat } from "./render/index.ts";
@@ -76,7 +77,7 @@ export interface RunBenchDeps {
  * @param deps Injectable dependencies for testing.
  */
 export default async function runBench(
-  command: BenchCommand,
+  command: BenchRunCommand,
   deps: RunBenchDeps = {},
 ): Promise<void> {
   // Set the exit code rather than terminating, so cleanup (closing the fleet)
@@ -299,25 +300,33 @@ export default async function runBench(
     const results = [];
     for (let i = 0; i < suite.scenarios.length; i++) {
       const scenario = suite.scenarios[i];
-      log(`Running scenario "${scenario.name}" (${scenario.type})…`);
-      const measurement = await runners[i].run({
-        scenario,
-        scenarios: suite.scenarios,
-        target: suite.target,
-        documentLoader,
-        contextLoader,
-        allowPrivateAddress,
-        fleet: fleet ?? null,
-        advertiseHost: command.advertiseHost,
-        fetch: fetchImpl,
-        assertDestinationAllowed: (url, gateScenario) =>
-          assertDestinationAllowed(url, gateScenario ?? scenario),
-        assertReadDestinationAllowed: (url, gateScenario) =>
-          assertReadDestinationAllowed(url, gateScenario ?? scenario),
-        assertActorlessDestinationAllowed: (url, gateScenario) =>
-          assertActorlessDestinationAllowed(url, gateScenario ?? scenario),
-      });
-      results.push(buildScenarioResult(scenario, measurement));
+      const measurements: ScenarioMeasurement[] = [];
+      for (let run = 1; run <= scenario.runs; run++) {
+        const suffix = scenario.runs === 1
+          ? ""
+          : ` run ${run}/${scenario.runs}`;
+        log(`Running scenario "${scenario.name}" (${scenario.type})${suffix}…`);
+        measurements.push(
+          await runners[i].run({
+            scenario,
+            scenarios: suite.scenarios,
+            target: suite.target,
+            documentLoader,
+            contextLoader,
+            allowPrivateAddress,
+            fleet: fleet ?? null,
+            advertiseHost: command.advertiseHost,
+            fetch: fetchImpl,
+            assertDestinationAllowed: (url, gateScenario) =>
+              assertDestinationAllowed(url, gateScenario ?? scenario),
+            assertReadDestinationAllowed: (url, gateScenario) =>
+              assertReadDestinationAllowed(url, gateScenario ?? scenario),
+            assertActorlessDestinationAllowed: (url, gateScenario) =>
+              assertActorlessDestinationAllowed(url, gateScenario ?? scenario),
+          }),
+        );
+      }
+      results.push(buildScenarioResult(scenario, measurements));
     }
     const report = buildReport({
       scenarios: results,
diff --git a/packages/cli/src/bench/command.test.ts b/packages/cli/src/bench/command.test.ts
index eb3dad4fb..6f2af0708 100644
--- a/packages/cli/src/bench/command.test.ts
+++ b/packages/cli/src/bench/command.test.ts
@@ -11,6 +11,7 @@ test("benchCommand - scenario file only", () => {
   assert.ok(result.success);
   if (result.success) {
     assert.strictEqual(result.value.command, COMMAND);
+    if (result.value.mode !== "run") assert.fail("Expected run mode.");
     assert.strictEqual(result.value.scenario, FILE);
     assert.strictEqual(result.value.target, undefined);
     assert.strictEqual(result.value.format, "text");
@@ -39,6 +40,7 @@ test("benchCommand - with all options", () => {
   ]);
   assert.ok(result.success);
   if (result.success) {
+    if (result.value.mode !== "run") assert.fail("Expected run mode.");
     assert.strictEqual(result.value.scenario, FILE);
     assert.strictEqual(result.value.target, "http://localhost:3000");
     assert.strictEqual(result.value.format, "json");
@@ -54,6 +56,69 @@ test("benchCommand - missing scenario file fails", () => {
   assert.ok(!result.success);
 });
 
+test("benchCommand - compare mode", () => {
+  const result = parse(benchCommand, [
+    COMMAND,
+    "compare",
+    "--base",
+    "origin/main",
+    "--head",
+    "HEAD",
+    "--file",
+    FILE,
+    "--start-command",
+    "pnpm dev",
+    "--ready-url",
+    "http://127.0.0.1:3000/health",
+    "--ready-timeout",
+    "45s",
+    "--max-regression",
+    "15%",
+    "--format",
+    "markdown",
+    "--target",
+    "http://127.0.0.1:3000",
+    "--allow-unsafe-target",
+    "-u",
+    "MyAgent/1.0",
+  ]);
+  assert.ok(result.success);
+  if (result.success) {
+    assert.strictEqual(result.value.command, COMMAND);
+    if (result.value.mode !== "compare") {
+      assert.fail("Expected compare mode.");
+    }
+    assert.strictEqual(result.value.base, "origin/main");
+    assert.strictEqual(result.value.head, "HEAD");
+    assert.strictEqual(result.value.file, FILE);
+    assert.strictEqual(result.value.startCommand, "pnpm dev");
+    assert.strictEqual(
+      result.value.readyUrl,
+      "http://127.0.0.1:3000/health",
+    );
+    assert.strictEqual(result.value.readyTimeout, "45s");
+    assert.strictEqual(result.value.maxRegression, "15%");
+    assert.strictEqual(result.value.format, "markdown");
+    assert.strictEqual(result.value.target, "http://127.0.0.1:3000");
+    assert.strictEqual(result.value.allowUnsafeTarget, true);
+    assert.strictEqual(result.value.userAgent, "MyAgent/1.0");
+  }
+});
+
+test("benchCommand - compare mode requires refs", () => {
+  const result = parse(benchCommand, [
+    COMMAND,
+    "compare",
+    "--file",
+    FILE,
+    "--start-command",
+    "pnpm dev",
+    "--ready-url",
+    "http://127.0.0.1:3000/health",
+  ]);
+  assert.ok(!result.success);
+});
+
 test("benchCommand - invalid format value fails", () => {
   const result = parse(benchCommand, [COMMAND, FILE, "--format", "xml"]);
   assert.ok(!result.success);
diff --git a/packages/cli/src/bench/command.ts b/packages/cli/src/bench/command.ts
index 1104753b3..fb5e02d41 100644
--- a/packages/cli/src/bench/command.ts
+++ b/packages/cli/src/bench/command.ts
@@ -12,6 +12,7 @@ import {
   object,
   option,
   optional,
+  or,
   string,
   withDefault,
 } from "@optique/core";
@@ -47,52 +48,120 @@ set in a configuration file.`,
   false,
 );
 
-export const benchCommand = command(
-  "bench",
+const outputOption = optional(
+  option("-o", "--output", string({ metavar: "OUTPUT_PATH" }), {
+    description:
+      message`Write the report to a file instead of standard output.`,
+  }),
+);
+
+const targetOption = optional(
+  option("-t", "--target", string({ metavar: "URL" }), {
+    description: message`Override the target URL declared in the suite.`,
+  }),
+);
+
+const advertiseHostOption = optional(
+  option("--advertise-host", string({ metavar: "HOST" }), {
+    description: message`Host (name or IP) a non-loopback target can reach the \
+benchmark's synthetic actor server at.  Required for signed scenarios against a \
+non-loopback target; binds the synthetic server on all interfaces and uses this \
+host in the actor and key URLs the target dereferences.`,
+  }),
+);
+
+const runParser = merge(
+  "Benchmark options",
+  object({
+    command: constant("bench"),
+    mode: constant("run"),
+    scenario: group(
+      "Arguments",
+      argument(string({ metavar: "SCENARIO_FILE" }), {
+        description: message`Path to the benchmark suite file (YAML or JSON).`,
+      }),
+    ),
+    target: targetOption,
+    format: formatOption,
+    output: outputOption,
+    dryRun: withDefault(
+      flag("--dry-run", {
+        description:
+          message`Resolve discovery and print the benchmark plan without \
+sending load.`,
+      }),
+      false,
+    ),
+    advertiseHost: advertiseHostOption,
+    allowUnsafeTarget,
+  }),
+  userAgentOption,
+);
+
+const compareParser = command(
+  "compare",
   merge(
-    "Benchmark options",
+    "Compare options",
     object({
       command: constant("bench"),
-      scenario: group(
-        "Arguments",
-        argument(string({ metavar: "SCENARIO_FILE" }), {
-          description:
-            message`Path to the benchmark suite file (YAML or JSON).`,
-        }),
-      ),
-      target: optional(
-        option("-t", "--target", string({ metavar: "URL" }), {
-          description: message`Override the target URL declared in the suite.`,
-        }),
-      ),
-      format: formatOption,
-      output: optional(
-        option("-o", "--output", string({ metavar: "OUTPUT_PATH" }), {
+      mode: constant("compare"),
+      base: option("--base", string({ metavar: "REF" }), {
+        description: message`The base git ref to benchmark.`,
+      }),
+      head: option("--head", string({ metavar: "REF" }), {
+        description: message`The head git ref to benchmark.`,
+      }),
+      file: option("--file", string({ metavar: "SCENARIO_FILE" }), {
+        description: message`Path to the benchmark suite file (YAML or JSON).`,
+      }),
+      startCommand: option(
+        "--start-command",
+        string({ metavar: "COMMAND" }),
+        {
           description:
-            message`Write the report to a file instead of standard output.`,
-        }),
+            message`Shell command that starts the target application in each \
+checked-out worktree.`,
+        },
       ),
-      dryRun: withDefault(
-        flag("--dry-run", {
-          description:
-            message`Resolve discovery and print the benchmark plan without \
-sending load.`,
+      readyUrl: option("--ready-url", string({ metavar: "URL" }), {
+        description:
+          message`URL that returns success when the started target is ready.`,
+      }),
+      readyTimeout: withDefault(
+        option("--ready-timeout", string({ metavar: "DURATION" }), {
+          description: message`How long to wait for --ready-url.`,
         }),
-        false,
+        "30s",
       ),
-      advertiseHost: optional(
-        option("--advertise-host", string({ metavar: "HOST" }), {
+      maxRegression: option(
+        "--max-regression",
+        string({ metavar: "PERCENT" }),
+        {
           description:
-            message`Host (name or IP) a non-loopback target can reach the \
-benchmark's synthetic actor server at.  Required for signed scenarios against a \
-non-loopback target; binds the synthetic server on all interfaces and uses this \
-host in the actor and key URLs the target dereferences.`,
-        }),
+            message`Maximum regression tolerated after the measured noise band.`,
+        },
       ),
+      target: targetOption,
+      format: formatOption,
+      output: outputOption,
+      dryRun: constant(false),
+      advertiseHost: advertiseHostOption,
       allowUnsafeTarget,
     }),
     userAgentOption,
   ),
+  {
+    brief: message`Compare base and head benchmark runs.`,
+    description:
+      message`Run the same benchmark suite against two git revisions on the \
+same runner, then fail when the head revision regresses beyond the configured \
+tolerance and measured noise band.`,
+  },
+);
+
+export const benchCommand = command(
+  "bench",
+  or(compareParser, runParser),
   {
     brief: message`Benchmark a Fedify federation workload.`,
     description: message`Run an ActivityPub-specific load benchmark against a \
@@ -106,3 +175,5 @@ the suite format.`,
 );
 
 export type BenchCommand = InferValue<typeof benchCommand>;
+export type BenchRunCommand = Extract<BenchCommand, { mode: "run" }>;
+export type BenchCompareCommand = Extract<BenchCommand, { mode: "compare" }>;
diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
new file mode 100644
index 000000000..428373aa6
--- /dev/null
+++ b/packages/cli/src/bench/compare.test.ts
@@ -0,0 +1,235 @@
+import assert from "node:assert/strict";
+import test from "node:test";
+import type { BenchCompareCommand } from "./command.ts";
+import {
+  buildCompareReport,
+  parseRegressionTolerance,
+  runBenchCompare,
+} from "./compare.ts";
+import type { BenchReport, ScenarioResult } from "./result/model.ts";
+
+function scenario(
+  overrides: Partial<ScenarioResult> & { name?: string } = {},
+): ScenarioResult {
+  const base: ScenarioResult = {
+    name: "inbox-shared",
+    type: "inbox",
+    load: {
+      model: "closed",
+      concurrency: 1,
+      durationMs: 100,
+      warmupMs: 0,
+    },
+    requests: { total: 10, ok: 10, failed: 0, successRate: 1 },
+    throughputPerSec: 100,
+    client: {
+      latencyMs: { p50: 50, p95: 100, p99: 110, mean: 60, max: 120 },
+    },
+    server: null,
+    errors: [],
+    expectations: [{
+      metric: "latency.p95",
+      op: "lt",
+      threshold: 250,
+      unit: "ms",
+      actual: 100,
+      severity: "fail",
+      pass: true,
+    }],
+    passed: true,
+    runCount: 3,
+    runs: [
+      runResult(90, 100),
+      runResult(100, 100),
+      runResult(110, 100),
+    ],
+  };
+  return { ...base, ...overrides, name: overrides.name ?? base.name };
+}
+
+function runResult(latencyP95: number, throughput: number) {
+  return {
+    run: 1,
+    requests: { total: 10, ok: 10, failed: 0, successRate: 1 },
+    throughputPerSec: throughput,
+    client: {
+      latencyMs: {
+        p50: latencyP95 / 2,
+        p95: latencyP95,
+        p99: latencyP95,
+        mean: latencyP95 / 2,
+        max: latencyP95,
+      },
+    },
+    server: null,
+    errors: [],
+  };
+}
+
+function report(scenarios: ScenarioResult[]): BenchReport {
+  return {
+    $schema: "https://json-schema.fedify.dev/bench/report-v3.json",
+    schemaVersion: 3,
+    tool: { name: "@fedify/cli", version: "2.3.0" },
+    environment: {
+      runtime: "node",
+      runtimeVersion: "22.0.0",
+      os: "linux",
+      cpuCount: 8,
+    },
+    target: { url: "http://127.0.0.1:3000", statsAvailable: true },
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+    suite: { configHash: "sha256:x" },
+    passed: scenarios.every((s) => s.passed),
+    scenarios,
+  };
+}
+
+function command(overrides: Partial<BenchCompareCommand>): BenchCompareCommand {
+  return {
+    command: "bench",
+    mode: "compare",
+    base: "origin/main",
+    head: "HEAD",
+    file: "scenario.yaml",
+    startCommand: "pnpm dev",
+    readyUrl: "http://127.0.0.1:3000/health",
+    readyTimeout: "30s",
+    maxRegression: "15%",
+    target: undefined,
+    format: "json",
+    output: undefined,
+    dryRun: false,
+    advertiseHost: undefined,
+    allowUnsafeTarget: false,
+    userAgent: "Fedify-bench-test/1.0",
+    ...overrides,
+  };
+}
+
+test("parseRegressionTolerance - parses percentages", () => {
+  assert.strictEqual(parseRegressionTolerance("15%"), 0.15);
+  assert.strictEqual(parseRegressionTolerance("0.2"), 0.2);
+});
+
+test("parseRegressionTolerance - rejects malformed values", () => {
+  assert.throws(() => parseRegressionTolerance("15ms"), RangeError);
+  assert.throws(() => parseRegressionTolerance("-1%"), RangeError);
+  assert.throws(() => parseRegressionTolerance(""), RangeError);
+});
+
+test("buildCompareReport - applies the measured noise band", () => {
+  const base = report([scenario()]);
+  const head = report([
+    scenario({
+      client: {
+        latencyMs: { p50: 50, p95: 114, p99: 120, mean: 60, max: 130 },
+      },
+      runs: [runResult(113, 100), runResult(114, 100), runResult(115, 100)],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.05,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.strictEqual(compare.comparisons.length, 1);
+  assert.strictEqual(compare.comparisons[0].metric, "latency.p95");
+  assert.ok(compare.comparisons[0].pass);
+  assert.strictEqual(compare.passed, true);
+});
+
+test("buildCompareReport - fails regressions outside tolerance and noise", () => {
+  const base = report([
+    scenario({
+      expectations: [],
+      runs: [runResult(100, 100), runResult(100, 100), runResult(100, 100)],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      expectations: [],
+      throughputPerSec: 80,
+      runs: [runResult(100, 80), runResult(100, 80), runResult(100, 80)],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.1,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  const throughput = compare.comparisons.find((c) =>
+    c.metric === "throughputPerSec"
+  );
+  assert.ok(throughput);
+  assert.strictEqual(throughput.pass, false);
+  assert.strictEqual(compare.passed, false);
+});
+
+test("runBenchCompare - orchestrates worktrees and cleans up", async () => {
+  const events: string[] = [];
+  let code = -1;
+  let output = "";
+  await runBenchCompare(command({ maxRegression: "10%" }), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: (content) => {
+      output = content;
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (ref, label) => {
+      events.push(`worktree:${label}:${ref}`);
+      return Promise.resolve(`/tmp/${label}`);
+    },
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd, startCommand) => {
+      events.push(`start:${cwd}:${startCommand}`);
+      return Promise.resolve({
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: (url, timeoutMs) => {
+      events.push(`ready:${url.href}:${timeoutMs}`);
+      return Promise.resolve();
+    },
+    runBenchInWorktree: ({ cwd, target }) => {
+      events.push(`bench:${cwd}:${target}`);
+      return Promise.resolve(report([scenario()]));
+    },
+  });
+  assert.strictEqual(code, 0);
+  assert.strictEqual(JSON.parse(output).passed, true);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "worktree:base:origin/main",
+    "start:/tmp/base:pnpm dev",
+    "ready:http://127.0.0.1:3000/health:30000",
+    "bench:/tmp/base:http://127.0.0.1:3000",
+    "stop:/tmp/base",
+    "log:Checking out head benchmark ref HEAD…",
+    "worktree:head:HEAD",
+    "start:/tmp/head:pnpm dev",
+    "ready:http://127.0.0.1:3000/health:30000",
+    "bench:/tmp/head:http://127.0.0.1:3000",
+    "stop:/tmp/head",
+    "remove:/tmp/head",
+    "remove:/tmp/base",
+  ]);
+});
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
new file mode 100644
index 000000000..da5390296
--- /dev/null
+++ b/packages/cli/src/bench/compare.ts
@@ -0,0 +1,623 @@
+import { type ChildProcess, spawn } from "node:child_process";
+import { mkdtemp, rm } from "node:fs/promises";
+import { writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import process from "node:process";
+import type { BenchCompareCommand, BenchRunCommand } from "./command.ts";
+import runBench from "./action.ts";
+import type { RunBenchDeps } from "./action.ts";
+import { COMPARE_REPORT_SCHEMA_ID } from "./compare/schema.ts";
+import { parseDuration } from "./scenario/units.ts";
+import type {
+  BenchReport,
+  ScenarioResult,
+  ScenarioRunResult,
+} from "./result/model.ts";
+import { metricUnit } from "./result/expect/metrics.ts";
+import { describeError } from "../utils.ts";
+
+/** A benchmark comparison report. */
+export interface BenchCompareReport {
+  readonly $schema: string;
+  readonly schemaVersion: 1;
+  readonly tool: BenchReport["tool"];
+  readonly environment: BenchReport["environment"];
+  readonly startedAt: string;
+  readonly finishedAt: string;
+  readonly suite: BenchReport["suite"];
+  readonly maxRegression: number;
+  readonly base: CompareSide;
+  readonly head: CompareSide;
+  readonly comparisons: ComparisonResult[];
+  readonly passed: boolean;
+}
+
+/** One side of a comparison. */
+export interface CompareSide {
+  readonly ref: string;
+  readonly report: BenchReport;
+}
+
+/** One metric comparison between base and head. */
+export interface ComparisonResult {
+  readonly scenario: string;
+  readonly metric: string;
+  readonly direction: "lower-is-better" | "higher-is-better";
+  readonly base: number | null;
+  readonly head: number | null;
+  readonly regression: number | null;
+  readonly noiseBand: number;
+  readonly allowedRegression: number;
+  readonly pass: boolean;
+}
+
+/** Dependencies injectable for tests. */
+export interface RunBenchCompareDeps {
+  readonly exit?: (code: number) => void;
+  readonly writeOutput?: (
+    content: string,
+    outputPath: string | undefined,
+  ) => Promise<void>;
+  readonly log?: (message: string) => void;
+  readonly createWorktree?: (
+    ref: string,
+    label: "base" | "head",
+  ) => Promise<string>;
+  readonly removeWorktree?: (path: string) => Promise<void>;
+  readonly startTarget?: (
+    cwd: string,
+    startCommand: string,
+  ) => Promise<StartedTarget>;
+  readonly waitReady?: (url: URL, timeoutMs: number) => Promise<void>;
+  readonly runBenchInWorktree?: (
+    input: RunBenchInWorktreeInput,
+  ) => Promise<BenchReport>;
+  readonly benchDeps?: RunBenchDeps;
+}
+
+/** A started target process. */
+export interface StartedTarget {
+  stop(): Promise<void>;
+}
+
+/** Input to a worktree-local benchmark run. */
+export interface RunBenchInWorktreeInput {
+  readonly cwd: string;
+  readonly command: BenchCompareCommand;
+  readonly target: string;
+}
+
+/** Runs `fedify bench compare`. */
+export async function runBenchCompare(
+  command: BenchCompareCommand,
+  deps: RunBenchCompareDeps = {},
+): Promise<void> {
+  const exit = deps.exit ?? ((code: number) => {
+    process.exitCode = code;
+  });
+  const writeOutput = deps.writeOutput ?? defaultWriteOutput;
+  const log = deps.log ??
+    ((message: string) => process.stderr.write(`${message}\n`));
+  const createWorktree = deps.createWorktree ?? defaultCreateWorktree;
+  const removeWorktree = deps.removeWorktree ?? defaultRemoveWorktree;
+  const startTarget = deps.startTarget ?? defaultStartTarget;
+  const waitReady = deps.waitReady ?? defaultWaitReady;
+  const runBenchInWorktree = deps.runBenchInWorktree ??
+    ((input) => defaultRunBenchInWorktree(input, deps.benchDeps));
+
+  let readyUrl: URL;
+  let readyTimeoutMs: number;
+  let maxRegression: number;
+  try {
+    readyUrl = new URL(command.readyUrl);
+    readyTimeoutMs = parseDuration(command.readyTimeout);
+    maxRegression = parseRegressionTolerance(command.maxRegression);
+  } catch (error) {
+    log(describeError(error));
+    return void exit(2);
+  }
+  const target = command.target ?? new URL("/", readyUrl).origin;
+  const worktrees: string[] = [];
+  const startedAt = new Date().toISOString();
+  try {
+    const baseReport = await runSide("base", command.base);
+    const headReport = await runSide("head", command.head);
+    const report = buildCompareReport({
+      baseRef: command.base,
+      headRef: command.head,
+      baseReport,
+      headReport,
+      maxRegression,
+      startedAt,
+      finishedAt: new Date().toISOString(),
+    });
+    await writeOutput(
+      renderCompareReport(report, command.format),
+      command.output,
+    );
+    return void exit(report.passed ? 0 : 1);
+  } catch (error) {
+    log(describeError(error));
+    return void exit(2);
+  } finally {
+    for (const path of worktrees.toReversed()) {
+      try {
+        await removeWorktree(path);
+      } catch (error) {
+        log(
+          `Failed to remove benchmark worktree ${path}: ${
+            describeError(error)
+          }`,
+        );
+      }
+    }
+  }
+
+  async function runSide(
+    label: "base" | "head",
+    ref: string,
+  ): Promise<BenchReport> {
+    log(`Checking out ${label} benchmark ref ${ref}…`);
+    const cwd = await createWorktree(ref, label);
+    worktrees.push(cwd);
+    const targetProcess = await startTarget(cwd, command.startCommand);
+    try {
+      await waitReady(readyUrl, readyTimeoutMs);
+      return await runBenchInWorktree({ cwd, command, target });
+    } finally {
+      await targetProcess.stop();
+    }
+  }
+}
+
+/** Parses `--max-regression`, accepting ratios or percentages. */
+export function parseRegressionTolerance(value: string): number {
+  const trimmed = value.trim();
+  const match = /^(\d+(?:\.\d+)?|\.\d+)(%)?$/.exec(trimmed);
+  const numeric = match == null ? NaN : Number(match[1]);
+  if (!Number.isFinite(numeric) || numeric < 0) {
+    throw new RangeError(
+      `Invalid --max-regression value: ${JSON.stringify(value)}.`,
+    );
+  }
+  return match?.[2] === "%" ? numeric / 100 : numeric;
+}
+
+/** Builds a compare report from two benchmark reports. */
+export function buildCompareReport(input: {
+  readonly baseRef: string;
+  readonly headRef: string;
+  readonly baseReport: BenchReport;
+  readonly headReport: BenchReport;
+  readonly maxRegression: number;
+  readonly startedAt: string;
+  readonly finishedAt: string;
+}): BenchCompareReport {
+  const comparisons = compareReports(
+    input.baseReport,
+    input.headReport,
+    input.maxRegression,
+  );
+  return {
+    $schema: COMPARE_REPORT_SCHEMA_ID,
+    schemaVersion: 1,
+    tool: input.headReport.tool,
+    environment: input.headReport.environment,
+    startedAt: input.startedAt,
+    finishedAt: input.finishedAt,
+    suite: input.headReport.suite,
+    maxRegression: input.maxRegression,
+    base: { ref: input.baseRef, report: input.baseReport },
+    head: { ref: input.headRef, report: input.headReport },
+    comparisons,
+    passed: input.headReport.passed && comparisons.every((c) => c.pass),
+  };
+}
+
+function compareReports(
+  base: BenchReport,
+  head: BenchReport,
+  maxRegression: number,
+): ComparisonResult[] {
+  const baseScenarios = new Map(base.scenarios.map((s) => [s.name, s]));
+  const results: ComparisonResult[] = [];
+  for (const headScenario of head.scenarios) {
+    const baseScenario = baseScenarios.get(headScenario.name);
+    if (baseScenario == null || baseScenario.type !== headScenario.type) {
+      results.push(missingScenario(headScenario.name, maxRegression));
+      continue;
+    }
+    for (const metric of comparisonMetrics(headScenario)) {
+      results.push(
+        compareMetric(baseScenario, headScenario, metric, maxRegression),
+      );
+    }
+  }
+  return results;
+}
+
+function comparisonMetrics(scenario: ScenarioResult): string[] {
+  const fromExpect = scenario.expectations
+    .map((e) => e.metric)
+    .filter(isPerformanceMetric);
+  return [
+    ...new Set(
+      fromExpect.length < 1 ? ["latency.p95", "throughputPerSec"] : fromExpect,
+    ),
+  ];
+}
+
+function isPerformanceMetric(metric: string): boolean {
+  const unit = metricUnit(metric);
+  return unit === "ms" || unit === "rate";
+}
+
+function compareMetric(
+  baseScenario: ScenarioResult,
+  headScenario: ScenarioResult,
+  metric: string,
+  maxRegression: number,
+): ComparisonResult {
+  const direction = metricUnit(metric) === "rate"
+    ? "higher-is-better"
+    : "lower-is-better";
+  const base = metricValue(baseScenario, metric);
+  const head = metricValue(headScenario, metric);
+  const noiseBand = Math.max(
+    relativeNoise(baseScenario, metric),
+    relativeNoise(headScenario, metric),
+  );
+  const regression = base == null || head == null
+    ? null
+    : regressionRatio(base, head, direction);
+  const allowedRegression = maxRegression + noiseBand;
+  return {
+    scenario: headScenario.name,
+    metric,
+    direction,
+    base,
+    head,
+    regression,
+    noiseBand,
+    allowedRegression,
+    pass: regression != null && regression <= allowedRegression,
+  };
+}
+
+function missingScenario(
+  scenario: string,
+  maxRegression: number,
+): ComparisonResult {
+  return {
+    scenario,
+    metric: "scenario",
+    direction: "lower-is-better",
+    base: null,
+    head: null,
+    regression: null,
+    noiseBand: 0,
+    allowedRegression: maxRegression,
+    pass: false,
+  };
+}
+
+function metricValue(
+  scenario: ScenarioResult | ScenarioRunResult,
+  metric: string,
+): number | null {
+  switch (metric) {
+    case "throughputPerSec":
+      return scenario.throughputPerSec;
+    case "deliveryThroughput":
+      return scenario.deliveryThroughputPerSec ?? null;
+  }
+  if (metric.startsWith("latency.")) {
+    return latencyValue(scenario.client.latencyMs, metric.slice(8));
+  }
+  if (metric.startsWith("signatureVerification.")) {
+    return partialValue(
+      scenario.server?.signatureVerificationMs?.overall,
+      metric.slice("signatureVerification.".length),
+    );
+  }
+  if (metric.startsWith("queueDrain.")) {
+    return partialValue(
+      scenario.server?.queue?.drainMs,
+      metric.slice("queueDrain.".length),
+    );
+  }
+  return null;
+}
+
+function latencyValue(
+  latency: ScenarioResult["client"]["latencyMs"],
+  field: string,
+): number | null {
+  switch (field) {
+    case "p50":
+      return latency.p50;
+    case "p95":
+      return latency.p95;
+    case "p99":
+      return latency.p99;
+    case "mean":
+      return latency.mean;
+    case "max":
+      return latency.max;
+    default:
+      return null;
+  }
+}
+
+function partialValue(
+  latency: {
+    readonly p50?: number;
+    readonly p95?: number;
+    readonly p99?: number;
+  } | undefined,
+  field: string,
+): number | null {
+  switch (field) {
+    case "p50":
+      return latency?.p50 ?? null;
+    case "p95":
+      return latency?.p95 ?? null;
+    case "p99":
+      return latency?.p99 ?? null;
+    default:
+      return null;
+  }
+}
+
+function regressionRatio(
+  base: number,
+  head: number,
+  direction: ComparisonResult["direction"],
+): number | null {
+  if (!Number.isFinite(base) || !Number.isFinite(head) || base <= 0) {
+    return base === head ? 0 : null;
+  }
+  return direction === "higher-is-better"
+    ? (base - head) / base
+    : (head - base) / base;
+}
+
+/** Half the min-max spread of per-run values, relative to their median. */
+function relativeNoise(scenario: ScenarioResult, metric: string): number {
+  const values = (scenario.runs ?? [])
+    .map((run) => metricValue(run, metric))
+    .filter((v): v is number => v != null && Number.isFinite(v));
+  if (values.length < 2) return 0;
+  const medianValue = median(values);
+  // A non-positive median has no meaningful relative spread.  Grant no noise
+  // allowance: Infinity would serialize as JSON null and pass any regression.
+  if (medianValue <= 0) return 0;
+  return (Math.max(...values) - Math.min(...values)) / (2 * medianValue);
+}
+
+
+function median(values: readonly number[]): number {
+  const sorted = [...values].sort((a, b) => a - b);
+  const middle = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 1) return sorted[middle];
+  return (sorted[middle - 1] + sorted[middle]) / 2;
+}
+
+function renderCompareReport(
+  report: BenchCompareReport,
+  format: BenchCompareCommand["format"],
+): string {
+  switch (format) {
+    case "json":
+      return `${JSON.stringify(report, null, 2)}\n`;
+    case "markdown":
+      return renderCompareMarkdown(report);
+    case "text":
+      return renderCompareText(report);
+  }
+  throw new RangeError(`Unsupported benchmark report format: ${format}.`);
+}
+
+function renderCompareText(report: BenchCompareReport): string {
+  const lines = [
+    "Fedify benchmark comparison",
+    "",
+    `Base: ${report.base.ref}`,
+    `Head: ${report.head.ref}`,
+    `Maximum regression: ${formatPercent(report.maxRegression)}`,
+    "",
+  ];
+  for (const comparison of report.comparisons) {
+    lines.push(
+      `[${comparison.pass ? "PASS" : "FAIL"}] ${comparison.scenario} ` +
+        `${comparison.metric}: base ${formatNumberOrNull(comparison.base)}, ` +
+        `head ${formatNumberOrNull(comparison.head)}, regression ${
+          formatNumberOrNull(comparison.regression, formatPercent)
+        }, noise ${formatPercent(comparison.noiseBand)}`,
+    );
+  }
+  lines.push("", `Overall: ${report.passed ? "PASS" : "FAIL"}`);
+  return `${lines.join("\n")}\n`;
+}
+
+function renderCompareMarkdown(report: BenchCompareReport): string {
+  const lines = [
+    "# Fedify benchmark comparison",
+    "",
+    `**Result:** ${report.passed ? "PASS" : "FAIL"}`,
+    "",
+    `- **Base:** \`${report.base.ref}\``,
+    `- **Head:** \`${report.head.ref}\``,
+    `- **Maximum regression:** ${formatPercent(report.maxRegression)}`,
+    "",
+    "| Scenario | Metric | Base | Head | Regression | Noise | Result |",
+    "| --- | --- | --- | --- | --- | --- | --- |",
+  ];
+  for (const comparison of report.comparisons) {
+    lines.push(
+      `| ${comparison.scenario} | \`${comparison.metric}\` | ${
+        formatNumberOrNull(comparison.base)
+      } | ${formatNumberOrNull(comparison.head)} | ${
+        formatNumberOrNull(comparison.regression, formatPercent)
+      } | ${formatPercent(comparison.noiseBand)} | ${
+        comparison.pass ? "PASS" : "FAIL"
+      } |`,
+    );
+  }
+  return `${lines.join("\n")}\n`;
+}
+
+function formatNumberOrNull(
+  value: number | null,
+  formatter: (value: number) => string = formatNumber,
+): string {
+  return value == null ? "n/a" : formatter(value);
+}
+
+function formatNumber(value: number): string {
+  if (!Number.isFinite(value)) return String(value);
+  return Number.isInteger(value) ? String(value) : value.toFixed(3);
+}
+
+function formatPercent(value: number): string {
+  if (!Number.isFinite(value)) return String(value);
+  return `${(value * 100).toFixed(1)}%`;
+}
+
+async function defaultRunBenchInWorktree(
+  input: RunBenchInWorktreeInput,
+  benchDeps: RunBenchDeps = {},
+): Promise<BenchReport> {
+  let output = "";
+  let exitCode = 0;
+  const runCommand: BenchRunCommand = {
+    command: "bench",
+    mode: "run",
+    scenario: input.command.file,
+    target: input.target,
+    format: "json",
+    output: undefined,
+    dryRun: false,
+    advertiseHost: input.command.advertiseHost,
+    allowUnsafeTarget: input.command.allowUnsafeTarget,
+    userAgent: input.command.userAgent,
+  };
+  await runBench(runCommand, {
+    ...benchDeps,
+    exit: (code) => {
+      exitCode = code;
+    },
+    writeOutput: (content) => {
+      output = content;
+      return Promise.resolve();
+    },
+  });
+  if (exitCode === 2 || output.trim() === "") {
+    throw new Error(`Benchmark run failed for ${input.cwd}.`);
+  }
+  return JSON.parse(output) as BenchReport;
+}
+
+async function defaultCreateWorktree(
+  ref: string,
+  label: "base" | "head",
+): Promise<string> {
+  const path = await mkdtemp(join(tmpdir(), `fedify-bench-${label}-`));
+  await rm(path, { recursive: true, force: true });
+  await runGit(["worktree", "add", "--detach", path, ref]);
+  return path;
+}
+
+async function defaultRemoveWorktree(path: string): Promise<void> {
+  await runGit(["worktree", "remove", "--force", path]);
+}
+
+function runGit(args: readonly string[]): Promise<void> {
+  return new Promise((resolve, reject) => {
+    const child = spawn("git", args, { stdio: "ignore" });
+    child.on("error", reject);
+    child.on("close", (code) => {
+      if (code === 0) resolve();
+      else reject(new Error(`git ${args.join(" ")} exited with code ${code}`));
+    });
+  });
+}
+
+/** Spawns `startCommand` through a shell in `cwd`; rejects if that fails. */
+function defaultStartTarget(
+  cwd: string,
+  startCommand: string,
+): Promise<StartedTarget> {
+  // Detached off Windows so it can be killed as a group; env is inherited.
+  const detached = process.platform !== "win32";
+  const options = { cwd, detached, shell: true, stdio: "inherit" } as const;
+  return new Promise((resolve, reject) => {
+    const child = spawn(startCommand, options);
+    // Without a listener, a failed spawn's "error" event would be uncaught.
+    child.on("error", reject);
+    child.once("spawn", () => resolve({ stop: () => stopProcess(child) }));
+  });
+}
+
+function stopProcess(child: ChildProcess): Promise<void> {
+  return new Promise((resolve) => {
+    if (child.exitCode != null || child.signalCode != null) {
+      resolve();
+      return;
+    }
+    const timer = setTimeout(() => {
+      killTargetProcess(child, "SIGKILL");
+    }, 5000);
+    child.once("exit", () => {
+      clearTimeout(timer);
+      resolve();
+    });
+    killTargetProcess(child, "SIGTERM");
+  });
+}
+
+function killTargetProcess(
+  child: ChildProcess,
+  signal: NodeJS.Signals,
+): void {
+  if (child.pid == null || process.platform === "win32") {
+    child.kill(signal);
+    return;
+  }
+  try {
+    process.kill(-child.pid, signal);
+  } catch {
+    child.kill(signal);
+  }
+}
+
+/** Polls `url` until it answers 2xx/3xx; probes never outlive `timeoutMs`. */
+async function defaultWaitReady(url: URL, timeoutMs: number): Promise<void> {
+  const deadline = Date.now() + Math.ceil(timeoutMs);
+  let lastError: unknown;
+  for (let ms = deadline - Date.now(); ms >= 0; ms = deadline - Date.now()) {
+    try {
+      const response = await fetch(url, { signal: AbortSignal.timeout(ms) });
+      await response.arrayBuffer().catch(() => {});
+      if (response.status >= 200 && response.status < 400) return;
+      lastError = new Error(`ready URL returned ${response.status}`);
+    } catch (error) {
+      lastError = error;
+    }
+    await new Promise((resolve) => setTimeout(resolve, 250));
+  }
+  const reason = describeError(lastError);
+  throw new Error(`Timed out waiting for ${url.href}: ${reason}.`);
+}
+
+async function defaultWriteOutput(
+  content: string,
+  outputPath: string | undefined,
+): Promise<void> {
+  if (outputPath == null) {
+    process.stdout.write(content.endsWith("\n") ? content : `${content}\n`);
+    return;
+  }
+  await writeFile(outputPath, content, { encoding: "utf-8" });
+}
diff --git a/packages/cli/src/bench/compare/schema.ts b/packages/cli/src/bench/compare/schema.ts
new file mode 100644
index 000000000..b27f4a57d
--- /dev/null
+++ b/packages/cli/src/bench/compare/schema.ts
@@ -0,0 +1,146 @@
+/**
+ * The embedded JSON Schema (draft 2020-12) for benchmark comparison output.
+ *
+ * The comparison report embeds the two benchmark reports it compares; this
+ * schema validates the comparison envelope and checks that the embedded reports
+ * look like current benchmark reports without duplicating the complete report
+ * schema in two published files.
+ * @since 2.3.0
+ * @module
+ */
+
+/** The hosted URL that serves the comparison report schema. */
+export const COMPARE_REPORT_SCHEMA_ID =
+  "https://json-schema.fedify.dev/bench/compare-report-v1.json";
+
+/** The benchmark comparison report JSON Schema (draft 2020-12). */
+export const compareReportSchemaV1 = {
+  $schema: "https://json-schema.org/draft/2020-12/schema",
+  $id: COMPARE_REPORT_SCHEMA_ID,
+  title: "Fedify benchmark comparison report",
+  type: "object",
+  additionalProperties: false,
+  required: [
+    "schemaVersion",
+    "tool",
+    "environment",
+    "startedAt",
+    "finishedAt",
+    "suite",
+    "maxRegression",
+    "base",
+    "head",
+    "comparisons",
+    "passed",
+  ],
+  properties: {
+    $schema: { type: "string" },
+    schemaVersion: { const: 1 },
+    tool: { $ref: "#/$defs/tool" },
+    environment: { $ref: "#/$defs/environment" },
+    startedAt: { type: "string" },
+    finishedAt: { type: "string" },
+    suite: { $ref: "#/$defs/suite" },
+    maxRegression: { type: "number", minimum: 0 },
+    base: { $ref: "#/$defs/compareSide" },
+    head: { $ref: "#/$defs/compareSide" },
+    comparisons: {
+      type: "array",
+      items: { $ref: "#/$defs/comparisonResult" },
+    },
+    passed: { type: "boolean" },
+  },
+  $defs: {
+    tool: {
+      type: "object",
+      additionalProperties: false,
+      required: ["name", "version"],
+      properties: {
+        name: { type: "string" },
+        version: { type: "string" },
+      },
+    },
+    environment: {
+      type: "object",
+      additionalProperties: false,
+      required: ["runtime", "runtimeVersion", "os", "cpuCount"],
+      properties: {
+        runtime: { type: "string" },
+        runtimeVersion: { type: "string" },
+        os: { type: "string" },
+        cpuCount: { type: "integer", minimum: 0 },
+      },
+    },
+    suite: {
+      type: "object",
+      additionalProperties: false,
+      required: ["configHash"],
+      properties: {
+        name: { type: "string" },
+        configHash: { type: "string" },
+      },
+    },
+    benchmarkReport: {
+      type: "object",
+      additionalProperties: true,
+      required: [
+        "$schema",
+        "schemaVersion",
+        "tool",
+        "environment",
+        "target",
+        "suite",
+        "passed",
+        "scenarios",
+      ],
+      properties: {
+        $schema: {
+          const: "https://json-schema.fedify.dev/bench/report-v3.json",
+        },
+        schemaVersion: { const: 3 },
+        tool: { $ref: "#/$defs/tool" },
+        environment: { $ref: "#/$defs/environment" },
+        suite: { $ref: "#/$defs/suite" },
+        passed: { type: "boolean" },
+        scenarios: { type: "array" },
+      },
+    },
+    compareSide: {
+      type: "object",
+      additionalProperties: false,
+      required: ["ref", "report"],
+      properties: {
+        ref: { type: "string" },
+        report: { $ref: "#/$defs/benchmarkReport" },
+      },
+    },
+    comparisonResult: {
+      type: "object",
+      additionalProperties: false,
+      required: [
+        "scenario",
+        "metric",
+        "direction",
+        "base",
+        "head",
+        "regression",
+        "noiseBand",
+        "allowedRegression",
+        "pass",
+      ],
+      properties: {
+        scenario: { type: "string" },
+        metric: { type: "string" },
+        direction: {
+          enum: ["lower-is-better", "higher-is-better"],
+        },
+        base: { type: ["number", "null"] },
+        head: { type: ["number", "null"] },
+        regression: { type: ["number", "null"] },
+        noiseBand: { type: "number", minimum: 0 },
+        allowedRegression: { type: "number", minimum: 0 },
+        pass: { type: "boolean" },
+      },
+    },
+  },
+} as const;
diff --git a/packages/cli/src/bench/mod.ts b/packages/cli/src/bench/mod.ts
index 4ca1ef232..e263d2c64 100644
--- a/packages/cli/src/bench/mod.ts
+++ b/packages/cli/src/bench/mod.ts
@@ -1,2 +1,11 @@
-export { default as runBench } from "./action.ts";
+import runBenchSuite from "./action.ts";
+import { runBenchCompare } from "./compare.ts";
+import type { BenchCommand } from "./command.ts";
+
 export { benchCommand } from "./command.ts";
+
+export function runBench(command: BenchCommand): Promise<void> {
+  return command.mode === "compare"
+    ? runBenchCompare(command)
+    : runBenchSuite(command);
+}
diff --git a/packages/cli/src/bench/render/render.test.ts b/packages/cli/src/bench/render/render.test.ts
index d96b732bc..a534f37b1 100644
--- a/packages/cli/src/bench/render/render.test.ts
+++ b/packages/cli/src/bench/render/render.test.ts
@@ -5,7 +5,7 @@ import { dirname, join } from "node:path";
 import test from "node:test";
 import { fileURLToPath } from "node:url";
 import type { BenchReport } from "../result/model.ts";
-import { reportSchemaV2 } from "../result/schema.ts";
+import { reportSchemaV3 } from "../result/schema.ts";
 import { renderReport } from "./index.ts";
 
 // `import.meta.dirname` needs Node >= 20.11; derive it from the URL instead.
@@ -21,7 +21,7 @@ test("renderReport json - valid JSON that validates against the schema", () => {
   const json = renderReport(report, "json");
   const parsed = JSON.parse(json);
   const validator = new Validator(
-    reportSchemaV2 as unknown as Schema,
+    reportSchemaV3 as unknown as Schema,
     "2020-12",
   );
   assert.ok(validator.validate(parsed).valid);
diff --git a/packages/cli/src/bench/result/build.test.ts b/packages/cli/src/bench/result/build.test.ts
index 4a268dc57..3ed1d2ec3 100644
--- a/packages/cli/src/bench/result/build.test.ts
+++ b/packages/cli/src/bench/result/build.test.ts
@@ -9,7 +9,7 @@ import {
   detectEnvironment,
   type ScenarioMeasurement,
 } from "./build.ts";
-import { reportSchemaV2 } from "./schema.ts";
+import { reportSchemaV3 } from "./schema.ts";
 
 function resolvedInbox() {
   return normalizeSuite({
@@ -50,6 +50,7 @@ test("buildScenarioResult - summarizes load and evaluates expect", () => {
   assert.strictEqual(result.expectations.length, 2);
   assert.ok(result.expectations.every((e) => e.pass));
   assert.strictEqual(result.passed, true);
+  assert.strictEqual(result.runCount, 1);
 });
 
 test("buildScenarioResult - a run that measured nothing never passes", () => {
@@ -70,6 +71,64 @@ test("buildScenarioResult - preserves delivery throughput", () => {
   assert.strictEqual(result.deliveryThroughputPerSec, 42);
 });
 
+test("buildScenarioResult - aggregates repeated runs for CI gates", () => {
+  const scenario = normalizeSuite({
+    version: 1,
+    target: "http://localhost:3000",
+    defaults: {
+      load: { concurrency: 50 },
+      duration: "60s",
+      warmup: "10s",
+      runs: 3,
+    },
+    scenarios: [{
+      name: "inbox-shared",
+      type: "inbox",
+      recipient: "acct:a@x",
+      expect: {
+        successRate: ">= 95%",
+        "latency.p95": "< 250ms",
+        throughputPerSec: ">= 100/s",
+      },
+    }],
+  }).scenarios[0];
+  const result = buildScenarioResult(scenario, [
+    {
+      ...measurement(),
+      requests: { total: 10, ok: 10, failed: 0, successRate: 1 },
+      throughputPerSec: 90,
+      client: {
+        latencyMs: { p50: 10, p95: 100, p99: 110, mean: 20, max: 120 },
+      },
+    },
+    {
+      ...measurement(),
+      requests: { total: 10, ok: 9, failed: 1, successRate: 0.9 },
+      throughputPerSec: 100,
+      client: {
+        latencyMs: { p50: 20, p95: 200, p99: 210, mean: 30, max: 220 },
+      },
+    },
+    {
+      ...measurement(),
+      requests: { total: 10, ok: 10, failed: 0, successRate: 1 },
+      throughputPerSec: 200,
+      client: {
+        latencyMs: { p50: 30, p95: 300, p99: 310, mean: 40, max: 320 },
+      },
+    },
+  ]);
+  assert.strictEqual(result.runCount, 3);
+  assert.strictEqual(result.runs?.length, 3);
+  assert.strictEqual(result.client.latencyMs.p95, 200);
+  assert.strictEqual(result.throughputPerSec, 100);
+  assert.strictEqual(result.requests.successRate, 0.9);
+  assert.strictEqual(result.expectations[0].actual, 0.9);
+  assert.strictEqual(result.expectations[1].actual, 200);
+  assert.strictEqual(result.expectations[2].actual, 100);
+  assert.strictEqual(result.passed, false);
+});
+
 test("buildReport - gate passes only when all scenarios pass", () => {
   const ok = buildScenarioResult(resolvedInbox(), measurement());
   const bad = buildScenarioResult(resolvedInbox(), {
@@ -101,7 +160,7 @@ test("buildReport - output validates against the report schema", () => {
     suite: { name: "suite", configHash: configHash({ a: 1 }) },
   });
   const validator = new Validator(
-    reportSchemaV2 as unknown as Schema,
+    reportSchemaV3 as unknown as Schema,
     "2020-12",
   );
   const result = validator.validate(JSON.parse(JSON.stringify(report)));
diff --git a/packages/cli/src/bench/result/build.ts b/packages/cli/src/bench/result/build.ts
index 9e04d73b3..ef4ffe394 100644
--- a/packages/cli/src/bench/result/build.ts
+++ b/packages/cli/src/bench/result/build.ts
@@ -13,6 +13,7 @@ import { cpus } from "node:os";
 import process from "node:process";
 import metadata from "../../../deno.json" with { type: "json" };
 import type { ResolvedScenario } from "../scenario/normalize.ts";
+import { LogLinearHistogram } from "../metrics/histogram.ts";
 import type { SerializedHistogram } from "../metrics/histogram.ts";
 import { evaluateExpect } from "./expect/evaluate.ts";
 import { REPORT_SCHEMA_ID } from "./schema.ts";
@@ -24,6 +25,7 @@ import type {
   LoadSummary,
   RequestSummary,
   ScenarioResult,
+  ScenarioRunResult,
   ServerMetrics,
   TargetInfo,
 } from "./model.ts";
@@ -48,9 +50,16 @@ export interface ScenarioMeasurement {
  */
 export function buildScenarioResult(
   scenario: ResolvedScenario,
-  measurement: ScenarioMeasurement,
+  measurement: ScenarioMeasurement | readonly ScenarioMeasurement[],
 ): ScenarioResult {
-  const { results, passed } = evaluateExpect(scenario.expect, measurement);
+  const measurements = Array.isArray(measurement) ? measurement : [measurement];
+  if (measurements.length < 1) {
+    throw new RangeError("At least one scenario measurement is required.");
+  }
+  const aggregate = measurements.length === 1
+    ? measurements[0]
+    : aggregateMeasurements(measurements);
+  const { results, passed } = evaluateExpect(scenario.expect, aggregate);
   // A scenario that measured no requests must never pass: an empty sample set
   // makes every `expect` assertion vacuously true (and a missing-metric one
   // could only fail), so without this guard a run that sent nothing would
@@ -59,17 +68,21 @@ export function buildScenarioResult(
     name: scenario.name,
     type: scenario.type,
     load: loadSummary(scenario),
-    requests: measurement.requests,
-    throughputPerSec: measurement.throughputPerSec,
-    ...(measurement.deliveryThroughputPerSec == null ? {} : {
-      deliveryThroughputPerSec: measurement.deliveryThroughputPerSec,
+    requests: aggregate.requests,
+    throughputPerSec: aggregate.throughputPerSec,
+    ...(aggregate.deliveryThroughputPerSec == null ? {} : {
+      deliveryThroughputPerSec: aggregate.deliveryThroughputPerSec,
     }),
-    client: measurement.client,
-    server: measurement.server,
-    errors: measurement.errors,
+    client: aggregate.client,
+    server: aggregate.server,
+    errors: aggregate.errors,
     expectations: results,
-    passed: passed && measurement.requests.total > 0,
-    ...(measurement.histogram ? { histogram: measurement.histogram } : {}),
+    passed: passed && measurements.every((m) => m.requests.total > 0),
+    runCount: measurements.length,
+    ...(measurements.length > 1
+      ? { runs: measurements.map((m, index) => runResult(index + 1, m)) }
+      : {}),
+    ...(aggregate.histogram ? { histogram: aggregate.histogram } : {}),
   };
 }
 
@@ -92,7 +105,7 @@ export interface ReportInput {
 export function buildReport(input: ReportInput): BenchReport {
   return {
     $schema: REPORT_SCHEMA_ID,
-    schemaVersion: 2,
+    schemaVersion: 3,
     tool: { name: "@fedify/cli", version: metadata.version },
     environment: input.environment,
     target: input.target,
@@ -104,6 +117,193 @@ export function buildReport(input: ReportInput): BenchReport {
   };
 }
 
+function aggregateMeasurements(
+  measurements: readonly ScenarioMeasurement[],
+): ScenarioMeasurement {
+  const errors = sumErrorBuckets(measurements.flatMap((m) => m.errors));
+  const total = sum(measurements.map((m) => m.requests.total));
+  const ok = sum(measurements.map((m) => m.requests.ok));
+  const failed = sum(measurements.map((m) => m.requests.failed));
+  const delivery = medianPresent(
+    measurements.map((m) => m.deliveryThroughputPerSec),
+  );
+  return {
+    requests: {
+      total,
+      ok,
+      failed,
+      // Correctness gates are intentionally pessimistic in repeated runs:
+      // one bad run should not be hidden by two clean ones.
+      successRate: Math.min(...measurements.map((m) => m.requests.successRate)),
+    },
+    throughputPerSec: median(measurements.map((m) => m.throughputPerSec)),
+    ...(delivery == null ? {} : { deliveryThroughputPerSec: delivery }),
+    client: {
+      latencyMs: {
+        p50: median(measurements.map((m) => m.client.latencyMs.p50)),
+        p95: median(measurements.map((m) => m.client.latencyMs.p95)),
+        p99: median(measurements.map((m) => m.client.latencyMs.p99)),
+        mean: median(measurements.map((m) => m.client.latencyMs.mean)),
+        max: median(measurements.map((m) => m.client.latencyMs.max)),
+      },
+    },
+    server: aggregateServer(measurements.map((m) => m.server)),
+    errors,
+    ...aggregateHistogram(measurements),
+  };
+}
+
+function runResult(
+  run: number,
+  measurement: ScenarioMeasurement,
+): ScenarioRunResult {
+  return {
+    run,
+    requests: measurement.requests,
+    throughputPerSec: measurement.throughputPerSec,
+    ...(measurement.deliveryThroughputPerSec == null ? {} : {
+      deliveryThroughputPerSec: measurement.deliveryThroughputPerSec,
+    }),
+    client: measurement.client,
+    server: measurement.server,
+    errors: measurement.errors,
+    ...(measurement.histogram ? { histogram: measurement.histogram } : {}),
+  };
+}
+
+function aggregateServer(
+  servers: readonly (ServerMetrics | null)[],
+): ServerMetrics | null {
+  const present = servers.filter((s): s is ServerMetrics => s != null);
+  if (present.length < 1) return null;
+  const signature = aggregateSignatureVerification(present);
+  const queue = aggregateQueue(present);
+  return {
+    ...(signature == null ? {} : { signatureVerificationMs: signature }),
+    ...(queue == null ? {} : { queue }),
+  };
+}
+
+function aggregateSignatureVerification(
+  servers: readonly ServerMetrics[],
+): NonNullable<ServerMetrics["signatureVerificationMs"]> | null {
+  const values = servers
+    .map((s) => s.signatureVerificationMs)
+    .filter((s): s is NonNullable<ServerMetrics["signatureVerificationMs"]> =>
+      s != null
+    );
+  if (values.length < 1) return null;
+  const standards = new Set<string>();
+  for (const value of values) {
+    for (const key of Object.keys(value.byStandard ?? {})) standards.add(key);
+  }
+  const byStandard: Record<string, ReturnType<typeof aggregatePartial>> = {};
+  for (const standard of standards) {
+    byStandard[standard] = aggregatePartial(
+      values.map((v) => v.byStandard?.[standard]),
+    );
+  }
+  return {
+    overall: aggregatePartial(values.map((v) => v.overall)),
+    ...(Object.keys(byStandard).length < 1 ? {} : { byStandard }),
+  };
+}
+
+function aggregateQueue(
+  servers: readonly ServerMetrics[],
+): NonNullable<ServerMetrics["queue"]> | null {
+  const values = servers
+    .map((s) => s.queue)
+    .filter((q): q is NonNullable<ServerMetrics["queue"]> => q != null);
+  if (values.length < 1) return null;
+  const drainMs = aggregatePartial(values.map((v) => v.drainMs));
+  const depths = values.map((v) => v.depthMax).filter(isNumber);
+  return {
+    ...(hasPartial(drainMs) ? { drainMs } : {}),
+    ...(depths.length < 1 ? {} : { depthMax: Math.max(...depths) }),
+  };
+}
+
+type PartialMetric = {
+  readonly p50?: number;
+  readonly p95?: number;
+  readonly p99?: number;
+};
+
+function aggregatePartial(values: readonly (PartialMetric | undefined)[]) {
+  return {
+    ...partialField(values, "p50"),
+    ...partialField(values, "p95"),
+    ...partialField(values, "p99"),
+  };
+}
+
+function partialField(
+  values:
+    readonly ({ readonly [key: string]: number | undefined } | undefined)[],
+  key: "p50" | "p95" | "p99",
+): Record<typeof key, number> | Record<string, never> {
+  const fieldValues = values.map((v) => v?.[key]).filter(isNumber);
+  return fieldValues.length < 1
+    ? {}
+    : { [key]: median(fieldValues) } as Record<typeof key, number>;
+}
+
+function hasPartial(value: {
+  readonly p50?: number;
+  readonly p95?: number;
+  readonly p99?: number;
+}): boolean {
+  return value.p50 != null || value.p95 != null || value.p99 != null;
+}
+
+function aggregateHistogram(
+  measurements: readonly ScenarioMeasurement[],
+): { readonly histogram?: SerializedHistogram } {
+  const histograms = measurements.map((m) => m.histogram);
+  if (histograms.some((h) => h == null)) return {};
+  const [first, ...rest] = histograms as SerializedHistogram[];
+  const merged = LogLinearHistogram.fromJSON(first);
+  for (const histogram of rest) {
+    merged.merge(LogLinearHistogram.fromJSON(histogram));
+  }
+  return { histogram: merged.toJSON() };
+}
+
+function sumErrorBuckets(errors: readonly ErrorBucket[]): ErrorBucket[] {
+  const buckets = new Map<string, ErrorBucket>();
+  for (const error of errors) {
+    const key = `${error.kind}|${error.status ?? ""}|${error.reason}`;
+    const previous = buckets.get(key);
+    buckets.set(key, {
+      ...error,
+      count: (previous?.count ?? 0) + error.count,
+    });
+  }
+  return [...buckets.values()].sort((a, b) => b.count - a.count);
+}
+
+function medianPresent(values: readonly (number | undefined)[]): number | null {
+  const present = values.filter(isNumber);
+  return present.length < 1 ? null : median(present);
+}
+
+function median(values: readonly number[]): number {
+  if (values.length < 1) return 0;
+  const sorted = [...values].sort((a, b) => a - b);
+  const middle = Math.floor(sorted.length / 2);
+  if (sorted.length % 2 === 1) return sorted[middle];
+  return (sorted[middle - 1] + sorted[middle]) / 2;
+}
+
+function sum(values: readonly number[]): number {
+  return values.reduce((a, b) => a + b, 0);
+}
+
+function isNumber(value: number | undefined): value is number {
+  return typeof value === "number" && Number.isFinite(value);
+}
+
 /** Detects the current runtime environment for reproducibility metadata. */
 export function detectEnvironment(): Environment {
   const g = globalThis as {
diff --git a/packages/cli/src/bench/result/model.ts b/packages/cli/src/bench/result/model.ts
index bbdf2d8bc..b02d90c03 100644
--- a/packages/cli/src/bench/result/model.ts
+++ b/packages/cli/src/bench/result/model.ts
@@ -149,15 +149,31 @@ export interface ScenarioResult {
   readonly errors: ErrorBucket[];
   readonly expectations: ExpectResult[];
   readonly passed: boolean;
+  /** The number of runs aggregated into this scenario result. */
+  readonly runCount: number;
+  /** Per-run measurements, present when a scenario was repeated. */
+  readonly runs?: ScenarioRunResult[];
   /** An optional serialized client latency histogram for re-aggregation. */
   readonly histogram?: SerializedHistogram;
 }
 
+/** The measured result of one repeated scenario run. */
+export interface ScenarioRunResult {
+  readonly run: number;
+  readonly requests: RequestSummary;
+  readonly throughputPerSec: number;
+  readonly deliveryThroughputPerSec?: number;
+  readonly client: ClientMetrics;
+  readonly server: ServerMetrics | null;
+  readonly errors: ErrorBucket[];
+  readonly histogram?: SerializedHistogram;
+}
+
 /** A complete benchmark report. */
 export interface BenchReport {
   /** The published report schema URL. */
   readonly $schema?: string;
-  readonly schemaVersion: 2;
+  readonly schemaVersion: 3;
   readonly tool: { readonly name: string; readonly version: string };
   readonly environment: Environment;
   readonly target: TargetInfo;
diff --git a/packages/cli/src/bench/result/schema.ts b/packages/cli/src/bench/result/schema.ts
index 787e4b63c..f35178dfc 100644
--- a/packages/cli/src/bench/result/schema.ts
+++ b/packages/cli/src/bench/result/schema.ts
@@ -10,6 +10,10 @@
 
 /** The hosted URL that serves the report schema. */
 export const REPORT_SCHEMA_ID =
+  "https://json-schema.fedify.dev/bench/report-v3.json";
+
+/** The hosted URL for the version 2 report schema. */
+export const REPORT_SCHEMA_V2_ID =
   "https://json-schema.fedify.dev/bench/report-v2.json";
 
 /** The hosted URL for the original report schema. */
@@ -292,7 +296,7 @@ export const reportSchemaV1 = {
 /** The benchmark report JSON Schema (draft 2020-12). */
 export const reportSchemaV2 = {
   ...reportSchemaV1,
-  $id: REPORT_SCHEMA_ID,
+  $id: REPORT_SCHEMA_V2_ID,
   properties: {
     ...reportSchemaV1.properties,
     schemaVersion: { const: 2 },
@@ -308,3 +312,59 @@ export const reportSchemaV2 = {
     },
   },
 } as const;
+
+/** The current benchmark report JSON Schema (draft 2020-12). */
+export const reportSchemaV3 = {
+  ...reportSchemaV2,
+  $id: REPORT_SCHEMA_ID,
+  properties: {
+    ...reportSchemaV2.properties,
+    schemaVersion: { const: 3 },
+  },
+  $defs: {
+    ...reportSchemaV2.$defs,
+    scenarioRunResult: {
+      type: "object",
+      additionalProperties: false,
+      required: [
+        "run",
+        "requests",
+        "throughputPerSec",
+        "client",
+        "server",
+        "errors",
+      ],
+      properties: {
+        run: { type: "integer", minimum: 1 },
+        requests: { $ref: "#/$defs/requestSummary" },
+        throughputPerSec: { type: "number" },
+        deliveryThroughputPerSec: { type: "number" },
+        client: { $ref: "#/$defs/clientMetrics" },
+        server: {
+          anyOf: [{ $ref: "#/$defs/serverMetrics" }, { type: "null" }],
+        },
+        errors: {
+          type: "array",
+          items: { $ref: "#/$defs/errorBucket" },
+        },
+        histogram: { $ref: "#/$defs/serializedHistogram" },
+      },
+    },
+    scenarioResult: {
+      ...reportSchemaV2.$defs.scenarioResult,
+      required: [
+        ...reportSchemaV2.$defs.scenarioResult.required,
+        "runCount",
+      ],
+      properties: {
+        ...reportSchemaV2.$defs.scenarioResult.properties,
+        runCount: { type: "integer", minimum: 1 },
+        runs: {
+          type: "array",
+          minItems: 2,
+          items: { $ref: "#/$defs/scenarioRunResult" },
+        },
+      },
+    },
+  },
+} as const;
diff --git a/packages/cli/src/bench/scenario/normalize.test.ts b/packages/cli/src/bench/scenario/normalize.test.ts
index 8456a264b..40d0e65a4 100644
--- a/packages/cli/src/bench/scenario/normalize.test.ts
+++ b/packages/cli/src/bench/scenario/normalize.test.ts
@@ -28,7 +28,7 @@ test("normalizeSuite - applies defaults and parses units", () => {
     maxInFlight: undefined,
   });
   assert.strictEqual(s.signing, "pipeline");
-  assert.strictEqual(s.runs, 1);
+  assert.strictEqual(s.runs, 3);
   assert.deepEqual(s.recipients, ["acct:alice@x"]);
 });
 
@@ -240,10 +240,20 @@ test("normalizeSuite - allows warmup shorter than duration", () => {
   assert.strictEqual(s.warmupMs, 9000);
 });
 
-test("normalizeSuite - rejects multiple runs (runs > 1)", () => {
-  assert.throws(
-    () => normalizeSuite(suite({ defaults: { runs: 3 } })),
-    (error: unknown) =>
-      error instanceof SuiteNormalizeError && /runs/.test(error.message),
-  );
+test("normalizeSuite - allows multiple runs", () => {
+  const s = normalizeSuite(suite({ defaults: { runs: 5 } })).scenarios[0];
+  assert.strictEqual(s.runs, 5);
+});
+
+test("normalizeSuite - scenario runs override defaults", () => {
+  const s = normalizeSuite(suite({
+    defaults: { runs: 5 },
+    scenarios: [{
+      name: "wf",
+      type: "webfinger",
+      recipient: "acct:a@x",
+      runs: 2,
+    }],
+  })).scenarios[0];
+  assert.strictEqual(s.runs, 2);
 });
diff --git a/packages/cli/src/bench/scenario/normalize.ts b/packages/cli/src/bench/scenario/normalize.ts
index ab0cbe1fa..56fe20a82 100644
--- a/packages/cli/src/bench/scenario/normalize.ts
+++ b/packages/cli/src/bench/scenario/normalize.ts
@@ -30,7 +30,7 @@ const DEFAULT_DURATION_MS = 60_000;
 const DEFAULT_WARMUP_MS = 0;
 const DEFAULT_RATE_PER_SEC = 50;
 const DEFAULT_SIGNING: SigningMode = "pipeline";
-const DEFAULT_RUNS = 1;
+const DEFAULT_RUNS = 3;
 
 /** The resolved load model for a scenario. */
 export type LoadModel =
@@ -170,12 +170,6 @@ function resolveScenario(scenario: Scenario, suite: Suite): ResolvedScenario {
     );
   }
   const runs = scenario.runs ?? defaults.runs ?? DEFAULT_RUNS;
-  if (runs > 1) {
-    throw new SuiteNormalizeError(
-      `Scenario "${scenario.name}": multiple runs (runs > 1) are not yet ` +
-        "implemented in fedify bench; set runs to 1.",
-    );
-  }
   return {
     name: scenario.name,
     type: scenario.type,
diff --git a/packages/cli/src/bench/schema.test.ts b/packages/cli/src/bench/schema.test.ts
index e3b33834f..c8a7f5ea3 100644
--- a/packages/cli/src/bench/schema.test.ts
+++ b/packages/cli/src/bench/schema.test.ts
@@ -69,6 +69,7 @@ const FIXTURE_GROUPS: readonly FixtureGroup[] = [
   { dir: "scenarios", schema: "scenario", valid: true },
   { dir: "invalid", schema: "scenario", valid: false },
   { dir: "reports", schema: "report", valid: true },
+  { dir: "compare-reports", schema: "compare-report", valid: true },
 ];
 
 function fixtureFiles(dir: string): string[] {
diff --git a/packages/cli/src/bench/schemas.ts b/packages/cli/src/bench/schemas.ts
index 82f9e279b..6574fb793 100644
--- a/packages/cli/src/bench/schemas.ts
+++ b/packages/cli/src/bench/schemas.ts
@@ -9,7 +9,12 @@
  * @module
  */
 
-import { reportSchemaV1, reportSchemaV2 } from "./result/schema.ts";
+import { compareReportSchemaV1 } from "./compare/schema.ts";
+import {
+  reportSchemaV1,
+  reportSchemaV2,
+  reportSchemaV3,
+} from "./result/schema.ts";
 import { scenarioSchemaV1, scenarioSchemaV2 } from "./scenario/schema.ts";
 
 /** A published JSON Schema and where it is hosted. */
@@ -36,6 +41,11 @@ export const PUBLISHED_SCHEMAS: readonly PublishedSchema[] = [
   },
   {
     name: "report",
+    fileName: "report-v3.json",
+    schema: reportSchemaV3 as unknown as Record<string, unknown>,
+  },
+  {
+    name: "report-v2",
     fileName: "report-v2.json",
     schema: reportSchemaV2 as unknown as Record<string, unknown>,
   },
@@ -44,4 +54,9 @@ export const PUBLISHED_SCHEMAS: readonly PublishedSchema[] = [
     fileName: "report-v1.json",
     schema: reportSchemaV1 as unknown as Record<string, unknown>,
   },
+  {
+    name: "compare-report",
+    fileName: "compare-report-v1.json",
+    schema: compareReportSchemaV1 as unknown as Record<string, unknown>,
+  },
 ];
diff --git a/schema/bench/compare-report-v1.json b/schema/bench/compare-report-v1.json
new file mode 100644
index 000000000..56a24efa6
--- /dev/null
+++ b/schema/bench/compare-report-v1.json
@@ -0,0 +1,231 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://json-schema.fedify.dev/bench/compare-report-v1.json",
+  "title": "Fedify benchmark comparison report",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "schemaVersion",
+    "tool",
+    "environment",
+    "startedAt",
+    "finishedAt",
+    "suite",
+    "maxRegression",
+    "base",
+    "head",
+    "comparisons",
+    "passed"
+  ],
+  "properties": {
+    "$schema": {
+      "type": "string"
+    },
+    "schemaVersion": {
+      "const": 1
+    },
+    "tool": {
+      "$ref": "#/$defs/tool"
+    },
+    "environment": {
+      "$ref": "#/$defs/environment"
+    },
+    "startedAt": {
+      "type": "string"
+    },
+    "finishedAt": {
+      "type": "string"
+    },
+    "suite": {
+      "$ref": "#/$defs/suite"
+    },
+    "maxRegression": {
+      "type": "number",
+      "minimum": 0
+    },
+    "base": {
+      "$ref": "#/$defs/compareSide"
+    },
+    "head": {
+      "$ref": "#/$defs/compareSide"
+    },
+    "comparisons": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/comparisonResult"
+      }
+    },
+    "passed": {
+      "type": "boolean"
+    }
+  },
+  "$defs": {
+    "tool": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "name",
+        "version"
+      ],
+      "properties": {
+        "name": {
+          "type": "string"
+        },
+        "version": {
+          "type": "string"
+        }
+      }
+    },
+    "environment": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "runtime",
+        "runtimeVersion",
+        "os",
+        "cpuCount"
+      ],
+      "properties": {
+        "runtime": {
+          "type": "string"
+        },
+        "runtimeVersion": {
+          "type": "string"
+        },
+        "os": {
+          "type": "string"
+        },
+        "cpuCount": {
+          "type": "integer",
+          "minimum": 0
+        }
+      }
+    },
+    "suite": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "configHash"
+      ],
+      "properties": {
+        "name": {
+          "type": "string"
+        },
+        "configHash": {
+          "type": "string"
+        }
+      }
+    },
+    "benchmarkReport": {
+      "type": "object",
+      "additionalProperties": true,
+      "required": [
+        "$schema",
+        "schemaVersion",
+        "tool",
+        "environment",
+        "target",
+        "suite",
+        "passed",
+        "scenarios"
+      ],
+      "properties": {
+        "$schema": {
+          "const": "https://json-schema.fedify.dev/bench/report-v3.json"
+        },
+        "schemaVersion": {
+          "const": 3
+        },
+        "tool": {
+          "$ref": "#/$defs/tool"
+        },
+        "environment": {
+          "$ref": "#/$defs/environment"
+        },
+        "suite": {
+          "$ref": "#/$defs/suite"
+        },
+        "passed": {
+          "type": "boolean"
+        },
+        "scenarios": {
+          "type": "array"
+        }
+      }
+    },
+    "compareSide": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "ref",
+        "report"
+      ],
+      "properties": {
+        "ref": {
+          "type": "string"
+        },
+        "report": {
+          "$ref": "#/$defs/benchmarkReport"
+        }
+      }
+    },
+    "comparisonResult": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "scenario",
+        "metric",
+        "direction",
+        "base",
+        "head",
+        "regression",
+        "noiseBand",
+        "allowedRegression",
+        "pass"
+      ],
+      "properties": {
+        "scenario": {
+          "type": "string"
+        },
+        "metric": {
+          "type": "string"
+        },
+        "direction": {
+          "enum": [
+            "lower-is-better",
+            "higher-is-better"
+          ]
+        },
+        "base": {
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "head": {
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "regression": {
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "noiseBand": {
+          "type": "number",
+          "minimum": 0
+        },
+        "allowedRegression": {
+          "type": "number",
+          "minimum": 0
+        },
+        "pass": {
+          "type": "boolean"
+        }
+      }
+    }
+  }
+}
diff --git a/schema/bench/report-v3.json b/schema/bench/report-v3.json
new file mode 100644
index 000000000..ecb644cbc
--- /dev/null
+++ b/schema/bench/report-v3.json
@@ -0,0 +1,584 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://json-schema.fedify.dev/bench/report-v3.json",
+  "title": "Fedify benchmark report",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "schemaVersion",
+    "tool",
+    "environment",
+    "target",
+    "startedAt",
+    "finishedAt",
+    "suite",
+    "passed",
+    "scenarios"
+  ],
+  "properties": {
+    "$schema": {
+      "type": "string"
+    },
+    "schemaVersion": {
+      "const": 3
+    },
+    "tool": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "name",
+        "version"
+      ],
+      "properties": {
+        "name": {
+          "type": "string"
+        },
+        "version": {
+          "type": "string"
+        }
+      }
+    },
+    "environment": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "runtime",
+        "runtimeVersion",
+        "os",
+        "cpuCount"
+      ],
+      "properties": {
+        "runtime": {
+          "type": "string"
+        },
+        "runtimeVersion": {
+          "type": "string"
+        },
+        "os": {
+          "type": "string"
+        },
+        "cpuCount": {
+          "type": "integer",
+          "minimum": 0
+        }
+      }
+    },
+    "target": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "url",
+        "statsAvailable"
+      ],
+      "properties": {
+        "url": {
+          "type": "string"
+        },
+        "fedifyVersion": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "statsAvailable": {
+          "type": "boolean"
+        }
+      }
+    },
+    "startedAt": {
+      "type": "string"
+    },
+    "finishedAt": {
+      "type": "string"
+    },
+    "suite": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "configHash"
+      ],
+      "properties": {
+        "name": {
+          "type": "string"
+        },
+        "configHash": {
+          "type": "string"
+        }
+      }
+    },
+    "passed": {
+      "type": "boolean"
+    },
+    "scenarios": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/scenarioResult"
+      }
+    }
+  },
+  "$defs": {
+    "latencyMs": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "p50",
+        "p95",
+        "p99",
+        "mean",
+        "max"
+      ],
+      "properties": {
+        "p50": {
+          "type": "number"
+        },
+        "p95": {
+          "type": "number"
+        },
+        "p99": {
+          "type": "number"
+        },
+        "mean": {
+          "type": "number"
+        },
+        "max": {
+          "type": "number"
+        }
+      }
+    },
+    "partialLatencyMs": {
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "p50": {
+          "type": "number"
+        },
+        "p95": {
+          "type": "number"
+        },
+        "p99": {
+          "type": "number"
+        }
+      }
+    },
+    "loadSummary": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "model",
+        "durationMs",
+        "warmupMs"
+      ],
+      "properties": {
+        "model": {
+          "enum": [
+            "open",
+            "closed"
+          ]
+        },
+        "ratePerSec": {
+          "type": "number"
+        },
+        "arrival": {
+          "type": "string"
+        },
+        "concurrency": {
+          "type": "integer"
+        },
+        "durationMs": {
+          "type": "number"
+        },
+        "warmupMs": {
+          "type": "number"
+        },
+        "maxInFlight": {
+          "type": "integer"
+        }
+      },
+      "oneOf": [
+        {
+          "properties": {
+            "model": {
+              "const": "open"
+            }
+          },
+          "required": [
+            "ratePerSec",
+            "arrival"
+          ],
+          "not": {
+            "required": [
+              "concurrency"
+            ]
+          }
+        },
+        {
+          "properties": {
+            "model": {
+              "const": "closed"
+            }
+          },
+          "required": [
+            "concurrency"
+          ],
+          "not": {
+            "anyOf": [
+              {
+                "required": [
+                  "ratePerSec"
+                ]
+              },
+              {
+                "required": [
+                  "arrival"
+                ]
+              }
+            ]
+          }
+        }
+      ]
+    },
+    "requestSummary": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "total",
+        "ok",
+        "failed",
+        "successRate"
+      ],
+      "properties": {
+        "total": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "ok": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "failed": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "successRate": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1
+        }
+      }
+    },
+    "clientMetrics": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "latencyMs"
+      ],
+      "properties": {
+        "latencyMs": {
+          "$ref": "#/$defs/latencyMs"
+        }
+      }
+    },
+    "serverMetrics": {
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "signatureVerificationMs": {
+          "type": "object",
+          "additionalProperties": false,
+          "required": [
+            "overall"
+          ],
+          "properties": {
+            "overall": {
+              "$ref": "#/$defs/partialLatencyMs"
+            },
+            "byStandard": {
+              "type": "object",
+              "additionalProperties": {
+                "$ref": "#/$defs/partialLatencyMs"
+              }
+            }
+          }
+        },
+        "queue": {
+          "type": "object",
+          "additionalProperties": false,
+          "properties": {
+            "drainMs": {
+              "$ref": "#/$defs/partialLatencyMs"
+            },
+            "depthMax": {
+              "type": "number"
+            }
+          }
+        }
+      }
+    },
+    "errorBucket": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "kind",
+        "reason",
+        "count"
+      ],
+      "properties": {
+        "kind": {
+          "type": "string"
+        },
+        "status": {
+          "type": "integer"
+        },
+        "reason": {
+          "type": "string"
+        },
+        "count": {
+          "type": "integer",
+          "minimum": 0
+        }
+      }
+    },
+    "expectResult": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "metric",
+        "op",
+        "threshold",
+        "unit",
+        "actual",
+        "severity",
+        "pass"
+      ],
+      "properties": {
+        "metric": {
+          "type": "string"
+        },
+        "op": {
+          "enum": [
+            "lt",
+            "lte",
+            "gt",
+            "gte",
+            "eq"
+          ]
+        },
+        "threshold": {
+          "type": "number"
+        },
+        "unit": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "actual": {
+          "type": [
+            "number",
+            "null"
+          ]
+        },
+        "severity": {
+          "enum": [
+            "warn",
+            "fail"
+          ]
+        },
+        "pass": {
+          "type": "boolean"
+        }
+      }
+    },
+    "scenarioResult": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "name",
+        "type",
+        "load",
+        "requests",
+        "throughputPerSec",
+        "client",
+        "server",
+        "errors",
+        "expectations",
+        "passed",
+        "runCount"
+      ],
+      "properties": {
+        "name": {
+          "type": "string"
+        },
+        "type": {
+          "enum": [
+            "inbox",
+            "webfinger",
+            "actor",
+            "object",
+            "fanout",
+            "collection",
+            "failure",
+            "mixed"
+          ]
+        },
+        "load": {
+          "$ref": "#/$defs/loadSummary"
+        },
+        "requests": {
+          "$ref": "#/$defs/requestSummary"
+        },
+        "throughputPerSec": {
+          "type": "number"
+        },
+        "client": {
+          "$ref": "#/$defs/clientMetrics"
+        },
+        "server": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/serverMetrics"
+            },
+            {
+              "type": "null"
+            }
+          ]
+        },
+        "errors": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/errorBucket"
+          }
+        },
+        "expectations": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/expectResult"
+          }
+        },
+        "passed": {
+          "type": "boolean"
+        },
+        "histogram": {
+          "$ref": "#/$defs/serializedHistogram"
+        },
+        "deliveryThroughputPerSec": {
+          "type": "number"
+        },
+        "runCount": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "runs": {
+          "type": "array",
+          "minItems": 2,
+          "items": {
+            "$ref": "#/$defs/scenarioRunResult"
+          }
+        }
+      }
+    },
+    "serializedHistogram": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "version",
+        "subBucketCount",
+        "count",
+        "zeroCount",
+        "min",
+        "max",
+        "sum",
+        "indices",
+        "counts"
+      ],
+      "properties": {
+        "version": {
+          "const": 1
+        },
+        "subBucketCount": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "count": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "zeroCount": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "min": {
+          "type": "number"
+        },
+        "max": {
+          "type": "number"
+        },
+        "sum": {
+          "type": "number"
+        },
+        "indices": {
+          "type": "array",
+          "items": {
+            "type": "integer"
+          }
+        },
+        "counts": {
+          "type": "array",
+          "items": {
+            "type": "integer",
+            "minimum": 0
+          }
+        }
+      }
+    },
+    "scenarioRunResult": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "run",
+        "requests",
+        "throughputPerSec",
+        "client",
+        "server",
+        "errors"
+      ],
+      "properties": {
+        "run": {
+          "type": "integer",
+          "minimum": 1
+        },
+        "requests": {
+          "$ref": "#/$defs/requestSummary"
+        },
+        "throughputPerSec": {
+          "type": "number"
+        },
+        "deliveryThroughputPerSec": {
+          "type": "number"
+        },
+        "client": {
+          "$ref": "#/$defs/clientMetrics"
+        },
+        "server": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/serverMetrics"
+            },
+            {
+              "type": "null"
+            }
+          ]
+        },
+        "errors": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/errorBucket"
+          }
+        },
+        "histogram": {
+          "$ref": "#/$defs/serializedHistogram"
+        }
+      }
+    }
+  }
+}

From 55a91d4964f407901930a5297961428e9d822c80 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 11:37:15 +0900
Subject: [PATCH 02/19] Document benchmark comparison gates

Explain repeated benchmark runs, the fedify bench compare workflow, and the
CI versus controlled-runner profile split.  Record the CLI changelog entry
for the comparison command and the report schema updates.

https://github.com/fedify-dev/fedify/issues/744
https://github.com/fedify-dev/fedify/issues/786

Assisted-by: Codex:gpt-5.5
---
 CHANGES.md                  | 10 ++++
 docs/manual/benchmarking.md | 99 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 04d9bc57b..29cdb31ef 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -315,9 +315,19 @@ To be released.
     that keep `triggerSinks` allowlisting enabled.  This change is published
     as benchmark scenario schema version 2.  [[#744], [#785], [#801], [#802]]
 
+ -  Added `fedify bench compare` for CI-friendly performance regression gates.
+    The command checks out base and head refs into temporary worktrees, starts
+    the benchmark target for each ref, runs the same suite, and fails when the
+    head regresses beyond `--max-regression` plus the measured per-run noise
+    band.  Benchmark scenarios now run three times by default and aggregate
+    repeated runs with median latency/throughput and pessimistic correctness
+    results.  This change is published as benchmark report schema version 3
+    and comparison report schema version 1.  [[#744], [#786]]
+
 [#783]: https://github.com/fedify-dev/fedify/issues/783
 [#784]: https://github.com/fedify-dev/fedify/issues/784
 [#785]: https://github.com/fedify-dev/fedify/issues/785
+[#786]: https://github.com/fedify-dev/fedify/issues/786
 [#801]: https://github.com/fedify-dev/fedify/pull/801
 [#802]: https://github.com/fedify-dev/fedify/pull/802
 
diff --git a/docs/manual/benchmarking.md b/docs/manual/benchmarking.md
index d20b94e0f..f792f9717 100644
--- a/docs/manual/benchmarking.md
+++ b/docs/manual/benchmarking.md
@@ -100,7 +100,6 @@ crypto cost is real.
 > types, a few options the format accepts are also not implemented yet and are
 > rejected up front with a clear message:
 >
->  -  `runs` greater than `1` (repeated runs).
 >  -  An `inbox` `activity` that is not a `Create` carrying an embedded `Note`;
 >     that is, a non-`Create` `type`, a non-`Note` `object.type`, or
 >     `embedObject: false`.
@@ -262,6 +261,29 @@ Signing is kept off the send critical path, set per scenario with `signing`:
     (open-loop only; Poisson arrivals may still sign a few extra during the
     run).
 
+### Repeated runs
+
+Each scenario runs three times by default.  Set `runs` in `defaults` to change
+the whole suite, or set `runs` on one scenario to override the default for that
+scenario:
+
+~~~~ yaml
+defaults:
+  runs: 5
+scenarios:
+- name: ci-smoke
+  type: webfinger
+  runs: 1
+  recipient: acct:alice@localhost
+~~~~
+
+Repeated runs are aggregated for stable CI gates.  Latency and throughput
+metrics take the median across runs, request totals and error buckets are
+summed, queue depth uses the worst observed maximum, and `successRate` uses
+the worst run so one bad run is not hidden by clean neighbors.  The JSON
+report records `runCount` for every scenario and includes per-run
+measurements in `runs` when the scenario ran more than once.
+
 ### Output
 
 Choose the format with `--format text` (default), `json`, or `markdown`;
@@ -288,7 +310,80 @@ CI check.  Keep CI gates on robust signals such as success rate, error counts,
 and gross throughput or latency floors; precise latency-percentile regression
 belongs in a controlled environment, not a shared CI runner.
 
-[report schema]: https://json-schema.fedify.dev/bench/report-v2.json
+[report schema]: https://json-schema.fedify.dev/bench/report-v3.json
+
+### Comparing two revisions
+
+Use `fedify bench compare` when a CI job should compare a change against a base
+revision on the same runner instead of relying on an absolute threshold:
+
+~~~~ sh
+fedify bench compare \
+  --base origin/main \
+  --head HEAD \
+  --file scenario.yaml \
+  --start-command "pnpm dev" \
+  --ready-url http://127.0.0.1:3000/health \
+  --max-regression 15%
+~~~~
+
+The command creates temporary detached worktrees for the base and head refs,
+starts the target command inside each worktree, waits for `--ready-url`, then
+runs the same suite from the current checkout against that target.  The two
+targets run sequentially, so they can use the same port.  Dependencies are not
+installed automatically; either prepare both refs in the job before comparing
+or make `--start-command` perform the needed build/start steps.
+
+If `--target` is omitted, the benchmark target defaults to the origin of
+`--ready-url`.  Pass `--target` when readiness and benchmark traffic use
+different URLs.  The comparison report can be written as text, JSON, or
+Markdown with the same `--format` and `--output` options as an ordinary
+benchmark run; the JSON form validates against the [comparison report schema].
+
+`--max-regression` accepts either a ratio such as `0.15` or a percentage such
+as `15%`.  For each scenario, `fedify bench compare` compares performance
+metrics from the scenario's `expect` block when they are latency or rate
+metrics; if no such metric is present, it compares `latency.p95` and
+`throughputPerSec`.  A head result passes when the measured regression is
+within `--max-regression` plus the observed per-run noise band.  The command
+exits with status 1 when the head run fails its own `expect` gate or a
+comparison exceeds that allowance; configuration and orchestration failures
+exit with status 2.
+
+Use short, broad suites in shared CI:
+
+~~~~ yaml
+defaults:
+  runs: 3
+  duration: 20s
+  warmup: 5s
+scenarios:
+- name: inbox-ci
+  type: inbox
+  # ...
+  expect:
+    successRate: ">= 99%"
+    latency.p95: "< 500ms"
+~~~~
+
+Use a controlled performance runner for narrower regression checks:
+
+~~~~ yaml
+defaults:
+  runs: 7
+  duration: 2m
+  warmup: 20s
+scenarios:
+- name: inbox-lab
+  type: inbox
+  # ...
+  expect:
+    successRate: ">= 99.9%"
+    latency.p95: "< 120ms"
+    throughputPerSec: "> 250/s"
+~~~~
+
+[comparison report schema]: https://json-schema.fedify.dev/bench/compare-report-v1.json
 
 ### Safety
 

From a4b2a5ad3ff0ef8a9f4da25b79857947407ad5e3 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 12:02:09 +0900
Subject: [PATCH 03/19] Harden benchmark comparison runtime

Keep benchmark target output away from stdout, enforce readiness timeouts,
terminate Windows target process trees, and avoid infinite noise allowances in
comparison reports.

https://github.com/fedify-dev/fedify/issues/744
https://github.com/fedify-dev/fedify/issues/786

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 152 ++++++++++++++++++++++++
 packages/cli/src/bench/compare.ts      | 153 ++++++++++++++++++++++---
 2 files changed, 289 insertions(+), 16 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 428373aa6..e30eb0c22 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -1,13 +1,27 @@
 import assert from "node:assert/strict";
+import { Buffer } from "node:buffer";
+import type { ChildProcess, SpawnOptions } from "node:child_process";
+import { EventEmitter } from "node:events";
 import test from "node:test";
 import type { BenchCompareCommand } from "./command.ts";
 import {
   buildCompareReport,
   parseRegressionTolerance,
   runBenchCompare,
+  startBenchmarkTarget,
+  stopTargetProcess,
+  waitReadyUrl,
 } from "./compare.ts";
 import type { BenchReport, ScenarioResult } from "./result/model.ts";
 
+type FakeChildProcess = ChildProcess & {
+  readonly stdout: EventEmitter;
+  readonly stderr: EventEmitter;
+  readonly exitCode: number | null;
+  readonly signalCode: NodeJS.Signals | null;
+  kill(signal?: NodeJS.Signals | number): boolean;
+};
+
 function scenario(
   overrides: Partial<ScenarioResult> & { name?: string } = {},
 ): ScenarioResult {
@@ -108,6 +122,22 @@ function command(overrides: Partial<BenchCompareCommand>): BenchCompareCommand {
   };
 }
 
+function fakeChildProcess(pid = 1234): FakeChildProcess {
+  const child = new EventEmitter() as FakeChildProcess;
+  Object.defineProperties(child, {
+    pid: { value: pid, configurable: true },
+    stdout: { value: new EventEmitter(), configurable: true },
+    stderr: { value: new EventEmitter(), configurable: true },
+    exitCode: { value: null, configurable: true },
+    signalCode: { value: null, configurable: true },
+  });
+  child.kill = (signal?: NodeJS.Signals | number) => {
+    child.emit("exit", null, signal);
+    return true;
+  };
+  return child;
+}
+
 test("parseRegressionTolerance - parses percentages", () => {
   assert.strictEqual(parseRegressionTolerance("15%"), 0.15);
   assert.strictEqual(parseRegressionTolerance("0.2"), 0.2);
@@ -175,6 +205,128 @@ test("buildCompareReport - fails regressions outside tolerance and noise", () =>
   assert.strictEqual(compare.passed, false);
 });
 
+test("buildCompareReport - keeps zero-median noise finite", () => {
+  const base = report([
+    scenario({
+      client: {
+        latencyMs: { p50: 0, p95: 100, p99: 100, mean: 50, max: 100 },
+      },
+      runs: [
+        runResult(0, 100),
+        runResult(0, 100),
+        runResult(100, 100),
+      ],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      client: {
+        latencyMs: { p50: 0, p95: 120, p99: 120, mean: 60, max: 120 },
+      },
+      runs: [
+        runResult(0, 100),
+        runResult(0, 100),
+        runResult(120, 100),
+      ],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.1,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  const latency = compare.comparisons.find((c) => c.metric === "latency.p95");
+  assert.ok(latency);
+  assert.strictEqual(latency.noiseBand, 0);
+  assert.strictEqual(latency.allowedRegression, 0.1);
+  assert.strictEqual(latency.pass, false);
+  assert.strictEqual(
+    JSON.parse(JSON.stringify(compare)).comparisons[0].noiseBand,
+    0,
+  );
+});
+
+test("startBenchmarkTarget - keeps target stdout off stdout", async () => {
+  let options: SpawnOptions | undefined;
+  const child = fakeChildProcess();
+  let stderr = "";
+  const target = startBenchmarkTarget("/tmp/base", "pnpm dev", {
+    platform: "linux",
+    stderr: {
+      write: (chunk) => {
+        stderr += Buffer.isBuffer(chunk) ? chunk.toString("utf-8") : chunk;
+        return true;
+      },
+    },
+    spawn: (command, spawnOptions) => {
+      assert.strictEqual(command, "pnpm dev");
+      options = spawnOptions;
+      return child;
+    },
+  });
+  assert.deepEqual(options?.stdio, ["ignore", "pipe", "pipe"]);
+  child.stdout.emit("data", Buffer.from("stdout log\n"));
+  child.stderr.emit("data", "stderr log\n");
+  assert.strictEqual(stderr, "stdout log\nstderr log\n");
+  await target.stop();
+});
+
+test("stopTargetProcess - kills the Windows process tree", async () => {
+  const child = fakeChildProcess(4321);
+  const kills: Array<[number, NodeJS.Signals]> = [];
+  await stopTargetProcess(child, {
+    platform: "win32",
+    killWindowsProcessTree: (pid, signal) => {
+      kills.push([pid, signal]);
+      child.emit("exit", null, signal);
+    },
+  });
+  assert.deepEqual(kills, [[4321, "SIGTERM"]]);
+});
+
+test("waitReadyUrl - does not wait for streaming response bodies", async () => {
+  let calls = 0;
+  await waitReadyUrl(new URL("http://ready.test/health"), 100, {
+    fetch: () => {
+      calls++;
+      return Promise.resolve(
+        new Response(
+          new ReadableStream({
+            start(controller) {
+              controller.enqueue(new Uint8Array([1]));
+            },
+          }),
+          { status: 200 },
+        ),
+      );
+    },
+  });
+  assert.strictEqual(calls, 1);
+});
+
+test("waitReadyUrl - aborts a hanging fetch at the timeout", async () => {
+  const startedAt = Date.now();
+  await assert.rejects(
+    waitReadyUrl(new URL("http://ready.test/health"), 20, {
+      fetch: (_input, init) =>
+        new Promise<Response>((_resolve, reject) => {
+          init?.signal?.addEventListener(
+            "abort",
+            () => reject(new Error("aborted")),
+            { once: true },
+          );
+        }),
+      sleep: () => Promise.resolve(),
+    }),
+    /Timed out waiting/,
+  );
+  assert.ok(Date.now() - startedAt < 1000);
+});
+
 test("runBenchCompare - orchestrates worktrees and cleans up", async () => {
   const events: string[] = [];
   let code = -1;
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index da5390296..b13af0e1c 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -1,4 +1,8 @@
-import { type ChildProcess, spawn } from "node:child_process";
+import {
+  type ChildProcess,
+  spawn,
+  type SpawnOptions,
+} from "node:child_process";
 import { mkdtemp, rm } from "node:fs/promises";
 import { writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
@@ -88,6 +92,39 @@ export interface RunBenchInWorktreeInput {
   readonly target: string;
 }
 
+type ProcessOutput = {
+  write(chunk: string | Uint8Array): unknown;
+};
+
+type SpawnTarget = (
+  command: string,
+  options: SpawnOptions,
+) => ChildProcess;
+
+/** Options for starting a benchmark target. */
+export interface StartBenchmarkTargetOptions {
+  readonly platform?: NodeJS.Platform;
+  readonly spawn?: SpawnTarget;
+  readonly stderr?: ProcessOutput;
+}
+
+/** Options for stopping a benchmark target process. */
+export interface StopTargetProcessOptions {
+  readonly platform?: NodeJS.Platform;
+  readonly killWindowsProcessTree?: (
+    pid: number,
+    signal: NodeJS.Signals,
+  ) => void;
+  readonly killProcessGroup?: (pid: number, signal: NodeJS.Signals) => void;
+  readonly forceTimeoutMs?: number;
+}
+
+/** Dependencies for waiting until a benchmark target is ready. */
+export interface WaitReadyUrlDeps {
+  readonly fetch?: typeof fetch;
+  readonly sleep?: (ms: number) => Promise<void>;
+}
+
 /** Runs `fedify bench compare`. */
 export async function runBenchCompare(
   command: BenchCompareCommand,
@@ -392,7 +429,7 @@ function relativeNoise(scenario: ScenarioResult, metric: string): number {
   if (values.length < 2) return 0;
   const medianValue = median(values);
   if (medianValue <= 0) {
-    return Math.max(...values) === Math.min(...values) ? 0 : Infinity;
+    return 0;
   }
   return (Math.max(...values) - Math.min(...values)) / (2 * medianValue);
 }
@@ -548,63 +585,147 @@ function defaultStartTarget(
   cwd: string,
   startCommand: string,
 ): Promise<StartedTarget> {
-  const child = spawn(startCommand, {
+  return Promise.resolve(startBenchmarkTarget(cwd, startCommand));
+}
+
+/** Starts a benchmark target process. */
+export function startBenchmarkTarget(
+  cwd: string,
+  startCommand: string,
+  options: StartBenchmarkTargetOptions = {},
+): StartedTarget {
+  const platform = options.platform ?? process.platform;
+  const spawnTarget = options.spawn ?? spawn;
+  const stderr = options.stderr ?? process.stderr;
+  const child = spawnTarget(startCommand, {
     cwd,
-    detached: process.platform !== "win32",
+    detached: platform !== "win32",
     shell: true,
-    stdio: "inherit",
+    stdio: ["ignore", "pipe", "pipe"],
     env: process.env,
   });
-  return Promise.resolve({
-    stop: () => stopProcess(child),
+  forwardTargetOutput(child, stderr);
+  return { stop: () => stopTargetProcess(child, { platform }) };
+}
+
+function forwardTargetOutput(child: ChildProcess, stderr: ProcessOutput): void {
+  child.stdout?.on("data", (chunk: string | Uint8Array) => {
+    stderr.write(chunk);
+  });
+  child.stderr?.on("data", (chunk: string | Uint8Array) => {
+    stderr.write(chunk);
   });
 }
 
-function stopProcess(child: ChildProcess): Promise<void> {
+/** Stops a benchmark target process. */
+export function stopTargetProcess(
+  child: ChildProcess,
+  options: StopTargetProcessOptions = {},
+): Promise<void> {
+  const platform = options.platform ?? process.platform;
+  const killWindowsProcessTree = options.killWindowsProcessTree ??
+    defaultKillWindowsProcessTree;
+  const killProcessGroup = options.killProcessGroup ??
+    ((pid, signal) => process.kill(pid, signal));
+  const forceTimeoutMs = options.forceTimeoutMs ?? 5000;
   return new Promise((resolve) => {
     if (child.exitCode != null || child.signalCode != null) {
       resolve();
       return;
     }
     const timer = setTimeout(() => {
-      killTargetProcess(child, "SIGKILL");
-    }, 5000);
+      killTargetProcess(child, "SIGKILL", {
+        platform,
+        killWindowsProcessTree,
+        killProcessGroup,
+      });
+    }, forceTimeoutMs);
     child.once("exit", () => {
       clearTimeout(timer);
       resolve();
     });
-    killTargetProcess(child, "SIGTERM");
+    killTargetProcess(child, "SIGTERM", {
+      platform,
+      killWindowsProcessTree,
+      killProcessGroup,
+    });
   });
 }
 
+interface KillTargetProcessOptions {
+  readonly platform: NodeJS.Platform;
+  readonly killWindowsProcessTree: (
+    pid: number,
+    signal: NodeJS.Signals,
+  ) => void;
+  readonly killProcessGroup: (pid: number, signal: NodeJS.Signals) => void;
+}
+
 function killTargetProcess(
   child: ChildProcess,
   signal: NodeJS.Signals,
+  options: KillTargetProcessOptions,
 ): void {
-  if (child.pid == null || process.platform === "win32") {
+  if (child.pid == null) {
     child.kill(signal);
     return;
   }
+  if (options.platform === "win32") {
+    options.killWindowsProcessTree(child.pid, signal);
+    return;
+  }
   try {
-    process.kill(-child.pid, signal);
+    options.killProcessGroup(-child.pid, signal);
   } catch {
     child.kill(signal);
   }
 }
 
+function defaultKillWindowsProcessTree(
+  pid: number,
+  _signal: NodeJS.Signals,
+): void {
+  const child = spawn("taskkill", ["/pid", String(pid), "/T", "/F"], {
+    stdio: "ignore",
+    windowsHide: true,
+  });
+  child.on("error", () => {});
+}
+
 async function defaultWaitReady(url: URL, timeoutMs: number): Promise<void> {
+  return await waitReadyUrl(url, timeoutMs);
+}
+
+/** Waits until a benchmark target readiness URL responds successfully. */
+export async function waitReadyUrl(
+  url: URL,
+  timeoutMs: number,
+  deps: WaitReadyUrlDeps = {},
+): Promise<void> {
+  const fetchReady = deps.fetch ?? fetch;
+  const sleep = deps.sleep ??
+    ((ms) => new Promise<void>((resolve) => setTimeout(resolve, ms)));
   const deadline = Date.now() + timeoutMs;
   let lastError: unknown;
   while (Date.now() <= deadline) {
+    const remainingMs = deadline - Date.now();
+    if (remainingMs <= 0) break;
+    const controller = new AbortController();
+    const timer = setTimeout(() => {
+      controller.abort(new Error(`ready URL timed out after ${timeoutMs}ms`));
+    }, remainingMs);
     try {
-      const response = await fetch(url);
-      await response.arrayBuffer().catch(() => {});
+      const response = await fetchReady(url, { signal: controller.signal });
+      void response.body?.cancel().catch(() => {});
       if (response.status >= 200 && response.status < 400) return;
       lastError = new Error(`ready URL returned ${response.status}`);
     } catch (error) {
       lastError = error;
+    } finally {
+      clearTimeout(timer);
     }
-    await new Promise((resolve) => setTimeout(resolve, 250));
+    const delayMs = Math.min(250, deadline - Date.now());
+    if (delayMs > 0) await sleep(delayMs);
   }
   throw new Error(
     `Timed out waiting for ${url.href}: ${describeError(lastError)}.`,

From 675e939698a9177ff8bed29c2b537c23f48bece7 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 12:33:54 +0900
Subject: [PATCH 04/19] Preserve missing repeated server metrics

Treat missing server-side measurements in any repeated benchmark run as missing
from the aggregate so signature and queue expectations cannot pass on a partial
subset of runs.

https://github.com/fedify-dev/fedify/issues/744
https://github.com/fedify-dev/fedify/issues/786

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/result/build.test.ts | 28 +++++++++++++++++++++
 packages/cli/src/bench/result/build.ts      | 18 ++++++-------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/packages/cli/src/bench/result/build.test.ts b/packages/cli/src/bench/result/build.test.ts
index 3ed1d2ec3..ed5d7fdfe 100644
--- a/packages/cli/src/bench/result/build.test.ts
+++ b/packages/cli/src/bench/result/build.test.ts
@@ -129,6 +129,34 @@ test("buildScenarioResult - aggregates repeated runs for CI gates", () => {
   assert.strictEqual(result.passed, false);
 });
 
+test("buildScenarioResult - fails repeated server gates with missing stats", () => {
+  const scenario = normalizeSuite({
+    version: 1,
+    target: "http://localhost:3000",
+    defaults: {
+      load: { concurrency: 50 },
+      duration: "60s",
+      warmup: "10s",
+      runs: 3,
+    },
+    scenarios: [{
+      name: "inbox-shared",
+      type: "inbox",
+      recipient: "acct:a@x",
+      expect: { "signatureVerification.p95": "< 20ms" },
+    }],
+  }).scenarios[0];
+  const result = buildScenarioResult(scenario, [
+    measurement(),
+    { ...measurement(), server: null },
+    measurement(),
+  ]);
+  assert.strictEqual(result.server, null);
+  assert.strictEqual(result.expectations[0].actual, null);
+  assert.strictEqual(result.expectations[0].pass, false);
+  assert.strictEqual(result.passed, false);
+});
+
 test("buildReport - gate passes only when all scenarios pass", () => {
   const ok = buildScenarioResult(resolvedInbox(), measurement());
   const bad = buildScenarioResult(resolvedInbox(), {
diff --git a/packages/cli/src/bench/result/build.ts b/packages/cli/src/bench/result/build.ts
index ef4ffe394..d3ba4bd8a 100644
--- a/packages/cli/src/bench/result/build.ts
+++ b/packages/cli/src/bench/result/build.ts
@@ -175,7 +175,7 @@ function aggregateServer(
   servers: readonly (ServerMetrics | null)[],
 ): ServerMetrics | null {
   const present = servers.filter((s): s is ServerMetrics => s != null);
-  if (present.length < 1) return null;
+  if (present.length !== servers.length) return null;
   const signature = aggregateSignatureVerification(present);
   const queue = aggregateQueue(present);
   return {
@@ -192,7 +192,7 @@ function aggregateSignatureVerification(
     .filter((s): s is NonNullable<ServerMetrics["signatureVerificationMs"]> =>
       s != null
     );
-  if (values.length < 1) return null;
+  if (values.length !== servers.length) return null;
   const standards = new Set<string>();
   for (const value of values) {
     for (const key of Object.keys(value.byStandard ?? {})) standards.add(key);
@@ -215,12 +215,12 @@ function aggregateQueue(
   const values = servers
     .map((s) => s.queue)
     .filter((q): q is NonNullable<ServerMetrics["queue"]> => q != null);
-  if (values.length < 1) return null;
+  if (values.length !== servers.length) return null;
   const drainMs = aggregatePartial(values.map((v) => v.drainMs));
-  const depths = values.map((v) => v.depthMax).filter(isNumber);
+  const depths = values.map((v) => v.depthMax);
   return {
     ...(hasPartial(drainMs) ? { drainMs } : {}),
-    ...(depths.length < 1 ? {} : { depthMax: Math.max(...depths) }),
+    ...(depths.every(isNumber) ? { depthMax: Math.max(...depths) } : {}),
   };
 }
 
@@ -243,10 +243,10 @@ function partialField(
     readonly ({ readonly [key: string]: number | undefined } | undefined)[],
   key: "p50" | "p95" | "p99",
 ): Record<typeof key, number> | Record<string, never> {
-  const fieldValues = values.map((v) => v?.[key]).filter(isNumber);
-  return fieldValues.length < 1
-    ? {}
-    : { [key]: median(fieldValues) } as Record<typeof key, number>;
+  const fieldValues = values.map((v) => v?.[key]);
+  return fieldValues.every(isNumber)
+    ? { [key]: median(fieldValues) } as Record<typeof key, number>
+    : {};
 }
 
 function hasPartial(value: {

From 711d42a892b63db90b66a327ce129cef6612abfe Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 14:15:59 +0900
Subject: [PATCH 05/19] Correct benchmark dry-run plans and comparisons

Show repeated run counts, total duration, and open-loop request volume in
benchmark dry-run plans.  Compare benchmark reports by scenario position plus
name and type so duplicate scenario names cannot be matched to the wrong base
result.

https://github.com/fedify-dev/fedify/issues/744
https://github.com/fedify-dev/fedify/issues/786

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/action.test.ts  | 39 ++++++++++++++++
 packages/cli/src/bench/action.ts       | 26 ++++++++++-
 packages/cli/src/bench/compare.test.ts | 64 ++++++++++++++++++++++++++
 packages/cli/src/bench/compare.ts      | 11 +++--
 4 files changed, 135 insertions(+), 5 deletions(-)

diff --git a/packages/cli/src/bench/action.test.ts b/packages/cli/src/bench/action.test.ts
index da4be911a..9fd584736 100644
--- a/packages/cli/src/bench/action.test.ts
+++ b/packages/cli/src/bench/action.test.ts
@@ -161,6 +161,8 @@ test("runBench - dry run prints a plan and sends nothing", async () => {
     });
     assert.strictEqual(code, 0);
     assert.match(output, /dry run/i);
+    assert.match(output, /runs 3/);
+    assert.match(output, /total duration 750ms/);
     assert.match(output, /\/inbox/);
     assert.match(output, /No benchmark load was sent/);
     const requests = target.requests();
@@ -171,6 +173,43 @@ test("runBench - dry run prints a plan and sends nothing", async () => {
   }
 });
 
+test("runBench - dry run includes repeated open-loop request volume", async () => {
+  const target = await spawnBenchmarkTarget();
+  try {
+    const file = await writeSuite(`version: 1
+target: ${target.url.href}
+scenarios:
+  - name: inbox-open
+    type: inbox
+    recipient: "http://\${{ target.host }}/users/alice"
+    inbox: shared
+    load: { rate: 2/s }
+    duration: 500ms
+`);
+    let code = -1;
+    let output = "";
+    await runBench(command({ scenario: file, dryRun: true }), {
+      exit: (c) => {
+        code = c;
+      },
+      writeOutput: (c) => {
+        output = c;
+        return Promise.resolve();
+      },
+      log: () => {},
+    });
+    assert.strictEqual(code, 0);
+    assert.match(output, /runs 3/);
+    assert.match(output, /total duration 1500ms/);
+    assert.match(output, /estimated scheduled requests 3/);
+    const requests = target.requests();
+    assert.ok(requests.some((r) => r.method === "GET"));
+    assert.ok(!requests.some((r) => r.method === "POST"));
+  } finally {
+    await target.close();
+  }
+});
+
 test("runBench - repeats a scenario according to runs", async () => {
   const file = await writeSuite(`version: 1
 target: http://127.0.0.1:3000
diff --git a/packages/cli/src/bench/action.ts b/packages/cli/src/bench/action.ts
index 936ff1c75..c6b5b606b 100644
--- a/packages/cli/src/bench/action.ts
+++ b/packages/cli/src/bench/action.ts
@@ -455,7 +455,31 @@ function describePlan(scenario: ResolvedScenario): string {
   const load = scenario.load.kind === "open"
     ? `open-loop ${scenario.load.ratePerSec}/s ${scenario.load.arrival}`
     : `closed-loop concurrency ${scenario.load.concurrency}`;
-  return `${load}, duration ${scenario.durationMs}ms, signing ${scenario.signing}`;
+  const totalDurationMs = scenario.durationMs * scenario.runs;
+  const volume = describePlannedRequestVolume(scenario);
+  return [
+    load,
+    `duration ${scenario.durationMs}ms`,
+    `runs ${scenario.runs}`,
+    `total duration ${totalDurationMs}ms`,
+    ...(volume == null ? [] : [volume]),
+    `signing ${scenario.signing}`,
+  ].join(", ");
+}
+
+function describePlannedRequestVolume(
+  scenario: ResolvedScenario,
+): string | null {
+  if (scenario.load.kind !== "open") return null;
+  const estimatedRequests = scenario.load.ratePerSec *
+    (scenario.durationMs / 1000) * scenario.runs;
+  return `estimated scheduled requests ${formatPlanNumber(estimatedRequests)}`;
+}
+
+function formatPlanNumber(value: number): string {
+  if (Number.isInteger(value)) return String(value);
+  const formatted = value.toFixed(2).replace(/\.?0+$/, "");
+  return formatted === "" ? "0" : formatted;
 }
 
 async function describeDiscoveryPlan(
diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index e30eb0c22..ce8ae2941 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -205,6 +205,70 @@ test("buildCompareReport - fails regressions outside tolerance and noise", () =>
   assert.strictEqual(compare.passed, false);
 });
 
+test("buildCompareReport - matches duplicate scenario names by position", () => {
+  const base = report([
+    scenario({
+      name: "duplicate",
+      client: {
+        latencyMs: { p50: 100, p95: 200, p99: 210, mean: 120, max: 220 },
+      },
+      runs: [
+        runResult(200, 100),
+        runResult(200, 100),
+        runResult(200, 100),
+      ],
+    }),
+    scenario({
+      name: "duplicate",
+      client: {
+        latencyMs: { p50: 50, p95: 100, p99: 110, mean: 60, max: 120 },
+      },
+      runs: [
+        runResult(100, 100),
+        runResult(100, 100),
+        runResult(100, 100),
+      ],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      name: "duplicate",
+      client: {
+        latencyMs: { p50: 115, p95: 230, p99: 240, mean: 130, max: 250 },
+      },
+      runs: [
+        runResult(230, 100),
+        runResult(230, 100),
+        runResult(230, 100),
+      ],
+    }),
+    scenario({
+      name: "duplicate",
+      client: {
+        latencyMs: { p50: 55, p95: 110, p99: 120, mean: 70, max: 130 },
+      },
+      runs: [
+        runResult(110, 100),
+        runResult(110, 100),
+        runResult(110, 100),
+      ],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.2,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.deepEqual(compare.comparisons.map((c) => c.base), [200, 100]);
+  assert.deepEqual(compare.comparisons.map((c) => c.head), [230, 110]);
+  assert.ok(compare.comparisons.every((c) => c.pass));
+  assert.strictEqual(compare.passed, true);
+});
+
 test("buildCompareReport - keeps zero-median noise finite", () => {
   const base = report([
     scenario({
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index b13af0e1c..97b0d9819 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -257,11 +257,14 @@ function compareReports(
   head: BenchReport,
   maxRegression: number,
 ): ComparisonResult[] {
-  const baseScenarios = new Map(base.scenarios.map((s) => [s.name, s]));
   const results: ComparisonResult[] = [];
-  for (const headScenario of head.scenarios) {
-    const baseScenario = baseScenarios.get(headScenario.name);
-    if (baseScenario == null || baseScenario.type !== headScenario.type) {
+  for (let index = 0; index < head.scenarios.length; index++) {
+    const headScenario = head.scenarios[index];
+    const baseScenario = base.scenarios[index];
+    if (
+      baseScenario == null || baseScenario.name !== headScenario.name ||
+      baseScenario.type !== headScenario.type
+    ) {
       results.push(missingScenario(headScenario.name, maxRegression));
       continue;
     }

From b18c937e5e5617563800fc155c97eaf06f7ecbb2 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 14:31:53 +0900
Subject: [PATCH 06/19] Add a PR link to the changelog

---
 CHANGES.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 29cdb31ef..79c9e77af 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -322,7 +322,7 @@ To be released.
     band.  Benchmark scenarios now run three times by default and aggregate
     repeated runs with median latency/throughput and pessimistic correctness
     results.  This change is published as benchmark report schema version 3
-    and comparison report schema version 1.  [[#744], [#786]]
+    and comparison report schema version 1.  [[#744], [#786], [#804]]
 
 [#783]: https://github.com/fedify-dev/fedify/issues/783
 [#784]: https://github.com/fedify-dev/fedify/issues/784
@@ -330,6 +330,7 @@ To be released.
 [#786]: https://github.com/fedify-dev/fedify/issues/786
 [#801]: https://github.com/fedify-dev/fedify/pull/801
 [#802]: https://github.com/fedify-dev/fedify/pull/802
+[#804]: https://github.com/fedify-dev/fedify/pull/804
 
 ### @fedify/fixture
 

From 662c265d414e7a70bbc2381668a4ebd822721da2 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 14:56:16 +0900
Subject: [PATCH 07/19] Tighten benchmark review edge cases

Bound target shutdown after SIGKILL, keep readiness timeout errors tied
back to the abort reason, and make malformed comparison metrics fail
instead of throwing.  Tighten benchmark report schemas so embedded
targets are typed and repeated reports must include per-run data.

Add parser and public-export coverage requested in review.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407495512
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407495514
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407495515
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407499819
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407499824
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407499826
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407499830
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407499832

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/command.test.ts   |  5 +++
 packages/cli/src/bench/compare.test.ts   | 46 ++++++++++++++++++++++++
 packages/cli/src/bench/compare.ts        | 36 +++++++++++++++----
 packages/cli/src/bench/compare/schema.ts | 11 ++++++
 packages/cli/src/bench/mod.ts            |  1 +
 packages/cli/src/bench/result/schema.ts  |  7 ++++
 packages/cli/src/bench/schema.test.ts    | 21 +++++++++++
 schema/bench/compare-report-v1.json      | 25 +++++++++++++
 schema/bench/report-v3.json              | 21 ++++++++++-
 9 files changed, 165 insertions(+), 8 deletions(-)

diff --git a/packages/cli/src/bench/command.test.ts b/packages/cli/src/bench/command.test.ts
index 6f2af0708..5d91f5698 100644
--- a/packages/cli/src/bench/command.test.ts
+++ b/packages/cli/src/bench/command.test.ts
@@ -119,6 +119,11 @@ test("benchCommand - compare mode requires refs", () => {
   assert.ok(!result.success);
 });
 
+test("benchCommand - bare compare subcommand fails", () => {
+  const result = parse(benchCommand, [COMMAND, "compare"]);
+  assert.ok(!result.success);
+});
+
 test("benchCommand - invalid format value fails", () => {
   const result = parse(benchCommand, [COMMAND, FILE, "--format", "xml"]);
   assert.ok(!result.success);
diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index ce8ae2941..b7568dbf3 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -314,6 +314,23 @@ test("buildCompareReport - keeps zero-median noise finite", () => {
   );
 });
 
+test("buildCompareReport - missing client metrics fail comparisons", () => {
+  const malformed = scenario() as unknown as Record<string, unknown>;
+  delete malformed.client;
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: report([scenario()]),
+    headReport: report([malformed as unknown as ScenarioResult]),
+    maxRegression: 0.1,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.strictEqual(compare.comparisons[0].head, null);
+  assert.strictEqual(compare.comparisons[0].pass, false);
+  assert.strictEqual(compare.passed, false);
+});
+
 test("startBenchmarkTarget - keeps target stdout off stdout", async () => {
   let options: SpawnOptions | undefined;
   const child = fakeChildProcess();
@@ -352,6 +369,18 @@ test("stopTargetProcess - kills the Windows process tree", async () => {
   assert.deepEqual(kills, [[4321, "SIGTERM"]]);
 });
 
+test("stopTargetProcess - rejects when forced kill does not exit", async () => {
+  const child = fakeChildProcess(4321);
+  child.kill = () => true;
+  await assert.rejects(
+    stopTargetProcess(child, {
+      forceTimeoutMs: 1,
+      forceKillTimeoutMs: 1,
+    }),
+    /did not exit/,
+  );
+});
+
 test("waitReadyUrl - does not wait for streaming response bodies", async () => {
   let calls = 0;
   await waitReadyUrl(new URL("http://ready.test/health"), 100, {
@@ -391,6 +420,23 @@ test("waitReadyUrl - aborts a hanging fetch at the timeout", async () => {
   assert.ok(Date.now() - startedAt < 1000);
 });
 
+test("waitReadyUrl - prefers abort reason over transport errors", async () => {
+  await assert.rejects(
+    waitReadyUrl(new URL("http://ready.test/health"), 20, {
+      fetch: (_input, init) =>
+        new Promise<Response>((_resolve, reject) => {
+          init?.signal?.addEventListener(
+            "abort",
+            () => reject(new TypeError("transport failure")),
+            { once: true },
+          );
+        }),
+      sleep: () => Promise.resolve(),
+    }),
+    /ready URL timed out after 20ms/,
+  );
+});
+
 test("runBenchCompare - orchestrates worktrees and cleans up", async () => {
   const events: string[] = [];
   let code = -1;
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 97b0d9819..38d0a68bf 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -117,6 +117,7 @@ export interface StopTargetProcessOptions {
   ) => void;
   readonly killProcessGroup?: (pid: number, signal: NodeJS.Signals) => void;
   readonly forceTimeoutMs?: number;
+  readonly forceKillTimeoutMs?: number;
 }
 
 /** Dependencies for waiting until a benchmark target is ready. */
@@ -353,7 +354,8 @@ function metricValue(
       return scenario.deliveryThroughputPerSec ?? null;
   }
   if (metric.startsWith("latency.")) {
-    return latencyValue(scenario.client.latencyMs, metric.slice(8));
+    const latency = scenario.client?.latencyMs;
+    return latency == null ? null : latencyValue(latency, metric.slice(8));
   }
   if (metric.startsWith("signatureVerification.")) {
     return partialValue(
@@ -631,22 +633,38 @@ export function stopTargetProcess(
   const killProcessGroup = options.killProcessGroup ??
     ((pid, signal) => process.kill(pid, signal));
   const forceTimeoutMs = options.forceTimeoutMs ?? 5000;
-  return new Promise((resolve) => {
+  const forceKillTimeoutMs = options.forceKillTimeoutMs ?? forceTimeoutMs;
+  return new Promise((resolve, reject) => {
     if (child.exitCode != null || child.signalCode != null) {
       resolve();
       return;
     }
-    const timer = setTimeout(() => {
+    let forceKillTimer: ReturnType<typeof setTimeout> | undefined;
+    const clearTimers = () => {
+      clearTimeout(forceTimer);
+      if (forceKillTimer != null) clearTimeout(forceKillTimer);
+    };
+    const onExit = () => {
+      clearTimers();
+      resolve();
+    };
+    const forceTimer = setTimeout(() => {
       killTargetProcess(child, "SIGKILL", {
         platform,
         killWindowsProcessTree,
         killProcessGroup,
       });
+      forceKillTimer = setTimeout(() => {
+        child.removeListener("exit", onExit);
+        reject(
+          new Error(
+            `Benchmark target process ${child.pid ?? "<unknown>"} ` +
+              "did not exit after SIGKILL.",
+          ),
+        );
+      }, forceKillTimeoutMs);
     }, forceTimeoutMs);
-    child.once("exit", () => {
-      clearTimeout(timer);
-      resolve();
-    });
+    child.once("exit", onExit);
     killTargetProcess(child, "SIGTERM", {
       platform,
       killWindowsProcessTree,
@@ -723,6 +741,10 @@ export async function waitReadyUrl(
       if (response.status >= 200 && response.status < 400) return;
       lastError = new Error(`ready URL returned ${response.status}`);
     } catch (error) {
+      if (controller.signal.aborted) {
+        lastError = controller.signal.reason ?? error;
+        break;
+      }
       lastError = error;
     } finally {
       clearTimeout(timer);
diff --git a/packages/cli/src/bench/compare/schema.ts b/packages/cli/src/bench/compare/schema.ts
index b27f4a57d..3b65d6f74 100644
--- a/packages/cli/src/bench/compare/schema.ts
+++ b/packages/cli/src/bench/compare/schema.ts
@@ -100,11 +100,22 @@ export const compareReportSchemaV1 = {
         schemaVersion: { const: 3 },
         tool: { $ref: "#/$defs/tool" },
         environment: { $ref: "#/$defs/environment" },
+        target: { $ref: "#/$defs/target" },
         suite: { $ref: "#/$defs/suite" },
         passed: { type: "boolean" },
         scenarios: { type: "array" },
       },
     },
+    target: {
+      type: "object",
+      additionalProperties: false,
+      required: ["url", "statsAvailable"],
+      properties: {
+        url: { type: "string" },
+        fedifyVersion: { type: ["string", "null"] },
+        statsAvailable: { type: "boolean" },
+      },
+    },
     compareSide: {
       type: "object",
       additionalProperties: false,
diff --git a/packages/cli/src/bench/mod.ts b/packages/cli/src/bench/mod.ts
index e263d2c64..7d91a54fc 100644
--- a/packages/cli/src/bench/mod.ts
+++ b/packages/cli/src/bench/mod.ts
@@ -4,6 +4,7 @@ import type { BenchCommand } from "./command.ts";
 
 export { benchCommand } from "./command.ts";
 
+/** Runs a parsed benchmark command. */
 export function runBench(command: BenchCommand): Promise<void> {
   return command.mode === "compare"
     ? runBenchCompare(command)
diff --git a/packages/cli/src/bench/result/schema.ts b/packages/cli/src/bench/result/schema.ts
index f35178dfc..a0cdcd613 100644
--- a/packages/cli/src/bench/result/schema.ts
+++ b/packages/cli/src/bench/result/schema.ts
@@ -365,6 +365,13 @@ export const reportSchemaV3 = {
           items: { $ref: "#/$defs/scenarioRunResult" },
         },
       },
+      allOf: [{
+        if: {
+          required: ["runCount"],
+          properties: { runCount: { minimum: 2 } },
+        },
+        then: { required: ["runs"] },
+      }],
     },
   },
 } as const;
diff --git a/packages/cli/src/bench/schema.test.ts b/packages/cli/src/bench/schema.test.ts
index c8a7f5ea3..b73677c6d 100644
--- a/packages/cli/src/bench/schema.test.ts
+++ b/packages/cli/src/bench/schema.test.ts
@@ -99,6 +99,27 @@ for (const group of FIXTURE_GROUPS) {
   }
 }
 
+test("schema guard - report v3 requires runs for repeated scenarios", () => {
+  const file = join(FIXTURES, "reports", "inbox-report.json");
+  const report = parseSuiteText(readFileSync(file, "utf-8")) as {
+    scenarios: Array<Record<string, unknown>>;
+  };
+  report.scenarios[0].runCount = 2;
+  delete report.scenarios[0].runs;
+  const result = validators.get("report")!.validate(report);
+  assert.strictEqual(result.valid, false);
+});
+
+test("schema guard - compare report types embedded targets", () => {
+  const file = join(FIXTURES, "compare-reports", "basic.json");
+  const report = parseSuiteText(readFileSync(file, "utf-8")) as {
+    base: { report: Record<string, unknown> };
+  };
+  report.base.report.target = null;
+  const result = validators.get("compare-report")!.validate(report);
+  assert.strictEqual(result.valid, false);
+});
+
 // Guard 3: drift between embedded schema and the published file.
 for (const { name, fileName, schema } of PUBLISHED_SCHEMAS) {
   test(`schema guard - ${name} embedded schema matches published file`, () => {
diff --git a/schema/bench/compare-report-v1.json b/schema/bench/compare-report-v1.json
index 56a24efa6..108fe1537 100644
--- a/schema/bench/compare-report-v1.json
+++ b/schema/bench/compare-report-v1.json
@@ -142,6 +142,9 @@
         "environment": {
           "$ref": "#/$defs/environment"
         },
+        "target": {
+          "$ref": "#/$defs/target"
+        },
         "suite": {
           "$ref": "#/$defs/suite"
         },
@@ -153,6 +156,28 @@
         }
       }
     },
+    "target": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "url",
+        "statsAvailable"
+      ],
+      "properties": {
+        "url": {
+          "type": "string"
+        },
+        "fedifyVersion": {
+          "type": [
+            "string",
+            "null"
+          ]
+        },
+        "statsAvailable": {
+          "type": "boolean"
+        }
+      }
+    },
     "compareSide": {
       "type": "object",
       "additionalProperties": false,
diff --git a/schema/bench/report-v3.json b/schema/bench/report-v3.json
index ecb644cbc..2fffd812b 100644
--- a/schema/bench/report-v3.json
+++ b/schema/bench/report-v3.json
@@ -475,7 +475,26 @@
             "$ref": "#/$defs/scenarioRunResult"
           }
         }
-      }
+      },
+      "allOf": [
+        {
+          "if": {
+            "required": [
+              "runCount"
+            ],
+            "properties": {
+              "runCount": {
+                "minimum": 2
+              }
+            }
+          },
+          "then": {
+            "required": [
+              "runs"
+            ]
+          }
+        }
+      ]
     },
     "serializedHistogram": {
       "type": "object",

From d7ceb7c6b06f58823f502a0e409ad798487298a0 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 15:03:25 +0900
Subject: [PATCH 08/19] List benchmark schemas on schema index

Document every published benchmark schema on the schema landing page,
including the current scenario and report versions plus the comparison
report schema.  Keep the schema directory README aligned with the same
published files and source locations.

Assisted-by: Codex:gpt-5.5
---
 schema/README.md  | 12 +++++++++---
 schema/index.html | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/schema/README.md b/schema/README.md
index 7bd588534..4c27b1064 100644
--- a/schema/README.md
+++ b/schema/README.md
@@ -15,10 +15,14 @@ Current schemas:
     format (input).
  -  *bench/scenario-v1.json* — the version 1 `fedify bench` scenario suite
     format (input).
- -  *bench/report-v2.json* — the current `fedify bench` report format
+ -  *bench/report-v3.json* — the current `fedify bench` report format
+    (output).
+ -  *bench/report-v2.json* — the version 2 `fedify bench` report format
     (output).
  -  *bench/report-v1.json* — the version 1 `fedify bench` report format
     (output).
+ -  *bench/compare-report-v1.json* — the `fedify bench compare` report format
+    (output).
 
 
 Versioning: append-only and immutable
@@ -41,6 +45,7 @@ binary self-contained):
 
  -  *packages/cli/src/bench/scenario/schema.ts*
  -  *packages/cli/src/bench/result/schema.ts*
+ -  *packages/cli/src/bench/compare/schema.ts*
 
 The *.json* files here are generated from those objects.  After editing an
 embedded schema, regenerate the published copies:
@@ -50,8 +55,9 @@ deno task -f @fedify/cli generate-bench-schema
 ~~~~
 
 The matching TypeScript types live next to each schema
-(*packages/cli/src/bench/scenario/types.ts* and
-*packages/cli/src/bench/result/model.ts*); keep them in sync with the schema.
+(*packages/cli/src/bench/scenario/types.ts*,
+*packages/cli/src/bench/result/model.ts*, and
+*packages/cli/src/bench/compare.ts*); keep them in sync with the schema.
 
 
 Guards
diff --git a/schema/index.html b/schema/index.html
index 54ff4cc44..939aefc09 100644
--- a/schema/index.html
+++ b/schema/index.html
@@ -132,16 +132,43 @@ <h1>Fedify JSON Schemas</h1>
 
       <h2>Benchmarking (<code>fedify bench</code>)</h2>
       <ul class="schemas">
+        <li>
+          <a href="./bench/scenario-v2.json">bench/scenario-v2.json</a>
+          <div class="desc">
+            The current benchmark scenario suite format (input).  YAML or JSON.
+          </div>
+        </li>
         <li>
           <a href="./bench/scenario-v1.json">bench/scenario-v1.json</a>
           <div class="desc">
-            The benchmark scenario suite format (input).  YAML or JSON.
+            The previous benchmark scenario suite format (input).  YAML or
+            JSON.
+          </div>
+        </li>
+        <li>
+          <a href="./bench/report-v3.json">bench/report-v3.json</a>
+          <div class="desc">
+            The current benchmark report format (output).  The canonical
+            machine form.
+          </div>
+        </li>
+        <li>
+          <a href="./bench/report-v2.json">bench/report-v2.json</a>
+          <div class="desc">
+            The previous benchmark report format (output).
           </div>
         </li>
         <li>
           <a href="./bench/report-v1.json">bench/report-v1.json</a>
           <div class="desc">
-            The benchmark report format (output).  The canonical machine form.
+            The original benchmark report format (output).
+          </div>
+        </li>
+        <li>
+          <a href="./bench/compare-report-v1.json">bench/compare-report-v1.json</a>
+          <div class="desc">
+            The benchmark comparison report format generated by
+            <code>fedify bench compare</code>.
           </div>
         </li>
       </ul>
@@ -151,7 +178,7 @@ <h2>Editor support</h2>
         Add a schema reference to your scenario file for autocomplete and
         validation in editors with the YAML Language Server:
       </p>
-      <pre><code># yaml-language-server: $schema=https://json-schema.fedify.dev/bench/scenario-v1.json
+      <pre><code># yaml-language-server: $schema=https://json-schema.fedify.dev/bench/scenario-v2.json
 version: 1
 target: http://localhost:3000
 # …</code></pre>

From 7b880cc7c42f558215d4f58d48742d3e02f893b9 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 15:11:39 +0900
Subject: [PATCH 09/19] Harden benchmark compare cleanup

Match scenarios by name, type, and occurrence so reordered suites do not
produce false missing-scenario comparisons.  Clean partial worktree
registrations after failed checkout and treat child processes without a
pid as already stopped.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407517909
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407517910
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407517913

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 109 ++++++++++++++++++++++++-
 packages/cli/src/bench/compare.ts      |  75 ++++++++++++++---
 2 files changed, 171 insertions(+), 13 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index b7568dbf3..e4372c6ea 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -6,6 +6,7 @@ import test from "node:test";
 import type { BenchCompareCommand } from "./command.ts";
 import {
   buildCompareReport,
+  createBenchmarkWorktree,
   parseRegressionTolerance,
   runBenchCompare,
   startBenchmarkTarget,
@@ -205,7 +206,7 @@ test("buildCompareReport - fails regressions outside tolerance and noise", () =>
   assert.strictEqual(compare.passed, false);
 });
 
-test("buildCompareReport - matches duplicate scenario names by position", () => {
+test("buildCompareReport - matches duplicate scenario names by occurrence", () => {
   const base = report([
     scenario({
       name: "duplicate",
@@ -269,6 +270,69 @@ test("buildCompareReport - matches duplicate scenario names by position", () =>
   assert.strictEqual(compare.passed, true);
 });
 
+test("buildCompareReport - matches reordered scenarios by name and type", () => {
+  const base = report([
+    scenario({
+      name: "first",
+      client: {
+        latencyMs: { p50: 50, p95: 100, p99: 110, mean: 60, max: 120 },
+      },
+      runs: [
+        runResult(100, 100),
+        runResult(100, 100),
+        runResult(100, 100),
+      ],
+    }),
+    scenario({
+      name: "second",
+      client: {
+        latencyMs: { p50: 100, p95: 200, p99: 210, mean: 120, max: 220 },
+      },
+      runs: [
+        runResult(200, 100),
+        runResult(200, 100),
+        runResult(200, 100),
+      ],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      name: "second",
+      client: {
+        latencyMs: { p50: 105, p95: 210, p99: 220, mean: 130, max: 230 },
+      },
+      runs: [
+        runResult(210, 100),
+        runResult(210, 100),
+        runResult(210, 100),
+      ],
+    }),
+    scenario({
+      name: "first",
+      client: {
+        latencyMs: { p50: 55, p95: 110, p99: 120, mean: 70, max: 130 },
+      },
+      runs: [
+        runResult(110, 100),
+        runResult(110, 100),
+        runResult(110, 100),
+      ],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.2,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.deepEqual(compare.comparisons.map((c) => c.base), [200, 100]);
+  assert.deepEqual(compare.comparisons.map((c) => c.head), [210, 110]);
+  assert.ok(compare.comparisons.every((c) => c.pass));
+});
+
 test("buildCompareReport - keeps zero-median noise finite", () => {
   const base = report([
     scenario({
@@ -381,6 +445,49 @@ test("stopTargetProcess - rejects when forced kill does not exit", async () => {
   );
 });
 
+test("stopTargetProcess - resolves immediately without a pid", async () => {
+  const child = fakeChildProcess();
+  Object.defineProperty(child, "pid", { value: undefined });
+  let killed = false;
+  child.kill = () => {
+    killed = true;
+    return true;
+  };
+  await stopTargetProcess(child, {
+    forceTimeoutMs: 1,
+    forceKillTimeoutMs: 1,
+  });
+  assert.strictEqual(killed, false);
+});
+
+test("createBenchmarkWorktree - cleans partial registrations", async () => {
+  const calls: string[][] = [];
+  await assert.rejects(
+    createBenchmarkWorktree("missing-ref", "base", {
+      createTempDir: () => Promise.resolve("/tmp/fedify-bench-base-test"),
+      removePath: () => Promise.resolve(),
+      runGit: (args) => {
+        calls.push([...args]);
+        if (args[1] === "add") {
+          return Promise.reject(new Error("checkout failed"));
+        }
+        return Promise.resolve();
+      },
+    }),
+    /checkout failed/,
+  );
+  assert.deepEqual(calls, [
+    [
+      "worktree",
+      "add",
+      "--detach",
+      "/tmp/fedify-bench-base-test",
+      "missing-ref",
+    ],
+    ["worktree", "remove", "--force", "/tmp/fedify-bench-base-test"],
+  ]);
+});
+
 test("waitReadyUrl - does not wait for streaming response bodies", async () => {
   let calls = 0;
   await waitReadyUrl(new URL("http://ready.test/health"), 100, {
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 38d0a68bf..bfe3bde9d 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -126,6 +126,20 @@ export interface WaitReadyUrlDeps {
   readonly sleep?: (ms: number) => Promise<void>;
 }
 
+type CreateTempDir = (prefix: string) => Promise<string>;
+type RemovePath = (
+  path: string,
+  options: { readonly recursive: boolean; readonly force: boolean },
+) => Promise<void>;
+type RunGit = (args: readonly string[]) => Promise<void>;
+
+/** Dependencies for creating benchmark comparison worktrees. */
+export interface CreateBenchmarkWorktreeDeps {
+  readonly createTempDir?: CreateTempDir;
+  readonly removePath?: RemovePath;
+  readonly runGit?: RunGit;
+}
+
 /** Runs `fedify bench compare`. */
 export async function runBenchCompare(
   command: BenchCompareCommand,
@@ -259,13 +273,23 @@ function compareReports(
   maxRegression: number,
 ): ComparisonResult[] {
   const results: ComparisonResult[] = [];
-  for (let index = 0; index < head.scenarios.length; index++) {
-    const headScenario = head.scenarios[index];
-    const baseScenario = base.scenarios[index];
-    if (
-      baseScenario == null || baseScenario.name !== headScenario.name ||
-      baseScenario.type !== headScenario.type
-    ) {
+  const baseByScenario = new Map<string, ScenarioResult[]>();
+  for (const baseScenario of base.scenarios) {
+    const key = comparisonScenarioKey(baseScenario);
+    const scenarios = baseByScenario.get(key);
+    if (scenarios == null) {
+      baseByScenario.set(key, [baseScenario]);
+    } else {
+      scenarios.push(baseScenario);
+    }
+  }
+  const headCounts = new Map<string, number>();
+  for (const headScenario of head.scenarios) {
+    const key = comparisonScenarioKey(headScenario);
+    const occurrence = headCounts.get(key) ?? 0;
+    headCounts.set(key, occurrence + 1);
+    const baseScenario = baseByScenario.get(key)?.[occurrence];
+    if (baseScenario == null) {
       results.push(missingScenario(headScenario.name, maxRegression));
       continue;
     }
@@ -278,6 +302,10 @@ function compareReports(
   return results;
 }
 
+function comparisonScenarioKey(scenario: ScenarioResult): string {
+  return `${scenario.name}\0${scenario.type}`;
+}
+
 function comparisonMetrics(scenario: ScenarioResult): string[] {
   const fromExpect = scenario.expectations
     .map((e) => e.metric)
@@ -561,13 +589,34 @@ async function defaultRunBenchInWorktree(
   return JSON.parse(output) as BenchReport;
 }
 
-async function defaultCreateWorktree(
+function defaultCreateWorktree(
   ref: string,
   label: "base" | "head",
 ): Promise<string> {
-  const path = await mkdtemp(join(tmpdir(), `fedify-bench-${label}-`));
-  await rm(path, { recursive: true, force: true });
-  await runGit(["worktree", "add", "--detach", path, ref]);
+  return createBenchmarkWorktree(ref, label);
+}
+
+/** Creates a detached Git worktree for one side of a benchmark comparison. */
+export async function createBenchmarkWorktree(
+  ref: string,
+  label: "base" | "head",
+  deps: CreateBenchmarkWorktreeDeps = {},
+): Promise<string> {
+  const createTempDir = deps.createTempDir ?? mkdtemp;
+  const removePath = deps.removePath ?? rm;
+  const run = deps.runGit ?? runGit;
+  const path = await createTempDir(join(tmpdir(), `fedify-bench-${label}-`));
+  await removePath(path, { recursive: true, force: true });
+  try {
+    await run(["worktree", "add", "--detach", path, ref]);
+  } catch (error) {
+    try {
+      await run(["worktree", "remove", "--force", path]);
+    } catch {
+      // Preserve the original checkout failure.
+    }
+    throw error;
+  }
   return path;
 }
 
@@ -635,7 +684,9 @@ export function stopTargetProcess(
   const forceTimeoutMs = options.forceTimeoutMs ?? 5000;
   const forceKillTimeoutMs = options.forceKillTimeoutMs ?? forceTimeoutMs;
   return new Promise((resolve, reject) => {
-    if (child.exitCode != null || child.signalCode != null) {
+    if (
+      child.pid == null || child.exitCode != null || child.signalCode != null
+    ) {
       resolve();
       return;
     }

From ce4b900033163958aaa208c83d90335e70b0d4e3 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 15:34:19 +0900
Subject: [PATCH 10/19] Harden benchmark compare review fixes

Keep compare's derived target separate from the explicit CLI target
signal, so unsafe overrides still require --target.  Also handle zero
throughput baselines as improvements and let Windows targets receive a
graceful taskkill before forced cleanup.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407530674
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407530675
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407539849

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/action.ts       |  13 ++-
 packages/cli/src/bench/compare.test.ts | 107 +++++++++++++++++++++++++
 packages/cli/src/bench/compare.ts      |  29 ++++++-
 3 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/packages/cli/src/bench/action.ts b/packages/cli/src/bench/action.ts
index c6b5b606b..0a7704ec0 100644
--- a/packages/cli/src/bench/action.ts
+++ b/packages/cli/src/bench/action.ts
@@ -51,6 +51,10 @@ import {
 } from "./server/synthetic.ts";
 import { convertUrlIfHandle } from "../webfinger/lib.ts";
 
+type BenchRunRuntimeCommand = BenchRunCommand & {
+  readonly explicitCliTarget?: boolean;
+};
+
 /** Injectable dependencies for {@link runBench}, overridable in tests. */
 export interface RunBenchDeps {
   /** Terminates the process with an exit code. */
@@ -77,7 +81,7 @@ export interface RunBenchDeps {
  * @param deps Injectable dependencies for testing.
  */
 export default async function runBench(
-  command: BenchRunCommand,
+  command: BenchRunRuntimeCommand,
   deps: RunBenchDeps = {},
 ): Promise<void> {
   // Set the exit code rather than terminating, so cleanup (closing the fleet)
@@ -92,6 +96,7 @@ export default async function runBench(
   // stats reads, and the runners' inbox/WebFinger requests — not just the
   // document loader, so a target that inspects the UA sees it on every request.
   const fetchImpl = withUserAgent(deps.fetch ?? fetch, command.userAgent);
+  const explicitCliTarget = command.explicitCliTarget ?? command.target != null;
 
   // Loading, validation, and normalization failures are all user-facing
   // configuration errors.
@@ -137,7 +142,7 @@ export default async function runBench(
         tier,
         benchmarkMode: probe.benchmarkMode,
         allowUnsafe: command.allowUnsafeTarget,
-        explicitCliTarget: command.target != null,
+        explicitCliTarget,
         scenarios: unsafeOverrideScenarios(validated),
       });
     }
@@ -191,7 +196,7 @@ export default async function runBench(
       targetOrigin: suite.target.origin,
       targetBenchmarkMode: probe.benchmarkMode,
       allowUnsafe: command.allowUnsafeTarget,
-      explicitCliTarget: command.target != null,
+      explicitCliTarget,
       destinationTier,
       defaults: validated.defaults,
     });
@@ -220,7 +225,7 @@ export default async function runBench(
       targetOrigin: suite.target.origin,
       targetBenchmarkMode: probe.benchmarkMode,
       allowUnsafe: command.allowUnsafeTarget,
-      explicitCliTarget: command.target != null,
+      explicitCliTarget,
       destinationTier,
       defaults: validated.defaults,
     });
diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index e4372c6ea..6c97c2937 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -2,6 +2,9 @@ import assert from "node:assert/strict";
 import { Buffer } from "node:buffer";
 import type { ChildProcess, SpawnOptions } from "node:child_process";
 import { EventEmitter } from "node:events";
+import { mkdtemp, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
 import test from "node:test";
 import type { BenchCompareCommand } from "./command.ts";
 import {
@@ -12,6 +15,7 @@ import {
   startBenchmarkTarget,
   stopTargetProcess,
   waitReadyUrl,
+  windowsTaskkillArgs,
 } from "./compare.ts";
 import type { BenchReport, ScenarioResult } from "./result/model.ts";
 
@@ -123,6 +127,13 @@ function command(overrides: Partial<BenchCompareCommand>): BenchCompareCommand {
   };
 }
 
+async function writeSuite(content: string): Promise<string> {
+  const dir = await mkdtemp(join(tmpdir(), "fedify-bench-compare-"));
+  const path = join(dir, "suite.yaml");
+  await writeFile(path, content, { encoding: "utf-8" });
+  return path;
+}
+
 function fakeChildProcess(pid = 1234): FakeChildProcess {
   const child = new EventEmitter() as FakeChildProcess;
   Object.defineProperties(child, {
@@ -206,6 +217,47 @@ test("buildCompareReport - fails regressions outside tolerance and noise", () =>
   assert.strictEqual(compare.passed, false);
 });
 
+test("buildCompareReport - treats positive throughput after zero as passing", () => {
+  const throughputExpectation = (actual: number) =>
+    ({
+      metric: "throughputPerSec",
+      op: "gte",
+      threshold: 0,
+      unit: "/s",
+      actual,
+      severity: "fail",
+      pass: true,
+    }) as const;
+  const base = report([
+    scenario({
+      throughputPerSec: 0,
+      expectations: [throughputExpectation(0)],
+      runs: [runResult(100, 0), runResult(100, 0), runResult(100, 0)],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      throughputPerSec: 10,
+      expectations: [throughputExpectation(10)],
+      runs: [runResult(100, 10), runResult(100, 10), runResult(100, 10)],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.strictEqual(compare.comparisons.length, 1);
+  assert.strictEqual(compare.comparisons[0].metric, "throughputPerSec");
+  assert.strictEqual(compare.comparisons[0].regression, 0);
+  assert.strictEqual(compare.comparisons[0].pass, true);
+  assert.strictEqual(compare.passed, true);
+});
+
 test("buildCompareReport - matches duplicate scenario names by occurrence", () => {
   const base = report([
     scenario({
@@ -433,6 +485,20 @@ test("stopTargetProcess - kills the Windows process tree", async () => {
   assert.deepEqual(kills, [[4321, "SIGTERM"]]);
 });
 
+test("windowsTaskkillArgs - only force kills on SIGKILL", () => {
+  assert.deepEqual(windowsTaskkillArgs(4321, "SIGTERM"), [
+    "/pid",
+    "4321",
+    "/T",
+  ]);
+  assert.deepEqual(windowsTaskkillArgs(4321, "SIGKILL"), [
+    "/pid",
+    "4321",
+    "/T",
+    "/F",
+  ]);
+});
+
 test("stopTargetProcess - rejects when forced kill does not exit", async () => {
   const child = fakeChildProcess(4321);
   child.kill = () => true;
@@ -602,3 +668,44 @@ test("runBenchCompare - orchestrates worktrees and cleans up", async () => {
     "remove:/tmp/base",
   ]);
 });
+
+test("runBenchCompare - does not treat derived target as explicit", async () => {
+  const file = await writeSuite(`version: 1
+target: https://example.com
+defaults:
+  load: { rate: 1/s }
+  duration: 1ms
+scenarios:
+  - name: wf
+    type: webfinger
+    recipient: "acct:alice@example.com"
+`);
+  const events: string[] = [];
+  let code = -1;
+  await runBenchCompare(
+    command({
+      file,
+      readyUrl: "https://example.com/health",
+      allowUnsafeTarget: true,
+    }),
+    {
+      exit: (c) => {
+        code = c;
+      },
+      writeOutput: () => Promise.resolve(),
+      log: (message) => events.push(`compare:${message}`),
+      createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+      removeWorktree: () => Promise.resolve(),
+      startTarget: () => Promise.resolve({ stop: () => Promise.resolve() }),
+      waitReady: () => Promise.resolve(),
+      benchDeps: {
+        log: (message) => events.push(`bench:${message}`),
+        fetch: () =>
+          Promise.resolve(new Response("not found", { status: 404 })),
+        resolveTargetAddresses: () => Promise.resolve(["93.184.216.34"]),
+      },
+    },
+  );
+  assert.strictEqual(code, 2);
+  assert.match(events.join("\n"), /--target/);
+});
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index bfe3bde9d..a2b4c97bd 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -101,6 +101,10 @@ type SpawnTarget = (
   options: SpawnOptions,
 ) => ChildProcess;
 
+type BenchRunCompareCommand = BenchRunCommand & {
+  readonly explicitCliTarget: boolean;
+};
+
 /** Options for starting a benchmark target. */
 export interface StartBenchmarkTargetOptions {
   readonly platform?: NodeJS.Platform;
@@ -445,9 +449,16 @@ function regressionRatio(
   head: number,
   direction: ComparisonResult["direction"],
 ): number | null {
-  if (!Number.isFinite(base) || !Number.isFinite(head) || base <= 0) {
+  if (!Number.isFinite(base) || !Number.isFinite(head)) {
+    return null;
+  }
+  if (base < 0) {
     return base === head ? 0 : null;
   }
+  if (base === 0) {
+    if (base === head) return 0;
+    return direction === "higher-is-better" && head > base ? 0 : null;
+  }
   return direction === "higher-is-better"
     ? (base - head) / base
     : (head - base) / base;
@@ -561,7 +572,7 @@ async function defaultRunBenchInWorktree(
 ): Promise<BenchReport> {
   let output = "";
   let exitCode = 0;
-  const runCommand: BenchRunCommand = {
+  const runCommand: BenchRunCompareCommand = {
     command: "bench",
     mode: "run",
     scenario: input.command.file,
@@ -572,6 +583,7 @@ async function defaultRunBenchInWorktree(
     advertiseHost: input.command.advertiseHost,
     allowUnsafeTarget: input.command.allowUnsafeTarget,
     userAgent: input.command.userAgent,
+    explicitCliTarget: input.command.target != null,
   };
   await runBench(runCommand, {
     ...benchDeps,
@@ -755,15 +767,24 @@ function killTargetProcess(
 
 function defaultKillWindowsProcessTree(
   pid: number,
-  _signal: NodeJS.Signals,
+  signal: NodeJS.Signals,
 ): void {
-  const child = spawn("taskkill", ["/pid", String(pid), "/T", "/F"], {
+  const child = spawn("taskkill", windowsTaskkillArgs(pid, signal), {
     stdio: "ignore",
     windowsHide: true,
   });
   child.on("error", () => {});
 }
 
+export function windowsTaskkillArgs(
+  pid: number,
+  signal: NodeJS.Signals,
+): string[] {
+  const args = ["/pid", String(pid), "/T"];
+  if (signal === "SIGKILL") args.push("/F");
+  return args;
+}
+
 async function defaultWaitReady(url: URL, timeoutMs: number): Promise<void> {
   return await waitReadyUrl(url, timeoutMs);
 }

From ab34e0e9860a547658a91bc844dcc679f9495222 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 15:48:45 +0900
Subject: [PATCH 11/19] Refine benchmark compare edge handling

Allow head-only benchmark scenarios to pass without a baseline so adding
coverage to a suite does not fail the regression gate by itself.  Also
remove the temporary worktree directory directly when checkout fails, even
if Git cannot remove an unregistered worktree.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407552812
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407552813

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 36 +++++++++++++++++++++++++-
 packages/cli/src/bench/compare.ts      | 11 +++++---
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 6c97c2937..555b2f3da 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -258,6 +258,32 @@ test("buildCompareReport - treats positive throughput after zero as passing", ()
   assert.strictEqual(compare.passed, true);
 });
 
+test("buildCompareReport - passes new head scenarios without a baseline", () => {
+  const base = report([scenario({ name: "existing" })]);
+  const head = report([
+    scenario({ name: "existing" }),
+    scenario({ name: "new-scenario" }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0.1,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  const newScenario = compare.comparisons.find((comparison) =>
+    comparison.scenario === "new-scenario"
+  );
+  assert.ok(newScenario);
+  assert.strictEqual(newScenario.metric, "scenario");
+  assert.strictEqual(newScenario.base, null);
+  assert.strictEqual(newScenario.head, null);
+  assert.strictEqual(newScenario.pass, true);
+  assert.strictEqual(compare.passed, true);
+});
+
 test("buildCompareReport - matches duplicate scenario names by occurrence", () => {
   const base = report([
     scenario({
@@ -528,10 +554,14 @@ test("stopTargetProcess - resolves immediately without a pid", async () => {
 
 test("createBenchmarkWorktree - cleans partial registrations", async () => {
   const calls: string[][] = [];
+  const removals: string[] = [];
   await assert.rejects(
     createBenchmarkWorktree("missing-ref", "base", {
       createTempDir: () => Promise.resolve("/tmp/fedify-bench-base-test"),
-      removePath: () => Promise.resolve(),
+      removePath: (path) => {
+        removals.push(path);
+        return Promise.resolve();
+      },
       runGit: (args) => {
         calls.push([...args]);
         if (args[1] === "add") {
@@ -552,6 +582,10 @@ test("createBenchmarkWorktree - cleans partial registrations", async () => {
     ],
     ["worktree", "remove", "--force", "/tmp/fedify-bench-base-test"],
   ]);
+  assert.deepEqual(removals, [
+    "/tmp/fedify-bench-base-test",
+    "/tmp/fedify-bench-base-test",
+  ]);
 });
 
 test("waitReadyUrl - does not wait for streaming response bodies", async () => {
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index a2b4c97bd..bc9ce4b7b 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -294,7 +294,7 @@ function compareReports(
     headCounts.set(key, occurrence + 1);
     const baseScenario = baseByScenario.get(key)?.[occurrence];
     if (baseScenario == null) {
-      results.push(missingScenario(headScenario.name, maxRegression));
+      results.push(newScenario(headScenario.name, maxRegression));
       continue;
     }
     for (const metric of comparisonMetrics(headScenario)) {
@@ -358,7 +358,7 @@ function compareMetric(
   };
 }
 
-function missingScenario(
+function newScenario(
   scenario: string,
   maxRegression: number,
 ): ComparisonResult {
@@ -371,7 +371,7 @@ function missingScenario(
     regression: null,
     noiseBand: 0,
     allowedRegression: maxRegression,
-    pass: false,
+    pass: true,
   };
 }
 
@@ -627,6 +627,11 @@ export async function createBenchmarkWorktree(
     } catch {
       // Preserve the original checkout failure.
     }
+    try {
+      await removePath(path, { recursive: true, force: true });
+    } catch {
+      // Preserve the original checkout failure.
+    }
     throw error;
   }
   return path;

From ccca8b17dfb133c316dae6190084d9c383b9a27e Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 16:01:19 +0900
Subject: [PATCH 12/19] Require explicit runs for unsafe bench

The public-target unsafe override already required an explicit target,
load, and duration.  Include the repeated-run count in that contract so
the default run multiplier cannot expand public benchmark traffic without
an explicit operator choice.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407570555

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/action.test.ts      | 41 ++++++++++++++++++++++
 packages/cli/src/bench/action.ts           |  2 ++
 packages/cli/src/bench/safety/gate.test.ts | 23 ++++++++++++
 packages/cli/src/bench/safety/gate.ts      | 14 ++++++--
 4 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/packages/cli/src/bench/action.test.ts b/packages/cli/src/bench/action.test.ts
index 9fd584736..9120f5b77 100644
--- a/packages/cli/src/bench/action.test.ts
+++ b/packages/cli/src/bench/action.test.ts
@@ -932,6 +932,46 @@ scenarios:
   }
 });
 
+test("runBench - unsafe public inbox destination needs explicit runs", async () => {
+  const target = await spawnBenchmarkTarget();
+  try {
+    const file = await writeSuite(`version: 1
+target: ${target.url.href}
+scenarios:
+  - name: inbox-shared
+    type: inbox
+    recipient: "${new URL("/users/alice", target.url).href}"
+    inbox: "https://prod.example/inbox"
+    load: { rate: 1/s }
+    duration: 1ms
+`);
+    let code = -1;
+    let message = "";
+    await runBench(
+      command({
+        scenario: file,
+        target: target.url.href,
+        allowUnsafeTarget: true,
+        advertiseHost: "127.0.0.1",
+      }),
+      {
+        exit: (c) => {
+          code = c;
+        },
+        writeOutput: () => Promise.resolve(),
+        log: (m) => {
+          message = m;
+        },
+        resolveTargetAddresses: resolvePublicHost,
+      },
+    );
+    assert.strictEqual(code, 2);
+    assert.match(message, /runs/);
+  } finally {
+    await target.close();
+  }
+});
+
 test("runBench - unsafe public inbox destination honors suite defaults", async () => {
   const target = await spawnBenchmarkTarget();
   try {
@@ -939,6 +979,7 @@ test("runBench - unsafe public inbox destination honors suite defaults", async (
 target: ${target.url.href}
 defaults:
   duration: 1ms
+  runs: 1
   load: { rate: 1/s }
 scenarios:
   - name: inbox-shared
diff --git a/packages/cli/src/bench/action.ts b/packages/cli/src/bench/action.ts
index 0a7704ec0..c100d22b0 100644
--- a/packages/cli/src/bench/action.ts
+++ b/packages/cli/src/bench/action.ts
@@ -694,11 +694,13 @@ function unsafeOverrideScenario(
 ): Parameters<typeof assertUnsafeOverrideAllowed>[0]["scenarios"][number] {
   const defaultDuration = defaults?.duration != null;
   const defaultLoad = hasExplicitLoad(defaults?.load);
+  const defaultRuns = defaults?.runs != null;
   const raw = "raw" in scenario ? scenario.raw : scenario;
   return {
     name: scenario.name,
     explicitDuration: raw.duration != null || defaultDuration,
     explicitLoad: hasExplicitLoad(raw.load) || defaultLoad,
+    explicitRuns: raw.runs != null || defaultRuns,
   };
 }
 
diff --git a/packages/cli/src/bench/safety/gate.test.ts b/packages/cli/src/bench/safety/gate.test.ts
index ea496dbe4..002e944c7 100644
--- a/packages/cli/src/bench/safety/gate.test.ts
+++ b/packages/cli/src/bench/safety/gate.test.ts
@@ -73,6 +73,7 @@ test("assertUnsafeOverrideAllowed - unsafe flag needs an explicit CLI target", (
           name: "wf",
           explicitDuration: true,
           explicitLoad: true,
+          explicitRuns: true,
         }],
       }),
     (error: unknown) =>
@@ -92,6 +93,7 @@ test("assertUnsafeOverrideAllowed - unsafe public defaults need explicit load",
           name: "wf",
           explicitDuration: true,
           explicitLoad: false,
+          explicitRuns: true,
         }],
       }),
     (error: unknown) =>
@@ -111,6 +113,7 @@ test("assertUnsafeOverrideAllowed - unsafe public defaults need explicit duratio
           name: "wf",
           explicitDuration: false,
           explicitLoad: true,
+          explicitRuns: true,
         }],
       }),
     (error: unknown) =>
@@ -118,6 +121,26 @@ test("assertUnsafeOverrideAllowed - unsafe public defaults need explicit duratio
   );
 });
 
+test("assertUnsafeOverrideAllowed - unsafe public defaults need explicit runs", () => {
+  assert.throws(
+    () =>
+      assertUnsafeOverrideAllowed({
+        tier: "public",
+        benchmarkMode: false,
+        allowUnsafe: true,
+        explicitCliTarget: true,
+        scenarios: [{
+          name: "wf",
+          explicitDuration: true,
+          explicitLoad: true,
+          explicitRuns: false,
+        }],
+      }),
+    (error: unknown) =>
+      error instanceof UnsafeTargetError && /runs/.test(error.message),
+  );
+});
+
 test("assertUnsafeOverrideAllowed - safe targets do not need unsafe metadata", () => {
   assert.doesNotThrow(() =>
     assertUnsafeOverrideAllowed({
diff --git a/packages/cli/src/bench/safety/gate.ts b/packages/cli/src/bench/safety/gate.ts
index 11c87ab8a..203e9850b 100644
--- a/packages/cli/src/bench/safety/gate.ts
+++ b/packages/cli/src/bench/safety/gate.ts
@@ -54,6 +54,8 @@ export interface UnsafeOverrideScenario {
   readonly explicitDuration: boolean;
   /** Whether the scenario or suite explicitly selected a load model. */
   readonly explicitLoad: boolean;
+  /** Whether the scenario or suite explicitly set the run count. */
+  readonly explicitRuns: boolean;
 }
 
 /** The inputs for validating an unsafe public-target override. */
@@ -75,8 +77,9 @@ export interface UnsafeOverrideContext {
  *
  * The override is only meaningful for a public target that does not advertise
  * benchmark mode.  In that caution tier, the operator must name the target on
- * the command line for this run and must explicitly set load and duration, so
- * the built-in defaults cannot accidentally create a long public benchmark.
+ * the command line for this run and must explicitly set load, duration, and
+ * runs, so the built-in defaults cannot accidentally create a long public
+ * benchmark.
  * @param context The unsafe override decision inputs.
  * @throws {UnsafeTargetError} If the unsafe override is too broad.
  */
@@ -110,6 +113,13 @@ export function assertUnsafeOverrideAllowed(
           "--allow-unsafe-target against a public target.",
       );
     }
+    if (!scenario.explicitRuns) {
+      throw new UnsafeTargetError(
+        `Scenario "${scenario.name}" uses the built-in benchmark runs ` +
+          "default.  Set runs explicitly before using --allow-unsafe-target " +
+          "against a public target.",
+      );
+    }
   }
 }
 

From 82c39d39c866f4313dc8cdd71bd6c9325dcf39ad Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 16:03:12 +0900
Subject: [PATCH 13/19] Clean up benchmark compare helpers

Merge the duplicate fs/promises imports, document the taskkill argument
helper, and tolerate custom response bodies without cancel methods while
polling readiness endpoints.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407567713
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407567721
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407567726

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 10 ++++++++++
 packages/cli/src/bench/compare.ts      |  6 +++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 555b2f3da..b915e10c1 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -608,6 +608,16 @@ test("waitReadyUrl - does not wait for streaming response bodies", async () => {
   assert.strictEqual(calls, 1);
 });
 
+test("waitReadyUrl - tolerates response bodies without cancel", async () => {
+  await waitReadyUrl(new URL("http://ready.test/health"), 100, {
+    fetch: () =>
+      Promise.resolve({
+        status: 200,
+        body: {},
+      } as Response),
+  });
+});
+
 test("waitReadyUrl - aborts a hanging fetch at the timeout", async () => {
   const startedAt = Date.now();
   await assert.rejects(
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index bc9ce4b7b..56579eaa6 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -3,8 +3,7 @@ import {
   spawn,
   type SpawnOptions,
 } from "node:child_process";
-import { mkdtemp, rm } from "node:fs/promises";
-import { writeFile } from "node:fs/promises";
+import { mkdtemp, rm, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
 import process from "node:process";
@@ -781,6 +780,7 @@ function defaultKillWindowsProcessTree(
   child.on("error", () => {});
 }
 
+/** Builds the Windows `taskkill` arguments used for target cleanup. */
 export function windowsTaskkillArgs(
   pid: number,
   signal: NodeJS.Signals,
@@ -814,7 +814,7 @@ export async function waitReadyUrl(
     }, remainingMs);
     try {
       const response = await fetchReady(url, { signal: controller.signal });
-      void response.body?.cancel().catch(() => {});
+      void response.body?.cancel?.().catch(() => {});
       if (response.status >= 200 && response.status < 400) return;
       lastError = new Error(`ready URL returned ${response.status}`);
     } catch (error) {

From de36221102837f2a4f2575e24acce1c78f5005e6 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 16:24:41 +0900
Subject: [PATCH 14/19] Refine benchmark report edge cases

Treat one-millisecond latency movement from a zero baseline as noise,
preserve by-standard report detail across repeated runs, and avoid
publishing a pooled aggregate histogram that disagrees with the reported
median run latency fields.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407580718
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407580721
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407586582

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts      | 45 +++++++++++++++++
 packages/cli/src/bench/compare.ts           | 15 ++++--
 packages/cli/src/bench/result/build.test.ts | 56 +++++++++++++++++++++
 packages/cli/src/bench/result/build.ts      | 34 ++++++-------
 4 files changed, 127 insertions(+), 23 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index b915e10c1..572a2def5 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -258,6 +258,51 @@ test("buildCompareReport - treats positive throughput after zero as passing", ()
   assert.strictEqual(compare.passed, true);
 });
 
+test("buildCompareReport - tolerates tiny latency after zero baseline", () => {
+  const latencyExpectation = (actual: number) =>
+    ({
+      metric: "latency.p95",
+      op: "lt",
+      threshold: 10,
+      unit: "ms",
+      actual,
+      severity: "fail",
+      pass: true,
+    }) as const;
+  const base = report([
+    scenario({
+      client: {
+        latencyMs: { p50: 0, p95: 0, p99: 0, mean: 0, max: 0 },
+      },
+      expectations: [latencyExpectation(0)],
+      runs: [runResult(0, 100), runResult(0, 100), runResult(0, 100)],
+    }),
+  ]);
+  const head = report([
+    scenario({
+      client: {
+        latencyMs: { p50: 1, p95: 1, p99: 1, mean: 1, max: 1 },
+      },
+      expectations: [latencyExpectation(1)],
+      runs: [runResult(1, 100), runResult(1, 100), runResult(1, 100)],
+    }),
+  ]);
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: base,
+    headReport: head,
+    maxRegression: 0,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.strictEqual(compare.comparisons.length, 1);
+  assert.strictEqual(compare.comparisons[0].metric, "latency.p95");
+  assert.strictEqual(compare.comparisons[0].regression, 0);
+  assert.strictEqual(compare.comparisons[0].pass, true);
+  assert.strictEqual(compare.passed, true);
+});
+
 test("buildCompareReport - passes new head scenarios without a baseline", () => {
   const base = report([scenario({ name: "existing" })]);
   const head = report([
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 56579eaa6..7e8df29b8 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -20,6 +20,8 @@ import type {
 import { metricUnit } from "./result/expect/metrics.ts";
 import { describeError } from "../utils.ts";
 
+const ZERO_BASE_LATENCY_ALLOWANCE_MS = 1;
+
 /** A benchmark comparison report. */
 export interface BenchCompareReport {
   readonly $schema: string;
@@ -331,9 +333,8 @@ function compareMetric(
   metric: string,
   maxRegression: number,
 ): ComparisonResult {
-  const direction = metricUnit(metric) === "rate"
-    ? "higher-is-better"
-    : "lower-is-better";
+  const unit = metricUnit(metric);
+  const direction = unit === "rate" ? "higher-is-better" : "lower-is-better";
   const base = metricValue(baseScenario, metric);
   const head = metricValue(headScenario, metric);
   const noiseBand = Math.max(
@@ -342,7 +343,7 @@ function compareMetric(
   );
   const regression = base == null || head == null
     ? null
-    : regressionRatio(base, head, direction);
+    : regressionRatio(base, head, direction, unit);
   const allowedRegression = maxRegression + noiseBand;
   return {
     scenario: headScenario.name,
@@ -447,6 +448,7 @@ function regressionRatio(
   base: number,
   head: number,
   direction: ComparisonResult["direction"],
+  unit: ReturnType<typeof metricUnit>,
 ): number | null {
   if (!Number.isFinite(base) || !Number.isFinite(head)) {
     return null;
@@ -456,6 +458,11 @@ function regressionRatio(
   }
   if (base === 0) {
     if (base === head) return 0;
+    if (
+      direction === "lower-is-better" &&
+      unit === "ms" &&
+      head <= ZERO_BASE_LATENCY_ALLOWANCE_MS
+    ) return 0;
     return direction === "higher-is-better" && head > base ? 0 : null;
   }
   return direction === "higher-is-better"
diff --git a/packages/cli/src/bench/result/build.test.ts b/packages/cli/src/bench/result/build.test.ts
index ed5d7fdfe..813b0a765 100644
--- a/packages/cli/src/bench/result/build.test.ts
+++ b/packages/cli/src/bench/result/build.test.ts
@@ -9,6 +9,7 @@ import {
   detectEnvironment,
   type ScenarioMeasurement,
 } from "./build.ts";
+import { LogLinearHistogram } from "../metrics/histogram.ts";
 import { reportSchemaV3 } from "./schema.ts";
 
 function resolvedInbox() {
@@ -157,6 +158,61 @@ test("buildScenarioResult - fails repeated server gates with missing stats", ()
   assert.strictEqual(result.passed, false);
 });
 
+test("buildScenarioResult - keeps present by-standard repeated metrics", () => {
+  const first = measurement();
+  const missingStandard = measurement();
+  const third = measurement();
+  const result = buildScenarioResult(resolvedInbox(), [
+    {
+      ...first,
+      server: {
+        signatureVerificationMs: {
+          overall: first.server!.signatureVerificationMs!.overall,
+          byStandard: {
+            "rfc9421": { p50: 1, p95: 10, p99: 100 },
+          },
+        },
+      },
+    },
+    {
+      ...missingStandard,
+      server: {
+        signatureVerificationMs: {
+          overall: missingStandard.server!.signatureVerificationMs!.overall,
+        },
+      },
+    },
+    {
+      ...third,
+      server: {
+        signatureVerificationMs: {
+          overall: third.server!.signatureVerificationMs!.overall,
+          byStandard: {
+            "rfc9421": { p50: 3, p95: 30, p99: 300 },
+          },
+        },
+      },
+    },
+  ]);
+  assert.deepEqual(
+    result.server?.signatureVerificationMs?.byStandard?.["rfc9421"],
+    { p50: 2, p95: 20, p99: 200 },
+  );
+});
+
+test("buildScenarioResult - omits aggregate repeated-run histogram", () => {
+  const first = new LogLinearHistogram();
+  first.record(10);
+  const second = new LogLinearHistogram();
+  second.record(100);
+  const result = buildScenarioResult(resolvedInbox(), [
+    { ...measurement(), histogram: first.toJSON() },
+    { ...measurement(), histogram: second.toJSON() },
+  ]);
+  assert.strictEqual(result.histogram, undefined);
+  assert.ok(result.runs?.every((run) => run.histogram != null));
+});
+
 test("buildReport - gate passes only when all scenarios pass", () => {
   const ok = buildScenarioResult(resolvedInbox(), measurement());
   const bad = buildScenarioResult(resolvedInbox(), {
diff --git a/packages/cli/src/bench/result/build.ts b/packages/cli/src/bench/result/build.ts
index d3ba4bd8a..d5e3592f3 100644
--- a/packages/cli/src/bench/result/build.ts
+++ b/packages/cli/src/bench/result/build.ts
@@ -13,7 +13,6 @@ import { cpus } from "node:os";
 import process from "node:process";
 import metadata from "../../../deno.json" with { type: "json" };
 import type { ResolvedScenario } from "../scenario/normalize.ts";
-import { LogLinearHistogram } from "../metrics/histogram.ts";
 import type { SerializedHistogram } from "../metrics/histogram.ts";
 import { evaluateExpect } from "./expect/evaluate.ts";
 import { REPORT_SCHEMA_ID } from "./schema.ts";
@@ -149,7 +148,6 @@ function aggregateMeasurements(
     },
     server: aggregateServer(measurements.map((m) => m.server)),
     errors,
-    ...aggregateHistogram(measurements),
   };
 }
 
@@ -201,6 +199,7 @@ function aggregateSignatureVerification(
   for (const standard of standards) {
     byStandard[standard] = aggregatePartial(
       values.map((v) => v.byStandard?.[standard]),
+      "present",
     );
   }
   return {
@@ -230,11 +229,14 @@ type PartialMetric = {
   readonly p99?: number;
 };
 
-function aggregatePartial(values: readonly (PartialMetric | undefined)[]) {
+function aggregatePartial(
+  values: readonly (PartialMetric | undefined)[],
+  mode: "complete" | "present" = "complete",
+) {
   return {
-    ...partialField(values, "p50"),
-    ...partialField(values, "p95"),
-    ...partialField(values, "p99"),
+    ...partialField(values, "p50", mode),
+    ...partialField(values, "p95", mode),
+    ...partialField(values, "p99", mode),
   };
 }
 
@@ -242,8 +244,15 @@ function partialField(
   values:
     readonly ({ readonly [key: string]: number | undefined } | undefined)[],
   key: "p50" | "p95" | "p99",
+  mode: "complete" | "present",
 ): Record<typeof key, number> | Record<string, never> {
   const fieldValues = values.map((v) => v?.[key]);
+  if (mode === "present") {
+    const present = fieldValues.filter(isNumber);
+    return present.length < 1
+      ? {}
+      : { [key]: median(present) } as Record<typeof key, number>;
+  }
   return fieldValues.every(isNumber)
     ? { [key]: median(fieldValues) } as Record<typeof key, number>
     : {};
@@ -257,19 +266,6 @@ function hasPartial(value: {
   return value.p50 != null || value.p95 != null || value.p99 != null;
 }
 
-function aggregateHistogram(
-  measurements: readonly ScenarioMeasurement[],
-): { readonly histogram?: SerializedHistogram } {
-  const histograms = measurements.map((m) => m.histogram);
-  if (histograms.some((h) => h == null)) return {};
-  const [first, ...rest] = histograms as SerializedHistogram[];
-  const merged = LogLinearHistogram.fromJSON(first);
-  for (const histogram of rest) {
-    merged.merge(LogLinearHistogram.fromJSON(histogram));
-  }
-  return { histogram: merged.toJSON() };
-}
-
 function sumErrorBuckets(errors: readonly ErrorBucket[]): ErrorBucket[] {
   const buckets = new Map<string, ErrorBucket>();
   for (const error of errors) {

From 1d2edea8bee77e81bdaf873856d58578c0c152bb Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 18:15:28 +0900
Subject: [PATCH 15/19] Harden benchmark compare portability

Avoid Node 18-incompatible reverse iteration, let newly introduced
head-side metrics pass when the baseline lacks that measurement, and keep
temporary worktree directories intact until git owns them.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407603147
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407603149
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407603154

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 40 +++++++++++++++++++++++---
 packages/cli/src/bench/compare.ts      |  7 +++--
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 572a2def5..98d29838a 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -518,6 +518,41 @@ test("buildCompareReport - missing client metrics fail comparisons", () => {
   assert.strictEqual(compare.passed, false);
 });
 
+test("buildCompareReport - missing baseline metrics pass comparisons", () => {
+  const signatureExpectation = (actual: number) =>
+    ({
+      metric: "signatureVerification.p95",
+      op: "lt",
+      threshold: 20,
+      unit: "ms",
+      actual,
+      severity: "fail",
+      pass: true,
+    }) as const;
+  const compare = buildCompareReport({
+    baseRef: "origin/main",
+    headRef: "HEAD",
+    baseReport: report([scenario({ server: null })]),
+    headReport: report([
+      scenario({
+        expectations: [signatureExpectation(12)],
+        server: {
+          signatureVerificationMs: {
+            overall: { p50: 6, p95: 12, p99: 28 },
+          },
+        },
+      }),
+    ]),
+    maxRegression: 0.1,
+    startedAt: "2026-06-13T00:00:00.000Z",
+    finishedAt: "2026-06-13T00:00:01.000Z",
+  });
+  assert.strictEqual(compare.comparisons[0].base, null);
+  assert.strictEqual(compare.comparisons[0].head, 12);
+  assert.strictEqual(compare.comparisons[0].pass, true);
+  assert.strictEqual(compare.passed, true);
+});
+
 test("startBenchmarkTarget - keeps target stdout off stdout", async () => {
   let options: SpawnOptions | undefined;
   const child = fakeChildProcess();
@@ -627,10 +662,7 @@ test("createBenchmarkWorktree - cleans partial registrations", async () => {
     ],
     ["worktree", "remove", "--force", "/tmp/fedify-bench-base-test"],
   ]);
-  assert.deepEqual(removals, [
-    "/tmp/fedify-bench-base-test",
-    "/tmp/fedify-bench-base-test",
-  ]);
+  assert.deepEqual(removals, ["/tmp/fedify-bench-base-test"]);
 });
 
 test("waitReadyUrl - does not wait for streaming response bodies", async () => {
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 7e8df29b8..535c99308 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -198,7 +198,8 @@ export async function runBenchCompare(
     log(describeError(error));
     return void exit(2);
   } finally {
-    for (const path of worktrees.toReversed()) {
+    for (let i = worktrees.length - 1; i >= 0; i--) {
+      const path = worktrees[i];
       try {
         await removeWorktree(path);
       } catch (error) {
@@ -354,7 +355,8 @@ function compareMetric(
     regression,
     noiseBand,
     allowedRegression,
-    pass: regression != null && regression <= allowedRegression,
+    pass: (base == null && head != null) ||
+      (regression != null && regression <= allowedRegression),
   };
 }
 
@@ -624,7 +626,6 @@ export async function createBenchmarkWorktree(
   const removePath = deps.removePath ?? rm;
   const run = deps.runGit ?? runGit;
   const path = await createTempDir(join(tmpdir(), `fedify-bench-${label}-`));
-  await removePath(path, { recursive: true, force: true });
   try {
     await run(["worktree", "add", "--detach", path, ref]);
   } catch (error) {

From 41d38f42bc95d5a9a1f3f41324e0c130b7067f11 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 18:58:49 +0900
Subject: [PATCH 16/19] Harden benchmark target cleanup

Race target exits against readiness and the benchmark run, make
interrupted comparisons stop the active target and clean up temporary
worktrees, and reject cleanly if forced target termination throws.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407756635
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407756637
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407762309

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 112 +++++++++++++++++
 packages/cli/src/bench/compare.ts      | 166 ++++++++++++++++++++++---
 2 files changed, 260 insertions(+), 18 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 98d29838a..33a87c2f9 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -617,6 +617,24 @@ test("stopTargetProcess - rejects when forced kill does not exit", async () => {
   );
 });
 
+test("stopTargetProcess - rejects when forced kill throws", async () => {
+  const child = fakeChildProcess(4321);
+  child.kill = () => true;
+  await assert.rejects(
+    stopTargetProcess(child, {
+      platform: "win32",
+      forceTimeoutMs: 1,
+      forceKillTimeoutMs: 10,
+      killWindowsProcessTree: (_pid, signal) => {
+        if (signal === "SIGKILL") {
+          throw new Error("forced kill failed");
+        }
+      },
+    }),
+    /forced kill failed/,
+  );
+});
+
 test("stopTargetProcess - resolves immediately without a pid", async () => {
   const child = fakeChildProcess();
   Object.defineProperty(child, "pid", { value: undefined });
@@ -790,6 +808,100 @@ test("runBenchCompare - orchestrates worktrees and cleans up", async () => {
   ]);
 });
 
+test("runBenchCompare - stops target and removes worktree on interrupt", async () => {
+  const signals = new EventEmitter();
+  const events: string[] = [];
+  let code = -1;
+  await runBenchCompare(command({}), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: () => {
+      events.push("write");
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd) => {
+      events.push(`start:${cwd}`);
+      queueMicrotask(() => signals.emit("SIGINT", "SIGINT"));
+      return Promise.resolve({
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: () => new Promise(() => {}),
+    runBenchInWorktree: () => {
+      events.push("bench");
+      return Promise.resolve(report([scenario()]));
+    },
+    signalTarget: signals,
+  });
+  assert.strictEqual(code, 130);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "start:/tmp/base",
+    "stop:/tmp/base",
+    "remove:/tmp/base",
+  ]);
+  assert.strictEqual(signals.listenerCount("SIGINT"), 0);
+  assert.strictEqual(signals.listenerCount("SIGTERM"), 0);
+});
+
+test("runBenchCompare - fails when target exits before readiness", async () => {
+  const events: string[] = [];
+  let code = -1;
+  await runBenchCompare(command({}), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: () => {
+      events.push("write");
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd) => {
+      events.push(`start:${cwd}`);
+      return Promise.resolve({
+        exited: Promise.reject(new Error("target exited early")),
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: () => {
+      events.push("ready");
+      return Promise.resolve();
+    },
+    runBenchInWorktree: () => {
+      events.push("bench");
+      return Promise.resolve(report([scenario()]));
+    },
+  });
+  assert.strictEqual(code, 2);
+  assert.match(events.join("\n"), /target exited early/);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "start:/tmp/base",
+    "ready",
+    "stop:/tmp/base",
+    "log:target exited early",
+    "remove:/tmp/base",
+  ]);
+});
+
 test("runBenchCompare - does not treat derived target as explicit", async () => {
   const file = await writeSuite(`version: 1
 target: https://example.com
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 535c99308..45689357b 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -79,10 +79,12 @@ export interface RunBenchCompareDeps {
     input: RunBenchInWorktreeInput,
   ) => Promise<BenchReport>;
   readonly benchDeps?: RunBenchDeps;
+  readonly signalTarget?: SignalTarget;
 }
 
 /** A started target process. */
 export interface StartedTarget {
+  readonly exited?: Promise<never>;
   stop(): Promise<void>;
 }
 
@@ -106,6 +108,14 @@ type BenchRunCompareCommand = BenchRunCommand & {
   readonly explicitCliTarget: boolean;
 };
 
+type BenchmarkSignal = "SIGINT" | "SIGTERM";
+type SignalListener = (signal: BenchmarkSignal) => void;
+
+interface SignalTarget {
+  on(signal: BenchmarkSignal, listener: SignalListener): unknown;
+  off(signal: BenchmarkSignal, listener: SignalListener): unknown;
+}
+
 /** Options for starting a benchmark target. */
 export interface StartBenchmarkTargetOptions {
   readonly platform?: NodeJS.Platform;
@@ -162,6 +172,7 @@ export async function runBenchCompare(
   const waitReady = deps.waitReady ?? defaultWaitReady;
   const runBenchInWorktree = deps.runBenchInWorktree ??
     ((input) => defaultRunBenchInWorktree(input, deps.benchDeps));
+  const signalTarget = deps.signalTarget ?? process;
 
   let readyUrl: URL;
   let readyTimeoutMs: number;
@@ -177,9 +188,27 @@ export async function runBenchCompare(
   const target = command.target ?? new URL("/", readyUrl).origin;
   const worktrees: string[] = [];
   const startedAt = new Date().toISOString();
+  let activeTarget: StartedTarget | undefined;
+  let interruptError: BenchmarkInterrupted | undefined;
+  let rejectInterrupt!: (error: BenchmarkInterrupted) => void;
+  const interruptPromise = new Promise<never>((_resolve, reject) => {
+    rejectInterrupt = reject;
+  });
+  void interruptPromise.catch(() => {});
+  let interrupted = false;
+  const onSignal: SignalListener = (signal) => {
+    if (interrupted) return;
+    interrupted = true;
+    interruptError = new BenchmarkInterrupted(signal);
+    rejectInterrupt(interruptError);
+  };
+  signalTarget.on("SIGINT", onSignal);
+  signalTarget.on("SIGTERM", onSignal);
   try {
     const baseReport = await runSide("base", command.base);
+    throwIfInterrupted();
     const headReport = await runSide("head", command.head);
+    throwIfInterrupted();
     const report = buildCompareReport({
       baseRef: command.base,
       headRef: command.head,
@@ -189,15 +218,29 @@ export async function runBenchCompare(
       startedAt,
       finishedAt: new Date().toISOString(),
     });
-    await writeOutput(
+    throwIfInterrupted();
+    await withInterrupt(writeOutput(
       renderCompareReport(report, command.format),
       command.output,
-    );
+    ));
+    throwIfInterrupted();
     return void exit(report.passed ? 0 : 1);
   } catch (error) {
+    if (error instanceof BenchmarkInterrupted) {
+      return void exit(error.exitCode);
+    }
     log(describeError(error));
     return void exit(2);
   } finally {
+    if (activeTarget != null) {
+      try {
+        await activeTarget.stop();
+      } catch (error) {
+        log(`Failed to stop benchmark target: ${describeError(error)}`);
+      } finally {
+        activeTarget = undefined;
+      }
+    }
     for (let i = worktrees.length - 1; i >= 0; i--) {
       const path = worktrees[i];
       try {
@@ -210,6 +253,8 @@ export async function runBenchCompare(
         );
       }
     }
+    signalTarget.off("SIGINT", onSignal);
+    signalTarget.off("SIGTERM", onSignal);
   }
 
   async function runSide(
@@ -219,14 +264,53 @@ export async function runBenchCompare(
     log(`Checking out ${label} benchmark ref ${ref}…`);
     const cwd = await createWorktree(ref, label);
     worktrees.push(cwd);
+    throwIfInterrupted();
     const targetProcess = await startTarget(cwd, command.startCommand);
+    activeTarget = targetProcess;
     try {
-      await waitReady(readyUrl, readyTimeoutMs);
-      return await runBenchInWorktree({ cwd, command, target });
+      throwIfInterrupted();
+      await withInterrupt(
+        Promise.race([
+          targetProcessExited(targetProcess),
+          waitReady(readyUrl, readyTimeoutMs),
+        ]),
+      );
+      return await withInterrupt(
+        Promise.race([
+          targetProcessExited(targetProcess),
+          runBenchInWorktree({ cwd, command, target }),
+        ]),
+      );
     } finally {
-      await targetProcess.stop();
+      try {
+        await targetProcess.stop();
+      } finally {
+        if (activeTarget === targetProcess) activeTarget = undefined;
+      }
     }
   }
+
+  function withInterrupt<T>(promise: Promise<T>): Promise<T> {
+    return Promise.race([interruptPromise, promise]);
+  }
+
+  function throwIfInterrupted(): void {
+    if (interruptError != null) throw interruptError;
+  }
+}
+
+class BenchmarkInterrupted extends Error {
+  constructor(readonly signal: BenchmarkSignal) {
+    super(`Interrupted by ${signal}.`);
+  }
+
+  get exitCode(): number {
+    return this.signal === "SIGINT" ? 130 : 143;
+  }
+}
+
+function targetProcessExited(target: StartedTarget): Promise<never> {
+  return target.exited ?? new Promise<never>(() => {});
 }
 
 /** Parses `--max-regression`, accepting ratios or percentages. */
@@ -683,7 +767,12 @@ export function startBenchmarkTarget(
     env: process.env,
   });
   forwardTargetOutput(child, stderr);
-  return { stop: () => stopTargetProcess(child, { platform }) };
+  const exited = createTargetExitPromise(child);
+  void exited.catch(() => {});
+  return {
+    exited,
+    stop: () => stopTargetProcess(child, { platform }),
+  };
 }
 
 function forwardTargetOutput(child: ChildProcess, stderr: ProcessOutput): void {
@@ -695,6 +784,29 @@ function forwardTargetOutput(child: ChildProcess, stderr: ProcessOutput): void {
   });
 }
 
+function createTargetExitPromise(child: ChildProcess): Promise<never> {
+  return new Promise<never>((_resolve, reject) => {
+    const onError = (error: Error) => {
+      child.removeListener("exit", onExit);
+      reject(error);
+    };
+    const onExit = (code: number | null, signal: NodeJS.Signals | null) => {
+      child.removeListener("error", onError);
+      const suffix = signal == null
+        ? ` with code ${code ?? "<unknown>"}`
+        : ` from ${signal}`;
+      reject(
+        new Error(
+          `Benchmark target process ${child.pid ?? "<unknown>"} exited` +
+            `${suffix} before benchmark completion.`,
+        ),
+      );
+    };
+    child.once("error", onError);
+    child.once("exit", onExit);
+  });
+}
+
 /** Stops a benchmark target process. */
 export function stopTargetProcess(
   child: ChildProcess,
@@ -714,24 +826,38 @@ export function stopTargetProcess(
       resolve();
       return;
     }
+    let settled = false;
     let forceKillTimer: ReturnType<typeof setTimeout> | undefined;
     const clearTimers = () => {
       clearTimeout(forceTimer);
       if (forceKillTimer != null) clearTimeout(forceKillTimer);
     };
     const onExit = () => {
+      if (settled) return;
+      settled = true;
       clearTimers();
       resolve();
     };
+    const rejectStop = (error: unknown) => {
+      if (settled) return;
+      settled = true;
+      clearTimers();
+      child.removeListener("exit", onExit);
+      reject(error);
+    };
     const forceTimer = setTimeout(() => {
-      killTargetProcess(child, "SIGKILL", {
-        platform,
-        killWindowsProcessTree,
-        killProcessGroup,
-      });
+      try {
+        killTargetProcess(child, "SIGKILL", {
+          platform,
+          killWindowsProcessTree,
+          killProcessGroup,
+        });
+      } catch (error) {
+        rejectStop(error);
+        return;
+      }
       forceKillTimer = setTimeout(() => {
-        child.removeListener("exit", onExit);
-        reject(
+        rejectStop(
           new Error(
             `Benchmark target process ${child.pid ?? "<unknown>"} ` +
               "did not exit after SIGKILL.",
@@ -740,11 +866,15 @@ export function stopTargetProcess(
       }, forceKillTimeoutMs);
     }, forceTimeoutMs);
     child.once("exit", onExit);
-    killTargetProcess(child, "SIGTERM", {
-      platform,
-      killWindowsProcessTree,
-      killProcessGroup,
-    });
+    try {
+      killTargetProcess(child, "SIGTERM", {
+        platform,
+        killWindowsProcessTree,
+        killProcessGroup,
+      });
+    } catch (error) {
+      rejectStop(error);
+    }
   });
 }
 

From 356539c6c871384d14c5af53da2a20e5fa307a91 Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Sat, 13 Jun 2026 20:29:48 +0900
Subject: [PATCH 17/19] Cancel interrupted benchmark work

Carry interrupt and target-exit cancellation through compare readiness,
in-process benchmark runs, runner fetches, and load-generator sleeps so
raced benchmark work does not continue after cleanup starts.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3407801291
https://github.com/fedify-dev/fedify/pull/804#discussion_r3407804829

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/action.ts              |  36 ++++-
 packages/cli/src/bench/compare.test.ts        | 124 +++++++++++++++++-
 packages/cli/src/bench/compare.ts             |  90 +++++++++++--
 packages/cli/src/bench/load/clock.ts          |  25 +++-
 packages/cli/src/bench/load/generator.test.ts |  18 +++
 packages/cli/src/bench/load/generator.ts      |  54 ++++++--
 packages/cli/src/bench/result/build.ts        |   5 +-
 packages/cli/src/bench/scenarios/failure.ts   |   1 +
 packages/cli/src/bench/scenarios/fanout.ts    |   1 +
 packages/cli/src/bench/scenarios/inbox.ts     |   1 +
 packages/cli/src/bench/scenarios/read.ts      |   1 +
 packages/cli/src/bench/scenarios/runner.ts    |   2 +
 packages/cli/src/bench/scenarios/webfinger.ts |   1 +
 13 files changed, 334 insertions(+), 25 deletions(-)

diff --git a/packages/cli/src/bench/action.ts b/packages/cli/src/bench/action.ts
index c100d22b0..401e36d00 100644
--- a/packages/cli/src/bench/action.ts
+++ b/packages/cli/src/bench/action.ts
@@ -70,6 +70,8 @@ export interface RunBenchDeps {
   readonly fetch?: typeof fetch;
   /** Hostname resolver used for target risk classification. */
   readonly resolveTargetAddresses?: ResolveTargetAddresses;
+  /** Aborts in-flight benchmark work. */
+  readonly signal?: AbortSignal;
 }
 
 /**
@@ -95,8 +97,13 @@ export default async function runBench(
   // Apply the configured User-Agent to all benchmark traffic — the probe, the
   // stats reads, and the runners' inbox/WebFinger requests — not just the
   // document loader, so a target that inspects the UA sees it on every request.
-  const fetchImpl = withUserAgent(deps.fetch ?? fetch, command.userAgent);
+  const signal = deps.signal;
+  const fetchImpl = withUserAgent(
+    withAbortSignal(deps.fetch ?? fetch, signal),
+    command.userAgent,
+  );
   const explicitCliTarget = command.explicitCliTarget ?? command.target != null;
+  throwIfAborted(signal);
 
   // Loading, validation, and normalization failures are all user-facing
   // configuration errors.
@@ -111,6 +118,7 @@ export default async function runBench(
     log(describeError(error));
     return void exit(2);
   }
+  throwIfAborted(signal);
 
   // Preflight every runner so an unsupported scenario type, an option the
   // runner cannot honor, or a malformed `expect` assertion fails fast, before
@@ -130,12 +138,15 @@ export default async function runBench(
     log(describeError(error));
     return void exit(2);
   }
+  throwIfAborted(signal);
 
   const tier = await classifyResolvedTarget(
     suite.target,
     deps.resolveTargetAddresses,
   );
+  throwIfAborted(signal);
   const probe = await probeBenchmarkMode(suite.target, fetchImpl);
+  throwIfAborted(signal);
   try {
     if (!command.dryRun) {
       assertUnsafeOverrideAllowed({
@@ -293,6 +304,7 @@ export default async function runBench(
   let fleet: SyntheticServer | undefined;
   const startedAt = new Date().toISOString();
   try {
+    throwIfAborted(signal);
     if (
       suite.scenarios.some((scenario) =>
         scenarioNeedsSyntheticServer(scenario, suite.scenarios)
@@ -307,6 +319,7 @@ export default async function runBench(
       const scenario = suite.scenarios[i];
       const measurements: ScenarioMeasurement[] = [];
       for (let run = 1; run <= scenario.runs; run++) {
+        throwIfAborted(signal);
         const suffix = scenario.runs === 1
           ? ""
           : ` run ${run}/${scenario.runs}`;
@@ -328,8 +341,10 @@ export default async function runBench(
               assertReadDestinationAllowed(url, gateScenario ?? scenario),
             assertActorlessDestinationAllowed: (url, gateScenario) =>
               assertActorlessDestinationAllowed(url, gateScenario ?? scenario),
+            signal,
           }),
         );
+        throwIfAborted(signal);
       }
       results.push(buildScenarioResult(scenario, measurements));
     }
@@ -406,6 +421,25 @@ export function withUserAgent(
   }) as typeof fetch;
 }
 
+function withAbortSignal(
+  fetchImpl: typeof fetch,
+  signal: AbortSignal | undefined,
+): typeof fetch {
+  if (signal == null) return fetchImpl;
+  return ((input: URL | RequestInfo, init?: RequestInit) => {
+    if (signal.aborted) return Promise.reject(abortReason(signal));
+    return fetchImpl(input, { ...init, signal });
+  }) as typeof fetch;
+}
+
+function throwIfAborted(signal: AbortSignal | undefined): void {
+  if (signal?.aborted) throw abortReason(signal);
+}
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Benchmark run aborted.");
+}
+
 async function defaultWriteOutput(
   content: string,
   outputPath: string | undefined,
diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index 33a87c2f9..f48e2ccd1 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -828,7 +828,6 @@ test("runBenchCompare - stops target and removes worktree on interrupt", async (
     },
     startTarget: (cwd) => {
       events.push(`start:${cwd}`);
-      queueMicrotask(() => signals.emit("SIGINT", "SIGINT"));
       return Promise.resolve({
         stop: () => {
           events.push(`stop:${cwd}`);
@@ -836,7 +835,16 @@ test("runBenchCompare - stops target and removes worktree on interrupt", async (
         },
       });
     },
-    waitReady: () => new Promise(() => {}),
+    waitReady: (_url, _timeoutMs, signal) => {
+      assert.ok(signal);
+      return new Promise((_resolve, reject) => {
+        signal.addEventListener("abort", () => {
+          events.push("ready-abort");
+          reject(signal.reason);
+        }, { once: true });
+        queueMicrotask(() => signals.emit("SIGINT", "SIGINT"));
+      });
+    },
     runBenchInWorktree: () => {
       events.push("bench");
       return Promise.resolve(report([scenario()]));
@@ -847,6 +855,7 @@ test("runBenchCompare - stops target and removes worktree on interrupt", async (
   assert.deepEqual(events, [
     "log:Checking out base benchmark ref origin/main…",
     "start:/tmp/base",
+    "ready-abort",
     "stop:/tmp/base",
     "remove:/tmp/base",
   ]);
@@ -854,6 +863,117 @@ test("runBenchCompare - stops target and removes worktree on interrupt", async (
   assert.strictEqual(signals.listenerCount("SIGTERM"), 0);
 });
 
+test("runBenchCompare - aborts raced benchmark work on interrupt", async () => {
+  const signals = new EventEmitter();
+  const events: string[] = [];
+  let code = -1;
+  await runBenchCompare(command({}), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: () => {
+      events.push("write");
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd) => {
+      events.push(`start:${cwd}`);
+      return Promise.resolve({
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: () => {
+      events.push("ready");
+      return Promise.resolve();
+    },
+    runBenchInWorktree: ({ signal }) => {
+      assert.ok(signal);
+      queueMicrotask(() => signals.emit("SIGTERM", "SIGTERM"));
+      return new Promise((_resolve, reject) => {
+        signal.addEventListener("abort", () => {
+          events.push("bench-abort");
+          reject(signal.reason);
+        }, { once: true });
+      });
+    },
+    signalTarget: signals,
+  });
+  assert.strictEqual(code, 143);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "start:/tmp/base",
+    "ready",
+    "bench-abort",
+    "stop:/tmp/base",
+    "remove:/tmp/base",
+  ]);
+});
+
+test("runBenchCompare - aborts benchmark work when target exits", async () => {
+  const events: string[] = [];
+  let rejectExit!: (error: Error) => void;
+  let code = -1;
+  await runBenchCompare(command({}), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: () => {
+      events.push("write");
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd) => {
+      events.push(`start:${cwd}`);
+      return Promise.resolve({
+        exited: new Promise<never>((_resolve, reject) => {
+          rejectExit = reject;
+        }),
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: () => {
+      events.push("ready");
+      return Promise.resolve();
+    },
+    runBenchInWorktree: ({ signal }) => {
+      assert.ok(signal);
+      queueMicrotask(() => rejectExit(new Error("target exited")));
+      return new Promise((_resolve, reject) => {
+        signal.addEventListener("abort", () => {
+          events.push("bench-abort");
+          reject(signal.reason);
+        }, { once: true });
+      });
+    },
+  });
+  assert.strictEqual(code, 2);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "start:/tmp/base",
+    "ready",
+    "bench-abort",
+    "stop:/tmp/base",
+    "log:target exited",
+    "remove:/tmp/base",
+  ]);
+});
+
 test("runBenchCompare - fails when target exits before readiness", async () => {
   const events: string[] = [];
   let code = -1;
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 45689357b..3c442d3c3 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -74,7 +74,11 @@ export interface RunBenchCompareDeps {
     cwd: string,
     startCommand: string,
   ) => Promise<StartedTarget>;
-  readonly waitReady?: (url: URL, timeoutMs: number) => Promise<void>;
+  readonly waitReady?: (
+    url: URL,
+    timeoutMs: number,
+    signal?: AbortSignal,
+  ) => Promise<void>;
   readonly runBenchInWorktree?: (
     input: RunBenchInWorktreeInput,
   ) => Promise<BenchReport>;
@@ -93,6 +97,7 @@ export interface RunBenchInWorktreeInput {
   readonly cwd: string;
   readonly command: BenchCompareCommand;
   readonly target: string;
+  readonly signal?: AbortSignal;
 }
 
 type ProcessOutput = {
@@ -138,7 +143,8 @@ export interface StopTargetProcessOptions {
 /** Dependencies for waiting until a benchmark target is ready. */
 export interface WaitReadyUrlDeps {
   readonly fetch?: typeof fetch;
-  readonly sleep?: (ms: number) => Promise<void>;
+  readonly sleep?: (ms: number, signal?: AbortSignal) => Promise<void>;
+  readonly signal?: AbortSignal;
 }
 
 type CreateTempDir = (prefix: string) => Promise<string>;
@@ -190,6 +196,8 @@ export async function runBenchCompare(
   const startedAt = new Date().toISOString();
   let activeTarget: StartedTarget | undefined;
   let interruptError: BenchmarkInterrupted | undefined;
+  const interruptController = new AbortController();
+  const interruptSignal = interruptController.signal;
   let rejectInterrupt!: (error: BenchmarkInterrupted) => void;
   const interruptPromise = new Promise<never>((_resolve, reject) => {
     rejectInterrupt = reject;
@@ -200,6 +208,7 @@ export async function runBenchCompare(
     if (interrupted) return;
     interrupted = true;
     interruptError = new BenchmarkInterrupted(signal);
+    interruptController.abort(interruptError);
     rejectInterrupt(interruptError);
   };
   signalTarget.on("SIGINT", onSignal);
@@ -267,22 +276,42 @@ export async function runBenchCompare(
     throwIfInterrupted();
     const targetProcess = await startTarget(cwd, command.startCommand);
     activeTarget = targetProcess;
+    let stoppingTarget = false;
+    let targetExitError: unknown;
+    const targetExit = targetProcessExited(targetProcess).catch((error) => {
+      targetExitError = error;
+      if (!stoppingTarget && !interruptSignal.aborted) {
+        interruptController.abort(error);
+      }
+      throw error;
+    });
+    const throwIfTargetExited = () => {
+      if (targetExitError != null) throw targetExitError;
+    };
     try {
       throwIfInterrupted();
       await withInterrupt(
         Promise.race([
-          targetProcessExited(targetProcess),
-          waitReady(readyUrl, readyTimeoutMs),
+          targetExit,
+          waitReady(readyUrl, readyTimeoutMs, interruptSignal),
         ]),
       );
+      await Promise.resolve();
+      throwIfTargetExited();
       return await withInterrupt(
         Promise.race([
-          targetProcessExited(targetProcess),
-          runBenchInWorktree({ cwd, command, target }),
+          targetExit,
+          runBenchInWorktree({
+            cwd,
+            command,
+            target,
+            signal: interruptSignal,
+          }),
         ]),
       );
     } finally {
       try {
+        stoppingTarget = true;
         await targetProcess.stop();
       } finally {
         if (activeTarget === targetProcess) activeTarget = undefined;
@@ -679,6 +708,7 @@ async function defaultRunBenchInWorktree(
   };
   await runBench(runCommand, {
     ...benchDeps,
+    signal: input.signal,
     exit: (code) => {
       exitCode = code;
     },
@@ -928,8 +958,12 @@ export function windowsTaskkillArgs(
   return args;
 }
 
-async function defaultWaitReady(url: URL, timeoutMs: number): Promise<void> {
-  return await waitReadyUrl(url, timeoutMs);
+async function defaultWaitReady(
+  url: URL,
+  timeoutMs: number,
+  signal?: AbortSignal,
+): Promise<void> {
+  return await waitReadyUrl(url, timeoutMs, { signal });
 }
 
 /** Waits until a benchmark target readiness URL responds successfully. */
@@ -940,13 +974,19 @@ export async function waitReadyUrl(
 ): Promise<void> {
   const fetchReady = deps.fetch ?? fetch;
   const sleep = deps.sleep ??
-    ((ms) => new Promise<void>((resolve) => setTimeout(resolve, ms)));
+    ((ms, signal) => abortableSleep(ms, signal));
+  const signal = deps.signal;
   const deadline = Date.now() + timeoutMs;
   let lastError: unknown;
   while (Date.now() <= deadline) {
+    throwIfAborted(signal);
     const remainingMs = deadline - Date.now();
     if (remainingMs <= 0) break;
     const controller = new AbortController();
+    const onAbort = () => {
+      controller.abort(abortReason(signal!));
+    };
+    signal?.addEventListener("abort", onAbort, { once: true });
     const timer = setTimeout(() => {
       controller.abort(new Error(`ready URL timed out after ${timeoutMs}ms`));
     }, remainingMs);
@@ -956,22 +996,52 @@ export async function waitReadyUrl(
       if (response.status >= 200 && response.status < 400) return;
       lastError = new Error(`ready URL returned ${response.status}`);
     } catch (error) {
+      if (signal?.aborted) throw abortReason(signal);
       if (controller.signal.aborted) {
         lastError = controller.signal.reason ?? error;
         break;
       }
       lastError = error;
     } finally {
+      signal?.removeEventListener("abort", onAbort);
       clearTimeout(timer);
     }
     const delayMs = Math.min(250, deadline - Date.now());
-    if (delayMs > 0) await sleep(delayMs);
+    if (delayMs > 0) await sleep(delayMs, signal);
   }
   throw new Error(
     `Timed out waiting for ${url.href}: ${describeError(lastError)}.`,
   );
 }
 
+function abortableSleep(ms: number, signal?: AbortSignal): Promise<void> {
+  if (signal?.aborted) return Promise.reject(abortReason(signal));
+  if (ms <= 0) return Promise.resolve();
+  return new Promise((resolve, reject) => {
+    const timer = setTimeout(() => {
+      cleanup();
+      resolve();
+    }, ms);
+    const onAbort = () => {
+      clearTimeout(timer);
+      cleanup();
+      reject(abortReason(signal!));
+    };
+    const cleanup = () => {
+      signal?.removeEventListener("abort", onAbort);
+    };
+    signal?.addEventListener("abort", onAbort, { once: true });
+  });
+}
+
+function throwIfAborted(signal: AbortSignal | undefined): void {
+  if (signal?.aborted) throw abortReason(signal);
+}
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Benchmark comparison aborted.");
+}
+
 async function defaultWriteOutput(
   content: string,
   outputPath: string | undefined,
diff --git a/packages/cli/src/bench/load/clock.ts b/packages/cli/src/bench/load/clock.ts
index 848e0bdce..95eaf2d0e 100644
--- a/packages/cli/src/bench/load/clock.ts
+++ b/packages/cli/src/bench/load/clock.ts
@@ -10,17 +10,36 @@ export interface Clock {
   /** The current time in milliseconds (monotonic, not wall-clock). */
   now(): number;
   /** Resolves once the clock reaches `timeMs` (or immediately if already past). */
-  sleepUntil(timeMs: number): Promise<void>;
+  sleepUntil(timeMs: number, signal?: AbortSignal): Promise<void>;
 }
 
 /** Returns a clock backed by `performance.now()` and `setTimeout`. */
 export function systemClock(): Clock {
   return {
     now: () => performance.now(),
-    sleepUntil(timeMs: number): Promise<void> {
+    sleepUntil(timeMs: number, signal?: AbortSignal): Promise<void> {
+      if (signal?.aborted) return Promise.reject(abortReason(signal));
       const remaining = timeMs - performance.now();
       if (remaining <= 0) return Promise.resolve();
-      return new Promise((resolve) => setTimeout(resolve, remaining));
+      return new Promise((resolve, reject) => {
+        const timer = setTimeout(() => {
+          cleanup();
+          resolve();
+        }, remaining);
+        const onAbort = () => {
+          clearTimeout(timer);
+          cleanup();
+          reject(abortReason(signal!));
+        };
+        const cleanup = () => {
+          signal?.removeEventListener("abort", onAbort);
+        };
+        signal?.addEventListener("abort", onAbort, { once: true });
+      });
     },
   };
 }
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Operation aborted.");
+}
diff --git a/packages/cli/src/bench/load/generator.test.ts b/packages/cli/src/bench/load/generator.test.ts
index c23567f26..9ce56aa97 100644
--- a/packages/cli/src/bench/load/generator.test.ts
+++ b/packages/cli/src/bench/load/generator.test.ts
@@ -153,3 +153,21 @@ test("runLoad - records send exceptions as failed samples", async () => {
   assert.ok(result.samples.every((s) => !s.outcome.ok));
   assert.ok(result.samples.every((s) => s.outcome.errorKind === "exception"));
 });
+
+test("runLoad - aborts scheduled sleeps", async () => {
+  const controller = new AbortController();
+  const startedAt = Date.now();
+  const load = runLoad(
+    {
+      load: { kind: "open", ratePerSec: 1, arrival: "constant" },
+      durationMs: 10_000,
+      warmupMs: 0,
+    },
+    () => Promise.resolve(ok),
+    undefined,
+    controller.signal,
+  );
+  setTimeout(() => controller.abort(new Error("cancelled")), 10);
+  await assert.rejects(load, /cancelled/);
+  assert.ok(Date.now() - startedAt < 1000);
+});
diff --git a/packages/cli/src/bench/load/generator.ts b/packages/cli/src/bench/load/generator.ts
index 7b05caae1..a4744a9a3 100644
--- a/packages/cli/src/bench/load/generator.ts
+++ b/packages/cli/src/bench/load/generator.ts
@@ -72,10 +72,11 @@ export function runLoad(
   plan: LoadPlan,
   send: SendFunction,
   clock: Clock = systemClock(),
+  signal?: AbortSignal,
 ): Promise<LoadResult> {
   return plan.load.kind === "open"
-    ? runOpenLoop(plan, plan.load, send, clock)
-    : runClosedLoop(plan, plan.load, send, clock);
+    ? runOpenLoop(plan, plan.load, send, clock, signal)
+    : runClosedLoop(plan, plan.load, send, clock, signal);
 }
 
 async function runOpenLoop(
@@ -83,6 +84,7 @@ async function runOpenLoop(
   load: Extract<LoadModel, { kind: "open" }>,
   send: SendFunction,
   clock: Clock,
+  signal: AbortSignal | undefined,
 ): Promise<LoadResult> {
   const arrivals = scheduleArrivals({
     ratePerSec: load.ratePerSec,
@@ -98,8 +100,13 @@ async function runOpenLoop(
   // bounded by the in-flight count rather than the total request count.
   const active = new Set<Promise<void>>();
   for (const offset of arrivals) {
-    await clock.sleepUntil(start + offset);
-    if (await slots.acquire()) saturated = true;
+    throwIfAborted(signal);
+    await clock.sleepUntil(start + offset, signal);
+    if (await slots.acquire(signal)) saturated = true;
+    if (signal?.aborted) {
+      slots.release();
+      throw abortReason(signal);
+    }
     const dispatched = dispatch(
       send,
       offset,
@@ -123,6 +130,7 @@ async function runClosedLoop(
   load: Extract<LoadModel, { kind: "closed" }>,
   send: SendFunction,
   clock: Clock,
+  signal: AbortSignal | undefined,
 ): Promise<LoadResult> {
   const samples: Sample[] = [];
   const slots = createSemaphore(load.maxInFlight);
@@ -131,7 +139,12 @@ async function runClosedLoop(
   const deadline = start + plan.durationMs;
   async function worker(): Promise<void> {
     while (clock.now() < deadline) {
-      if (await slots.acquire()) saturated = true;
+      throwIfAborted(signal);
+      if (await slots.acquire(signal)) saturated = true;
+      if (signal?.aborted) {
+        slots.release();
+        throw abortReason(signal);
+      }
       if (clock.now() >= deadline) {
         slots.release();
         break;
@@ -176,7 +189,7 @@ async function dispatch(
 
 interface Semaphore {
   /** Acquires a slot; resolves `true` if it had to wait (backpressure). */
-  acquire(): Promise<boolean>;
+  acquire(signal?: AbortSignal): Promise<boolean>;
   /** Releases a slot, transferring it to the next waiter if any. */
   release(): void;
 }
@@ -188,7 +201,8 @@ function createSemaphore(max: number | undefined): Semaphore {
   let count = 0;
   const queue: Array<() => void> = [];
   return {
-    acquire(): Promise<boolean> {
+    acquire(signal?: AbortSignal): Promise<boolean> {
+      throwIfAborted(signal);
       if (count < max) {
         count++;
         return Promise.resolve(false);
@@ -196,7 +210,23 @@ function createSemaphore(max: number | undefined): Semaphore {
       // Wait in FIFO order; release() transfers the slot to us directly
       // (count is not decremented), so an active worker cannot barge ahead of
       // a queued one.
-      return new Promise<boolean>((resolve) => queue.push(() => resolve(true)));
+      return new Promise<boolean>((resolve, reject) => {
+        const waiter = () => {
+          cleanup();
+          resolve(true);
+        };
+        const onAbort = () => {
+          const index = queue.indexOf(waiter);
+          if (index >= 0) queue.splice(index, 1);
+          cleanup();
+          reject(abortReason(signal!));
+        };
+        const cleanup = () => {
+          signal?.removeEventListener("abort", onAbort);
+        };
+        signal?.addEventListener("abort", onAbort, { once: true });
+        queue.push(waiter);
+      });
     },
     release(): void {
       const next = queue.shift();
@@ -205,3 +235,11 @@ function createSemaphore(max: number | undefined): Semaphore {
     },
   };
 }
+
+function throwIfAborted(signal: AbortSignal | undefined): void {
+  if (signal?.aborted) throw abortReason(signal);
+}
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Benchmark load aborted.");
+}
diff --git a/packages/cli/src/bench/result/build.ts b/packages/cli/src/bench/result/build.ts
index d5e3592f3..7fee0ba85 100644
--- a/packages/cli/src/bench/result/build.ts
+++ b/packages/cli/src/bench/result/build.ts
@@ -254,7 +254,10 @@ function partialField(
       : { [key]: median(present) } as Record<typeof key, number>;
   }
   return fieldValues.every(isNumber)
-    ? { [key]: median(fieldValues) } as Record<typeof key, number>
+    ? { [key]: median(fieldValues as readonly number[]) } as Record<
+      typeof key,
+      number
+    >
     : {};
 }
 
diff --git a/packages/cli/src/bench/scenarios/failure.ts b/packages/cli/src/bench/scenarios/failure.ts
index 323d96e5e..88e31ad38 100644
--- a/packages/cli/src/bench/scenarios/failure.ts
+++ b/packages/cli/src/bench/scenarios/failure.ts
@@ -142,6 +142,7 @@ export const failureRunner: ScenarioRunner = {
         loadPlanOf(context.scenario, context.rng),
         send,
         context.clock,
+        context.signal,
       );
       return aggregateSamples(result.samples, {
         measuredWindowMs: measuredWindowMs(context.scenario),
diff --git a/packages/cli/src/bench/scenarios/fanout.ts b/packages/cli/src/bench/scenarios/fanout.ts
index 045fbff96..e0c2e248b 100644
--- a/packages/cli/src/bench/scenarios/fanout.ts
+++ b/packages/cli/src/bench/scenarios/fanout.ts
@@ -133,6 +133,7 @@ export const fanoutRunner: ScenarioRunner = {
         loadPlanOf(context.scenario, context.rng),
         send,
         context.clock,
+        context.signal,
       );
       const measurement = aggregateSamples(result.samples, {
         measuredWindowMs: measuredWindowMs(context.scenario),
diff --git a/packages/cli/src/bench/scenarios/inbox.ts b/packages/cli/src/bench/scenarios/inbox.ts
index 4bf8379d0..7718d13fd 100644
--- a/packages/cli/src/bench/scenarios/inbox.ts
+++ b/packages/cli/src/bench/scenarios/inbox.ts
@@ -137,6 +137,7 @@ export const inboxRunner: ScenarioRunner = {
         loadPlanOf(scenario, context.rng),
         send,
         context.clock,
+        context.signal,
       );
       const measurement = aggregateSamples(result.samples, {
         measuredWindowMs: measuredWindowMs(scenario),
diff --git a/packages/cli/src/bench/scenarios/read.ts b/packages/cli/src/bench/scenarios/read.ts
index 0c7f24b55..aacd15518 100644
--- a/packages/cli/src/bench/scenarios/read.ts
+++ b/packages/cli/src/bench/scenarios/read.ts
@@ -123,6 +123,7 @@ export async function runReadLoad(
       loadPlanOf(context.scenario, context.rng),
       send,
       context.clock,
+      context.signal,
     );
     const measurement = aggregateSamples(result.samples, {
       measuredWindowMs: measuredWindowMs(context.scenario),
diff --git a/packages/cli/src/bench/scenarios/runner.ts b/packages/cli/src/bench/scenarios/runner.ts
index eb1ce970d..c5ccd5477 100644
--- a/packages/cli/src/bench/scenarios/runner.ts
+++ b/packages/cli/src/bench/scenarios/runner.ts
@@ -31,6 +31,8 @@ export interface RunContext {
   readonly rng?: Rng;
   /** Fetch implementation (overridable for tests). */
   readonly fetch?: typeof fetch;
+  /** Aborts in-flight benchmark work when the orchestrator is interrupted. */
+  readonly signal?: AbortSignal;
   /** Host advertised for local benchmark-owned servers. */
   readonly advertiseHost?: string;
   /**
diff --git a/packages/cli/src/bench/scenarios/webfinger.ts b/packages/cli/src/bench/scenarios/webfinger.ts
index 14b77a093..3b2fbb049 100644
--- a/packages/cli/src/bench/scenarios/webfinger.ts
+++ b/packages/cli/src/bench/scenarios/webfinger.ts
@@ -64,6 +64,7 @@ export const webfingerRunner: ScenarioRunner = {
       loadPlanOf(context.scenario, context.rng),
       send,
       context.clock,
+      context.signal,
     );
     const measurement = aggregateSamples(result.samples, {
       measuredWindowMs: measuredWindowMs(context.scenario),

From ef2a57aeb3f46a87daf84575ec41bca132bb7f3f Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Tue, 16 Jun 2026 19:29:37 +0900
Subject: [PATCH 18/19] Harden benchmark comparison edge cases

Reject ambiguous unitless max-regression values greater than one, so
users who mean 15% do not accidentally configure a 1500% tolerance.

Make fanout and remote-failure queue polling use the scenario clock and
abort signal.  Interrupted comparison runs can now stop pending drain
polls instead of waiting until the queue drain timeout expires.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3419702102
https://github.com/fedify-dev/fedify/pull/804#discussion_r3419813404
https://github.com/fedify-dev/fedify/pull/804#discussion_r3419821231

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts        |  6 ++
 packages/cli/src/bench/compare.ts             |  6 ++
 .../cli/src/bench/scenarios/failure.test.ts   | 59 +++++++++++++++++++
 packages/cli/src/bench/scenarios/failure.ts   | 29 +++++++--
 .../cli/src/bench/scenarios/fanout.test.ts    | 58 ++++++++++++++++++
 packages/cli/src/bench/scenarios/fanout.ts    | 29 +++++++--
 6 files changed, 179 insertions(+), 8 deletions(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index f48e2ccd1..e730dc314 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -153,6 +153,7 @@ function fakeChildProcess(pid = 1234): FakeChildProcess {
 test("parseRegressionTolerance - parses percentages", () => {
   assert.strictEqual(parseRegressionTolerance("15%"), 0.15);
   assert.strictEqual(parseRegressionTolerance("0.2"), 0.2);
+  assert.strictEqual(parseRegressionTolerance("1"), 1);
 });
 
 test("parseRegressionTolerance - rejects malformed values", () => {
@@ -161,6 +162,11 @@ test("parseRegressionTolerance - rejects malformed values", () => {
   assert.throws(() => parseRegressionTolerance(""), RangeError);
 });
 
+test("parseRegressionTolerance - rejects ambiguous whole-number ratios", () => {
+  assert.throws(() => parseRegressionTolerance("15"), RangeError);
+  assert.throws(() => parseRegressionTolerance("1.01"), RangeError);
+});
+
 test("buildCompareReport - applies the measured noise band", () => {
   const base = report([scenario()]);
   const head = report([
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index 3c442d3c3..f47069431 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -352,6 +352,12 @@ export function parseRegressionTolerance(value: string): number {
       `Invalid --max-regression value: ${JSON.stringify(value)}.`,
     );
   }
+  if (match?.[2] == null && numeric > 1) {
+    throw new RangeError(
+      `Invalid --max-regression value: ${JSON.stringify(value)}; ` +
+        "use a ratio between 0 and 1 or an explicit percentage.",
+    );
+  }
   return match?.[2] === "%" ? numeric / 100 : numeric;
 }
 
diff --git a/packages/cli/src/bench/scenarios/failure.test.ts b/packages/cli/src/bench/scenarios/failure.test.ts
index a41984c5f..b27bf8851 100644
--- a/packages/cli/src/bench/scenarios/failure.test.ts
+++ b/packages/cli/src/bench/scenarios/failure.test.ts
@@ -351,6 +351,65 @@ test("failureRunner - tolerates transient remote fault stats failures", async ()
   assert.ok(statsCalls >= 3);
 });
 
+test("failureRunner - uses abortable remote fault poll sleeps", async () => {
+  const target = new URL("http://target.test/");
+  const signal = new AbortController().signal;
+  const sleepSignals: (AbortSignal | undefined)[] = [];
+  let now = 0;
+  const clock: Clock = {
+    now: () => now,
+    sleepUntil: (timeMs, signal) => {
+      now = Math.max(now, timeMs);
+      sleepSignals.push(signal);
+      return Promise.resolve();
+    },
+  };
+  const scenario = normalizeSuite({
+    version: 1,
+    target: target.href,
+    scenarios: [{
+      name: "failure",
+      type: "failure",
+      fault: "remote-404",
+      sender: "alice",
+      load: { rate: "1000/s" },
+      duration: "1ms",
+      queueDrainTimeout: "1ms",
+    }],
+  }).scenarios[0];
+  let triggerCalls = 0;
+
+  await failureRunner.run({
+    scenario,
+    target,
+    documentLoader: await getDocumentLoader({ allowPrivateAddress: true }),
+    contextLoader: await getContextLoader({ allowPrivateAddress: true }),
+    allowPrivateAddress: true,
+    fleet: null,
+    fetch: (input) => {
+      const url = new URL(input instanceof Request ? input.url : input);
+      if (url.pathname === "/.well-known/fedify/bench/stats") {
+        return Promise.resolve(statsJson(statsSnapshot({
+          enqueued: triggerCalls,
+          completed: triggerCalls,
+          failed: 0,
+          permanentFailures: 0,
+        })));
+      }
+      if (url.pathname === "/.well-known/fedify/bench/trigger") {
+        triggerCalls++;
+        return Promise.resolve(statsJson({ version: 1 }, 202));
+      }
+      return Promise.resolve(new Response("unexpected", { status: 500 }));
+    },
+    assertDestinationAllowed: () => {},
+    clock,
+    signal,
+  });
+
+  assert.deepStrictEqual(sleepSignals, [signal, signal]);
+});
+
 test("failureRunner.validate - requires sender for remote faults", () => {
   const scenario = normalizeSuite({
     version: 1,
diff --git a/packages/cli/src/bench/scenarios/failure.ts b/packages/cli/src/bench/scenarios/failure.ts
index 88e31ad38..9b02359f5 100644
--- a/packages/cli/src/bench/scenarios/failure.ts
+++ b/packages/cli/src/bench/scenarios/failure.ts
@@ -7,6 +7,7 @@
 import { Create, Note } from "@fedify/vocab";
 import { discoverInbox, selectInbox } from "../discovery/discover.ts";
 import { runLoad, type SendOutcome } from "../load/generator.ts";
+import { type Clock, systemClock } from "../load/clock.ts";
 import { aggregateSamples } from "../metrics/aggregate.ts";
 import {
   diffSnapshots,
@@ -108,6 +109,7 @@ export const failureRunner: ScenarioRunner = {
 
   async run(context: RunContext) {
     this.validate?.(context.scenario);
+    const clock = context.clock ?? systemClock();
     const faults = faultsOf(context);
     const deliveryTarget = await resolveFailureDeliveryTarget(context, faults);
     const remoteTargets = await resolveRemoteFailureTargets(context, faults);
@@ -141,7 +143,7 @@ export const failureRunner: ScenarioRunner = {
       const result = await runLoad(
         loadPlanOf(context.scenario, context.rng),
         send,
-        context.clock,
+        clock,
         context.signal,
       );
       return aggregateSamples(result.samples, {
@@ -338,6 +340,8 @@ async function sendRemoteFailure(
     fetch: fetchImpl,
     baseline,
     fault,
+    clock: context.clock ?? systemClock(),
+    signal: context.signal,
     timeoutMs: context.scenario.queueDrainTimeoutMs ??
       DEFAULT_DRAIN_TIMEOUT_MS,
   });
@@ -385,13 +389,17 @@ async function waitForRemoteFault(options: {
   readonly fetch: typeof fetch;
   readonly baseline: Awaited<ReturnType<typeof fetchServerSnapshot>>;
   readonly fault: RemoteFailureFault;
+  readonly clock: Clock;
+  readonly signal?: AbortSignal;
   readonly timeoutMs: number;
 }): Promise<RemoteFaultObservation | null> {
   if (options.baseline == null) return null;
   const baselineRemaining = queueTaskRemaining(options.baseline) ?? 0;
-  const deadline = Date.now() + options.timeoutMs;
+  const deadline = options.clock.now() + options.timeoutMs;
   do {
+    throwIfAborted(options.signal);
     const snapshot = await fetchServerSnapshot(options.target, options.fetch);
+    throwIfAborted(options.signal);
     if (snapshot != null) {
       const diff = diffSnapshots(options.baseline, snapshot);
       const queueTasks = diff.queueTasks;
@@ -417,11 +425,24 @@ async function waitForRemoteFault(options: {
         }
       }
     }
-    await new Promise((resolve) => setTimeout(resolve, DRAIN_POLL_MS));
-  } while (Date.now() < deadline);
+    const now = options.clock.now();
+    if (now >= deadline) break;
+    await options.clock.sleepUntil(
+      Math.min(deadline, now + DRAIN_POLL_MS),
+      options.signal,
+    );
+  } while (options.clock.now() < deadline);
   return { timedOut: true };
 }
 
+function throwIfAborted(signal: AbortSignal | undefined): void {
+  if (signal?.aborted) throw abortReason(signal);
+}
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Operation aborted.");
+}
+
 function expectedRemoteFailure(fault: RemoteFailureFault): SendOutcome {
   switch (fault) {
     case "remote-404":
diff --git a/packages/cli/src/bench/scenarios/fanout.test.ts b/packages/cli/src/bench/scenarios/fanout.test.ts
index a81ed7ccb..070dd4717 100644
--- a/packages/cli/src/bench/scenarios/fanout.test.ts
+++ b/packages/cli/src/bench/scenarios/fanout.test.ts
@@ -2,6 +2,7 @@ import assert from "node:assert/strict";
 import test from "node:test";
 import { serve } from "srvx";
 import { getContextLoader, getDocumentLoader } from "../../docloader.ts";
+import type { Clock } from "../load/clock.ts";
 import { normalizeSuite } from "../scenario/normalize.ts";
 import type { Suite } from "../scenario/types.ts";
 import { fanoutRunner, spawnSinkServer } from "./fanout.ts";
@@ -358,6 +359,63 @@ test("fanoutRunner - tolerates transient drain stats failures", async () => {
   assert.ok(statsCalls >= 3);
 });
 
+test("fanoutRunner - uses abortable drain poll sleeps", async () => {
+  const target = new URL("http://target.test/");
+  const signal = new AbortController().signal;
+  const sleepSignals: (AbortSignal | undefined)[] = [];
+  let now = 0;
+  const clock: Clock = {
+    now: () => now,
+    sleepUntil: (timeMs, signal) => {
+      now = Math.max(now, timeMs);
+      sleepSignals.push(signal);
+      return Promise.resolve();
+    },
+  };
+  const scenario = normalizeSuite({
+    version: 1,
+    target: target.href,
+    scenarios: [{
+      name: "fanout",
+      type: "fanout",
+      sender: "alice",
+      followers: 5,
+      load: { rate: "1000/s" },
+      duration: "1ms",
+      queueDrainTimeout: "1ms",
+    }],
+  }).scenarios[0];
+  let triggerCalls = 0;
+
+  await fanoutRunner.run({
+    scenario,
+    target,
+    documentLoader: await getDocumentLoader({ allowPrivateAddress: true }),
+    contextLoader: await getContextLoader({ allowPrivateAddress: true }),
+    allowPrivateAddress: true,
+    fleet: null,
+    fetch: (input) => {
+      const url = new URL(input instanceof Request ? input.url : input);
+      if (url.pathname === "/.well-known/fedify/bench/stats") {
+        return Promise.resolve(json(statsSnapshot({
+          enqueued: triggerCalls * 6,
+          completed: 0,
+          failed: 0,
+        })));
+      }
+      if (url.pathname === "/.well-known/fedify/bench/trigger") {
+        triggerCalls++;
+        return Promise.resolve(json({ version: 1 }, 202));
+      }
+      return Promise.resolve(new Response("unexpected", { status: 500 }));
+    },
+    clock,
+    signal,
+  });
+
+  assert.deepStrictEqual(sleepSignals, [signal, signal]);
+});
+
 test("fanoutRunner - uses configured sink base for recipients", async () => {
   const target = new URL("http://target.test/");
   const sinkBase = `http://127.0.0.1:${await reservePort()}/`;
diff --git a/packages/cli/src/bench/scenarios/fanout.ts b/packages/cli/src/bench/scenarios/fanout.ts
index e0c2e248b..7e5670acb 100644
--- a/packages/cli/src/bench/scenarios/fanout.ts
+++ b/packages/cli/src/bench/scenarios/fanout.ts
@@ -6,6 +6,7 @@
 
 import { serve } from "srvx";
 import { runLoad, type SendOutcome } from "../load/generator.ts";
+import { type Clock, systemClock } from "../load/clock.ts";
 import { aggregateSamples } from "../metrics/aggregate.ts";
 import { LogLinearHistogram } from "../metrics/histogram.ts";
 import {
@@ -46,6 +47,7 @@ export const fanoutRunner: ScenarioRunner = {
       throw new Error("The fanout scenario requires a sender.");
     }
     this.validate?.(context.scenario);
+    const clock = context.clock ?? systemClock();
     const fetchImpl = context.fetch ?? fetch;
     const followers = context.scenario.followers ?? DEFAULT_FOLLOWERS;
     const sink = await spawnSinkServer({
@@ -90,6 +92,8 @@ export const fanoutRunner: ScenarioRunner = {
           target: context.target,
           fetch: fetchImpl,
           baseline,
+          clock,
+          signal: context.signal,
           timeoutMs: context.scenario.queueDrainTimeoutMs ??
             DEFAULT_DRAIN_TIMEOUT_MS,
         });
@@ -132,7 +136,7 @@ export const fanoutRunner: ScenarioRunner = {
       const result = await runLoad(
         loadPlanOf(context.scenario, context.rng),
         send,
-        context.clock,
+        clock,
         context.signal,
       );
       const measurement = aggregateSamples(result.samples, {
@@ -329,13 +333,17 @@ async function waitForDrain(options: {
   readonly target: URL;
   readonly fetch: typeof fetch;
   readonly baseline: Awaited<ReturnType<typeof fetchServerSnapshot>>;
+  readonly clock: Clock;
+  readonly signal?: AbortSignal;
   readonly timeoutMs: number;
 }): Promise<DrainResult | null> {
   if (options.baseline == null) return null;
   const baselineRemaining = queueTaskRemaining(options.baseline) ?? 0;
-  const deadline = Date.now() + options.timeoutMs;
+  const deadline = options.clock.now() + options.timeoutMs;
   do {
+    throwIfAborted(options.signal);
     const snapshot = await fetchServerSnapshot(options.target, options.fetch);
+    throwIfAborted(options.signal);
     if (snapshot != null) {
       const diff = diffSnapshots(options.baseline, snapshot);
       const queueTasks = diff.queueTasks;
@@ -349,11 +357,24 @@ async function waitForDrain(options: {
         return { timedOut: false, failed: queueTasks.failed };
       }
     }
-    await new Promise((resolve) => setTimeout(resolve, DRAIN_POLL_MS));
-  } while (Date.now() < deadline);
+    const now = options.clock.now();
+    if (now >= deadline) break;
+    await options.clock.sleepUntil(
+      Math.min(deadline, now + DRAIN_POLL_MS),
+      options.signal,
+    );
+  } while (options.clock.now() < deadline);
   return { timedOut: true, failed: 0 };
 }
 
+function throwIfAborted(signal: AbortSignal | undefined): void {
+  if (signal?.aborted) throw abortReason(signal);
+}
+
+function abortReason(signal: AbortSignal): unknown {
+  return signal.reason ?? new Error("Operation aborted.");
+}
+
 function addQueueDrain(
   server: ServerMetrics | null,
   histogram: LogLinearHistogram,

From 28c2ce47723964a80024942bc86dcfd13e0a650c Mon Sep 17 00:00:00 2001
From: Hong Minhee <hong@minhee.org>
Date: Tue, 16 Jun 2026 19:54:56 +0900
Subject: [PATCH 19/19] Quiet target exits during benchmark cleanup

Normal target shutdown can make the target exit promise reject after the
comparison has already finished benchmarking one side (base or head).
Keep that expected exit from propagating as a late target failure, while
still aborting active work when the process exits unexpectedly.

https://github.com/fedify-dev/fedify/pull/804#discussion_r3420017088

Assisted-by: Codex:gpt-5.5
---
 packages/cli/src/bench/compare.test.ts | 59 ++++++++++++++++++++++++++
 packages/cli/src/bench/compare.ts      |  3 +-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/packages/cli/src/bench/compare.test.ts b/packages/cli/src/bench/compare.test.ts
index e730dc314..0bf3317ce 100644
--- a/packages/cli/src/bench/compare.test.ts
+++ b/packages/cli/src/bench/compare.test.ts
@@ -980,6 +980,65 @@ test("runBenchCompare - aborts benchmark work when target exits", async () => {
   ]);
 });
 
+test("runBenchCompare - ignores target exit while stopping normally", async () => {
+  const events: string[] = [];
+  let rejectExit!: (error: Error) => void;
+  let code = -1;
+  await runBenchCompare(command({}), {
+    exit: (c) => {
+      code = c;
+    },
+    writeOutput: () => {
+      events.push("write");
+      return Promise.resolve();
+    },
+    log: (message) => events.push(`log:${message}`),
+    createWorktree: (_ref, label) => Promise.resolve(`/tmp/${label}`),
+    removeWorktree: (path) => {
+      events.push(`remove:${path}`);
+      return Promise.resolve();
+    },
+    startTarget: (cwd) => {
+      events.push(`start:${cwd}`);
+      return Promise.resolve({
+        exited: new Promise<never>((_resolve, reject) => {
+          rejectExit = reject;
+        }),
+        stop: () => {
+          events.push(`stop:${cwd}`);
+          rejectExit(new Error("target stopped"));
+          return Promise.resolve();
+        },
+      });
+    },
+    waitReady: () => {
+      events.push("ready");
+      return Promise.resolve();
+    },
+    runBenchInWorktree: () => {
+      events.push("bench");
+      return Promise.resolve(report([scenario()]));
+    },
+  });
+  await Promise.resolve();
+  assert.strictEqual(code, 0);
+  assert.deepEqual(events, [
+    "log:Checking out base benchmark ref origin/main…",
+    "start:/tmp/base",
+    "ready",
+    "bench",
+    "stop:/tmp/base",
+    "log:Checking out head benchmark ref HEAD…",
+    "start:/tmp/head",
+    "ready",
+    "bench",
+    "stop:/tmp/head",
+    "write",
+    "remove:/tmp/head",
+    "remove:/tmp/base",
+  ]);
+});
+
 test("runBenchCompare - fails when target exits before readiness", async () => {
   const events: string[] = [];
   let code = -1;
diff --git a/packages/cli/src/bench/compare.ts b/packages/cli/src/bench/compare.ts
index f47069431..61b353e61 100644
--- a/packages/cli/src/bench/compare.ts
+++ b/packages/cli/src/bench/compare.ts
@@ -282,8 +282,9 @@ export async function runBenchCompare(
       targetExitError = error;
       if (!stoppingTarget && !interruptSignal.aborted) {
         interruptController.abort(error);
+        throw error;
       }
-      throw error;
+      return new Promise<never>(() => {});
     });
     const throwIfTargetExited = () => {
       if (targetExitError != null) throw targetExitError;