From d2ecab0bc2b9836dc971bb08379b1ae05a6bde74 Mon Sep 17 00:00:00 2001
From: Andrew Stilliard
Date: Wed, 17 Jun 2026 19:33:23 +0100
Subject: [PATCH 1/3] feat(cluster): add multi-worker cluster support for outbound write throughput
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GitHub API writes (checks.create) are serialized by Octokit's throttling
plugin at maxConcurrent:1, minTime:1000 (≈1 write/sec) keyed per
installation, queued in-memory per process. Under load a busy
installation backs up behind this gate while GET lookups stay fast,
causing multi-minute delays on the final check creation.

Add a cluster.js wrapper as the process entry point: the primary forks
WEB_CONCURRENCY workers (default 4), each running the existing Probot
app unchanged via probot.run(). Each worker has its own in-memory write
queue, so aggregate write throughput scales with worker count. The
queue is timer-gated (idle-waiting), so this helps even on single-core
droplets.

- start now runs the cluster; start:single keeps the prior
  single-process path
- dev/dev-debug pinned to start:single so local smee dev doesn't fan
  out events across workers
- crashed workers respawn; SIGTERM/SIGINT shuts workers down without
  respawn

Co-Authored-By: Claude Opus 4.8
---
Reviewer notes (commentary only; not part of the commit message):
  * The line structure of this series was restored from a collapsed copy.
    Leading whitespace inside the hunks is assumed to be two-space
    indentation -- TODO confirm against the working tree before applying.
  * In this revision the 'exit' handler forks a replacement immediately
    whenever shuttingDown is false; there is no delay or crash limit yet.
  * NOTE(review): cluster.isPrimary is presumably only available on
    Node.js 16 or newer -- confirm the deployment's Node version.

 cluster.js   | 22 ++++++++++++++++++++++
 package.json |  7 ++++---
 2 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 cluster.js

diff --git a/cluster.js b/cluster.js
new file mode 100644
index 0000000..8a15e77
--- /dev/null
+++ b/cluster.js
@@ -0,0 +1,22 @@
+const cluster = require('cluster');
+
+if (cluster.isPrimary) {
+  const workers = parseInt(process.env.WEB_CONCURRENCY, 10) || 4;
+  console.log(`Primary ${process.pid} starting ${workers} workers`);
+  for (let i = 0; i < workers; i++) cluster.fork();
+
+  // resurrect a worker if it dies, unless we're shutting down
+  let shuttingDown = false;
+  cluster.on('exit', (worker, code, signal) => {
+    console.log(`Worker ${worker.process.pid} died (${signal || code})`);
+    if (!shuttingDown) cluster.fork();
+  });
+  for (const sig of ['SIGTERM', 'SIGINT']) {
+    process.on(sig, () => {
+      shuttingDown = true;
+      for (const w of Object.values(cluster.workers)) w.kill(sig);
+    });
+  }
+} else {
+  require('probot').run(require('./index'));
+}
diff --git a/package.json b/package.json
index 575dcd9..ae9cf67 100644
--- a/package.json
+++ b/package.json
@@ -9,9 +9,10 @@
   "license": "ISC",
   "repository": "https://github.com/stilliard/github-task-list-completed.git",
   "scripts": {
-    "dev-debug": "nodemon --exec \"LOG_LEVEL=debug npm start\"",
-    "dev": "nodemon --exec \"npm start\"",
-    "start": "probot run ./index.js",
+    "dev-debug": "nodemon --exec \"LOG_LEVEL=debug npm run start:single\"",
+    "dev": "nodemon --exec \"npm run start:single\"",
+    "start": "node cluster.js",
+    "start:single": "probot run ./index.js",
     "test": "jest"
   },
   "dependencies": {
From 9869524ce062413efc4786ab08be32f625630884 Mon Sep 17 00:00:00 2001
From: Andrew Stilliard
Date: Wed, 17 Jun 2026 19:43:59 +0100
Subject: [PATCH 2/3] fix(cluster): prevent fast-crash respawn loop and restore default app routes

Address review feedback on the cluster entrypoint:

- P1: a worker that died at startup (bad config, bind failure, throw in
  run()) was respawned instantly in a tight loop. Track per-worker
  uptime; deaths under 5s count as fast-crashes, and after
  WEB_CONCURRENCY of them the primary exits(1) so the service manager
  can apply its restart/backoff policy. Healthy runs reset the streak
  and still respawn normally.
- P3: the function form of run() skipped Probot's default app, dropping
  the `/` and `/probot` setup routes (the latter is documented in the
  README). Drive the same argv path as `probot run ./index.js` so the
  default app loads first, keeping behaviour identical to the prior
  entrypoint.

Co-Authored-By: Claude Opus 4.8
---
Reviewer notes (commentary only; not part of the commit message):
  * fastCrashes is reset to 0 only inside the 'exit' handler, when a
    worker that ran for 5000 ms or longer dies. Fast crashes separated
    by long periods in which no worker exits therefore still add up
    toward the exit threshold.
  * The threshold compared against fastCrashes is the resolved `workers`
    count (4 when WEB_CONCURRENCY is unset or unparsable), not the raw
    environment variable.
  * worker.startedAt is an ad-hoc property assigned in the new fork()
    helper; the 'exit' handler reads it back to compute the uptime.
  * Once shuttingDown is set the 'exit' handler returns before logging,
    so worker exits during shutdown are no longer reported.

 cluster.js | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/cluster.js b/cluster.js
index 8a15e77..09ef1ae 100644
--- a/cluster.js
+++ b/cluster.js
@@ -3,14 +3,28 @@ const cluster = require('cluster');
 if (cluster.isPrimary) {
   const workers = parseInt(process.env.WEB_CONCURRENCY, 10) || 4;
   console.log(`Primary ${process.pid} starting ${workers} workers`);
-  for (let i = 0; i < workers; i++) cluster.fork();
+  for (let i = 0; i < workers; i++) fork();
 
-  // resurrect a worker if it dies, unless we're shutting down
+  // resurrect a worker if it dies, unless we're shutting down, also track fast crashes and exit if too many happen in a row
   let shuttingDown = false;
+  let fastCrashes = 0;
   cluster.on('exit', (worker, code, signal) => {
-    console.log(`Worker ${worker.process.pid} died (${signal || code})`);
-    if (!shuttingDown) cluster.fork();
+    if (shuttingDown) return;
+    const ranForMs = Date.now() - worker.startedAt;
+    if (ranForMs < 5000) {
+      fastCrashes++;
+      console.error(`Worker ${worker.process.pid} died after ${ranForMs}ms (${signal || code}) [fast-crash ${fastCrashes}/${workers}]`);
+      if (fastCrashes >= workers) {
+        console.error('Too many fast worker crashes; exiting for the service manager to restart.');
+        process.exit(1);
+      }
+    } else {
+      fastCrashes = 0; // a healthy run clears the streak
+      console.log(`Worker ${worker.process.pid} died after ${Math.round(ranForMs / 1000)}s (${signal || code}); respawning`);
+    }
+    fork();
   });
+
   for (const sig of ['SIGTERM', 'SIGINT']) {
     process.on(sig, () => {
       shuttingDown = true;
@@ -18,5 +32,11 @@ if (cluster.isPrimary) {
     });
   }
 } else {
-  require('probot').run(require('./index'));
+  require('probot').run([process.argv[0], process.argv[1], './index.js']);
+}
+
+function fork() {
+  const worker = cluster.fork();
+  worker.startedAt = Date.now();
+  return worker;
 }
From b96417c90e014907648036eec03ed37b28e6d0b4 Mon Sep 17 00:00:00 2001
From: Andrew Stilliard
Date: Wed, 17 Jun 2026 20:13:46 +0100
Subject: [PATCH 3/3] fix(cluster): fall back to default workers for non-positive WEB_CONCURRENCY

A negative WEB_CONCURRENCY forked zero workers (service up, processing
nothing). Only accept positive integers, otherwise use the default.

Co-Authored-By: Claude Opus 4.8
---
Reviewer notes (commentary only; not part of the commit message):
  * parseInt(value, 10) accepts a leading numeric prefix, so inputs such
    as "8x" or "2.5" resolve to 8 and 2 workers rather than falling back
    to the default of 4.
  * An unset, empty or non-numeric WEB_CONCURRENCY parses to NaN, which
    fails Number.isInteger and selects the default of 4.

 cluster.js | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cluster.js b/cluster.js
index 09ef1ae..ece25d9 100644
--- a/cluster.js
+++ b/cluster.js
@@ -1,7 +1,8 @@
 const cluster = require('cluster');
 
 if (cluster.isPrimary) {
-  const workers = parseInt(process.env.WEB_CONCURRENCY, 10) || 4;
+  const requested = parseInt(process.env.WEB_CONCURRENCY, 10);
+  const workers = Number.isInteger(requested) && requested > 0 ? requested : 4;
   console.log(`Primary ${process.pid} starting ${workers} workers`);
   for (let i = 0; i < workers; i++) fork();
 
@@ -32,6 +33,7 @@ if (cluster.isPrimary) {
     });
   }
 } else {
+  // equivalent to `probot run ./index.js` (app path passed as a positional arg)
   require('probot').run([process.argv[0], process.argv[1], './index.js']);
 }
 