From 8ecd6a1e484d6020d61001fb8a0cd2930507b77a Mon Sep 17 00:00:00 2001
From: Chris Alfano <chris@jarv.us>
Date: Thu, 25 Jun 2026 08:22:54 -0400
Subject: [PATCH 1/2] fix(deploy): raise heap + memory limit for full published
 import

A cold boot rebuilding in-memory state from the full `published` import
(~31.8k people, ~10.4k tag-assignments, plus secondary indices) OOM'd at
the previous 1536Mi V8 old-space ceiling: "FATAL ERROR: Reached heap limit
Allocation failed - JavaScript heap out of memory". The long-running pod had
been serving state from an earlier, lighter boot and never had to rebuild;
a rollout restart forced the first cold load of the current import and it no
longer fit.

The native FTS5 store is off-heap (better-sqlite3), so the V8 heap holds the
record maps + indices. Raise NODE_OPTIONS --max-old-space-size 1536 -> 3072
and the container memory limit 2Gi -> 3.5Gi (nodes are 3.9Gi), with requests
768Mi -> 1Gi. The ~60x on-disk-to-heap expansion is suspiciously large and
is tracked separately for a memory-optimization investigation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 deploy/kustomize/base/configmap.yaml  |  9 ++++++++-
 deploy/kustomize/base/deployment.yaml | 11 ++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml
index ca57e4b..fc43b77 100644
--- a/deploy/kustomize/base/configmap.yaml
+++ b/deploy/kustomize/base/configmap.yaml
@@ -21,7 +21,14 @@ data:
   # containers) so the in-memory record set + secondary indices + FTS fit.
   # Keep ~25% headroom below the container's memory limit for ephemeral
   # work (request handling, gitsheets tree mutations, etc.).
-  NODE_OPTIONS: "--max-old-space-size=1536"
+  #
+  # 3072 (was 1536): a fresh boot rebuilding in-memory state from the full
+  # `published` import (~31.8k people + ~10.4k tag-assignments + secondary
+  # indices) OOM'd at the 1536 ceiling. The native FTS5 store is off-heap;
+  # the V8 heap holds the record maps + indices, which sit well above 1.5Gi
+  # on a cold build. See the memory-optimization issue for the (suspiciously
+  # large ~60x) on-disk-to-heap expansion worth investigating separately.
+  NODE_OPTIONS: "--max-old-space-size=3072"
   PORT: "3001"
   STORAGE_BACKEND: "filesystem"
   # The runtime-served branch. `published` is the long-term sandbox + prod
diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml
index 2486e3a..0c9672a 100644
--- a/deploy/kustomize/base/deployment.yaml
+++ b/deploy/kustomize/base/deployment.yaml
@@ -64,12 +64,17 @@ spec:
           resources:
             requests:
               cpu: 100m
-              memory: 768Mi
+              memory: 1Gi
             limits:
               cpu: 1000m
               # Holds the full public dataset + secondary indices + FTS in
-              # memory (~31k people, 268 projects, 10k tag-assignments, …)
-              memory: 2Gi
+              # memory (~31.8k people, 268 projects, 10.4k tag-assignments, …).
+              # 3.5Gi (was 2Gi): a cold boot rebuilding state from the full
+              # `published` import exceeded the prior 1536Mi heap; raised to
+              # NODE_OPTIONS=--max-old-space-size=3072 (see configmap.yaml),
+              # so the container limit needs headroom above that for the
+              # off-heap FTS5 store + ephemeral request work. Nodes are 3.9Gi.
+              memory: 3584Mi
           securityContext:
             runAsNonRoot: true
             runAsUser: 1000

From 03e0acd1ffb293ae5452af055fe1871ce9520767 Mon Sep 17 00:00:00 2001
From: Chris Alfano <chris@jarv.us>
Date: Thu, 25 Jun 2026 17:35:14 -0400
Subject: [PATCH 2/2] fix(deploy): use node-safe heap/limit (2048 / 2.5Gi), not
 3072 / 3.5Gi

Correcting this branch's first attempt. The initial 3072 heap / 3.5Gi limit
restored the boot but was too large for the ~3.9Gi nodes: as the pod grew, it
starved the node's kubelet and drove the node NodeNotReady, which cascaded into
an RWO volume multi-attach deadlock and a longer outage.

The proven-safe values (live for 8h with 0 restarts) are heap 2048 / container
limit 2560Mi / request 1Gi. 2048 boots the full `published` import cleanly;
capping the container at 2.5Gi leaves ~1.4Gi node headroom so a single pod
can't take a node down again. Reducing the footprint further is tracked in the
memory-optimization issue.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 deploy/kustomize/base/configmap.yaml  | 12 +++++++-----
 deploy/kustomize/base/deployment.yaml | 12 +++++++-----
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml
index fc43b77..925de97 100644
--- a/deploy/kustomize/base/configmap.yaml
+++ b/deploy/kustomize/base/configmap.yaml
@@ -22,13 +22,15 @@ data:
   # Keep ~25% headroom below the container's memory limit for ephemeral
   # work (request handling, gitsheets tree mutations, etc.).
   #
-  # 3072 (was 1536): a fresh boot rebuilding in-memory state from the full
+  # 2048 (was 1536): a fresh boot rebuilding in-memory state from the full
   # `published` import (~31.8k people + ~10.4k tag-assignments + secondary
   # indices) OOM'd at the 1536 ceiling. The native FTS5 store is off-heap;
-  # the V8 heap holds the record maps + indices, which sit well above 1.5Gi
-  # on a cold build. See the memory-optimization issue for the (suspiciously
-  # large ~60x) on-disk-to-heap expansion worth investigating separately.
-  NODE_OPTIONS: "--max-old-space-size=3072"
+  # the V8 heap holds the record maps + indices. 2048 boots cleanly and runs
+  # stable; it is deliberately kept modest because these nodes are only
+  # ~3.9Gi — an earlier 3072/3.5Gi trial let the pod grow until it starved
+  # the node's kubelet (NodeNotReady). See the memory-optimization issue for
+  # the (suspiciously large ~60x) on-disk-to-heap expansion worth reducing.
+  NODE_OPTIONS: "--max-old-space-size=2048"
   PORT: "3001"
   STORAGE_BACKEND: "filesystem"
   # The runtime-served branch. `published` is the long-term sandbox + prod
diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml
index 0c9672a..bddf30b 100644
--- a/deploy/kustomize/base/deployment.yaml
+++ b/deploy/kustomize/base/deployment.yaml
@@ -69,12 +69,14 @@ spec:
               cpu: 1000m
               # Holds the full public dataset + secondary indices + FTS in
               # memory (~31.8k people, 268 projects, 10.4k tag-assignments, …).
-              # 3.5Gi (was 2Gi): a cold boot rebuilding state from the full
+              # 2.5Gi (was 2Gi): a cold boot rebuilding state from the full
               # `published` import exceeded the prior 1536Mi heap; raised to
-              # NODE_OPTIONS=--max-old-space-size=3072 (see configmap.yaml),
-              # so the container limit needs headroom above that for the
-              # off-heap FTS5 store + ephemeral request work. Nodes are 3.9Gi.
-              memory: 3584Mi
+              # NODE_OPTIONS=--max-old-space-size=2048 (see configmap.yaml),
+              # with the container limit just above that for the off-heap FTS5
+              # store + ephemeral request work. Kept at 2.5Gi (not higher) so
+              # a single pod can't starve a node — these nodes are only ~3.9Gi
+              # and a 3.5Gi trial drove one NodeNotReady. ~1.4Gi node headroom.
+              memory: 2560Mi
           securityContext:
             runAsNonRoot: true
             runAsUser: 1000