From 8ecd6a1e484d6020d61001fb8a0cd2930507b77a Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Thu, 25 Jun 2026 08:22:54 -0400 Subject: [PATCH 1/2] fix(deploy): raise heap + memory limit for full published import A cold boot rebuilding in-memory state from the full `published` import (~31.8k people, ~10.4k tag-assignments, plus secondary indices) OOM'd at the previous 1536Mi V8 old-space ceiling: "FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory". The long-running pod had been serving state from an earlier, lighter boot and never had to rebuild; a rollout restart forced the first cold load of the current import and it no longer fit. The native FTS5 store is off-heap (better-sqlite3), so the V8 heap holds the record maps + indices. Raise NODE_OPTIONS --max-old-space-size 1536 -> 3072 and the container memory limit 2Gi -> 3.5Gi (nodes are 3.9Gi), with requests 768Mi -> 1Gi. The ~60x on-disk-to-heap expansion is suspiciously large and is tracked separately for a memory-optimization investigation. Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy/kustomize/base/configmap.yaml | 9 ++++++++- deploy/kustomize/base/deployment.yaml | 11 ++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml index ca57e4b..fc43b77 100644 --- a/deploy/kustomize/base/configmap.yaml +++ b/deploy/kustomize/base/configmap.yaml @@ -21,7 +21,14 @@ data: # containers) so the in-memory record set + secondary indices + FTS fit. # Keep ~25% headroom below the container's memory limit for ephemeral # work (request handling, gitsheets tree mutations, etc.). - NODE_OPTIONS: "--max-old-space-size=1536" + # + # 3072 (was 1536): a fresh boot rebuilding in-memory state from the full + # `published` import (~31.8k people + ~10.4k tag-assignments + secondary + # indices) OOM'd at the 1536 ceiling. The native FTS5 store is off-heap; + # the V8 heap holds the record maps + indices, which sit well above 1.5Gi + # on a cold build. See the memory-optimization issue for the (suspiciously + # large ~60x) on-disk-to-heap expansion worth investigating separately. + NODE_OPTIONS: "--max-old-space-size=3072" PORT: "3001" STORAGE_BACKEND: "filesystem" # The runtime-served branch. `published` is the long-term sandbox + prod diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml index 2486e3a..0c9672a 100644 --- a/deploy/kustomize/base/deployment.yaml +++ b/deploy/kustomize/base/deployment.yaml @@ -64,12 +64,17 @@ spec: resources: requests: cpu: 100m - memory: 768Mi + memory: 1Gi limits: cpu: 1000m # Holds the full public dataset + secondary indices + FTS in - # memory (~31k people, 268 projects, 10k tag-assignments, …) - memory: 2Gi + # memory (~31.8k people, 268 projects, 10.4k tag-assignments, …). + # 3.5Gi (was 2Gi): a cold boot rebuilding state from the full + # `published` import exceeded the prior 1536Mi heap; raised to + # NODE_OPTIONS=--max-old-space-size=3072 (see configmap.yaml), + # so the container limit needs headroom above that for the + # off-heap FTS5 store + ephemeral request work. Nodes are 3.9Gi. + memory: 3584Mi securityContext: runAsNonRoot: true runAsUser: 1000 From 03e0acd1ffb293ae5452af055fe1871ce9520767 Mon Sep 17 00:00:00 2001 From: Chris Alfano Date: Thu, 25 Jun 2026 17:35:14 -0400 Subject: [PATCH 2/2] fix(deploy): use node-safe heap/limit (2048 / 2.5Gi), not 3072 / 3.5Gi Correcting this branch's first attempt. The initial 3072 heap / 3.5Gi limit restored the boot but was too large for the ~3.9Gi nodes: as the pod grew it starved the node's kubelet and drove it NodeNotReady, which cascaded into an RWO volume multi-attach deadlock and a longer outage. The proven-safe values (live for 8h, 0 restarts): heap 2048 / container limit 2560Mi / request 1Gi. 2048 boots the full `published` import cleanly; capping the container at 2.5Gi leaves ~1.4Gi node headroom so a single pod can't take a node down again. Reducing the footprint further is tracked in the memory-optimization issue. Co-Authored-By: Claude Opus 4.8 (1M context) --- deploy/kustomize/base/configmap.yaml | 12 +++++++----- deploy/kustomize/base/deployment.yaml | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/deploy/kustomize/base/configmap.yaml b/deploy/kustomize/base/configmap.yaml index fc43b77..925de97 100644 --- a/deploy/kustomize/base/configmap.yaml +++ b/deploy/kustomize/base/configmap.yaml @@ -22,13 +22,15 @@ data: # Keep ~25% headroom below the container's memory limit for ephemeral # work (request handling, gitsheets tree mutations, etc.). # - # 3072 (was 1536): a fresh boot rebuilding in-memory state from the full + # 2048 (was 1536): a fresh boot rebuilding in-memory state from the full # `published` import (~31.8k people + ~10.4k tag-assignments + secondary # indices) OOM'd at the 1536 ceiling. The native FTS5 store is off-heap; - # the V8 heap holds the record maps + indices, which sit well above 1.5Gi - # on a cold build. See the memory-optimization issue for the (suspiciously - # large ~60x) on-disk-to-heap expansion worth investigating separately. - NODE_OPTIONS: "--max-old-space-size=3072" + # the V8 heap holds the record maps + indices. 2048 boots cleanly and runs + # stable; it is deliberately kept modest because these nodes are only + # ~3.9Gi — an earlier 3072/3.5Gi trial let the pod grow until it starved + # the node's kubelet (NodeNotReady). See the memory-optimization issue for + # the (suspiciously large ~60x) on-disk-to-heap expansion worth reducing. + NODE_OPTIONS: "--max-old-space-size=2048" PORT: "3001" STORAGE_BACKEND: "filesystem" # The runtime-served branch. `published` is the long-term sandbox + prod diff --git a/deploy/kustomize/base/deployment.yaml b/deploy/kustomize/base/deployment.yaml index 0c9672a..bddf30b 100644 --- a/deploy/kustomize/base/deployment.yaml +++ b/deploy/kustomize/base/deployment.yaml @@ -69,12 +69,14 @@ spec: cpu: 1000m # Holds the full public dataset + secondary indices + FTS in # memory (~31.8k people, 268 projects, 10.4k tag-assignments, …). - # 3.5Gi (was 2Gi): a cold boot rebuilding state from the full + # 2.5Gi (was 2Gi): a cold boot rebuilding state from the full # `published` import exceeded the prior 1536Mi heap; raised to - # NODE_OPTIONS=--max-old-space-size=3072 (see configmap.yaml), - # so the container limit needs headroom above that for the - # off-heap FTS5 store + ephemeral request work. Nodes are 3.9Gi. - memory: 3584Mi + # NODE_OPTIONS=--max-old-space-size=2048 (see configmap.yaml), + # with the container limit just above that for the off-heap FTS5 + # store + ephemeral request work. Kept at 2.5Gi (not higher) so + # a single pod can't starve a node — these nodes are only ~3.9Gi + # and a 3.5Gi trial drove one NodeNotReady. ~1.4Gi node headroom. + memory: 2560Mi securityContext: runAsNonRoot: true runAsUser: 1000