optave · carlos-alm · May 6, 2026 · May 5, 2026 · May 6, 2026 · May 6, 2026
diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts
@@ -43,7 +43,14 @@ import {
 import { writeJournalHeader } from '../journal.js';
 import { setWorkspaces } from '../resolve.js';
 import { PipelineContext } from './context.js';
-import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
+import {
+  batchInsertNodes,
+  collectFiles as collectFilesUtil,
+  fileHash,
+  fileStat,
+  loadPathAliases,
+  readFileSafe,
+} from './helpers.js';
 import { NativeDbProxy } from './native-db-proxy.js';
 import { buildEdges } from './stages/build-edges.js';
 import { buildStructure } from './stages/build-structure.js';
@@ -731,12 +738,15 @@ async function tryNativeOrchestrator(
   // stale native binaries). WASM handles those — backfill via WASM so both
   // engines process the same file set (#967).
   //
-  // Only runs on full builds: incremental builds only touch changed files,
-  // which are parsed through parseFilesAuto (which has its own per-file
-  // backfill), so a full filesystem scan here would be wasted work.
-  if (result.isFullBuild) {
-    await backfillNativeDroppedFiles(ctx);
-  }
+  // Runs on every successful orchestrator pass (not just full builds): on
+  // incrementals the orchestrator's change detection treats files outside
+  // Rust's narrower file_collector as `removed` and deletes their nodes +
+  // file_hashes rows. Without re-running the backfill we'd lose the symbols
+  // for those files and permanently break the JS-side fast-skip pre-flight
+  // (#1054, #1068). The function is cheap (single fs scan + DB query) when
+  // nothing is missing, and on no-op rebuilds the missing-set is re-derived
+  // from `nodes`, so it catches whatever Rust just deleted.
+  await backfillNativeDroppedFiles(ctx);
 
   closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
   return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
@@ -747,22 +757,40 @@ async function tryNativeOrchestrator(
  * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
  */
 async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
-  // Needs a real better-sqlite3 connection for INSERT.
-  if (ctx.nativeFirstProxy) {
-    closeNativeDb(ctx, 'pre-parity-backfill');
-    ctx.db = openDb(ctx.dbPath);
-    ctx.nativeFirstProxy = false;
-  }
-
+  // Compute the missing-file set FIRST, before any expensive DB handoff.
+  // NativeDbProxy supports .prepare().all(), so the upfront query works
+  // whether ctx.db is a proxy or a real better-sqlite3 connection. On
+  // incremental no-op rebuilds nothing is missing, so we want to early-return
+  // without paying the close-native / reopen-better-sqlite3 cost.
   const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
   const expected = new Set(
     collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
   );
 
-  const existingRows = ctx.db
+  const existingNodeRows = ctx.db
     .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
     .all() as Array<{ file: string }>;
-  const existing = new Set(existingRows.map((r) => r.file));
+  const existingNodes = new Set(existingNodeRows.map((r) => r.file));
+
+  // Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight
+  // (#1054) rejects on `file_hashes` gaps, and the two tables can diverge
+  // (e.g. a DB written by old code where `nodes` was populated but
+  // `file_hashes` was not). Treating "in nodes but not in file_hashes" as
+  // missing closes the gap so the backfill repairs the file_hashes row even
+  // when the node row already exists.
+  let existingHashes = new Set<string>();
+  try {
+    const existingHashRows = ctx.db
+      .prepare('SELECT DISTINCT file FROM file_hashes')
+      .all() as Array<{ file: string }>;
+    existingHashes = new Set(existingHashRows.map((r) => r.file));
+  } catch (e) {
+    // file_hashes table may not exist on legacy DBs; treat as fully missing
+    // so the backfill writes rows on the upsert path below.
+    debug(
+      `backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`,
+    );
+  }
 
   // Restrict backfill to files with an installed WASM grammar. Extensions in
   // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
@@ -772,14 +800,24 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
   const missingRel: string[] = [];
   const missingAbs: string[] = [];
   for (const rel of expected) {
-    if (existing.has(rel)) continue;
+    // A file is "missing" if it's absent from EITHER nodes OR file_hashes.
+    // Both must be present for fast-skip to work correctly.
+    if (existingNodes.has(rel) && existingHashes.has(rel)) continue;
     const ext = path.extname(rel).toLowerCase();
     if (!installedExts.has(ext)) continue;
     missingRel.push(rel);
     missingAbs.push(path.join(ctx.rootDir, rel));
   }
   if (missingAbs.length === 0) return;
 
+  // Now that we know there's work to do, hand off to better-sqlite3 (needed
+  // for the INSERT path below).
+  if (ctx.nativeFirstProxy) {
+    closeNativeDb(ctx, 'pre-parity-backfill');
+    ctx.db = openDb(ctx.dbPath);
+    ctx.nativeFirstProxy = false;
+  }
+
   // Classify drops so users see per-extension reasons instead of just a count
   // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
   // extractor); `native-extractor-failure` indicates a real native bug since
@@ -856,6 +894,47 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
     }
   }
 
+  // Persist file_hashes rows for every backfilled file. The Rust orchestrator
+  // only hashes files it parsed itself, so without this step files in
+  // optional-language extensions (e.g. .clj when no Rust extractor exists)
+  // would be missing from `file_hashes` — permanently breaking the JS-side
+  // fast-skip pre-flight (#1054), which rejects on `collected file missing
+  // from file_hashes` and forces every no-op rebuild back through the full
+  // ~2s native pipeline (#1068).
+  //
+  // Iterates `missingRel` (every collected file the Rust orchestrator
+  // dropped), not `wasmResults`, so files that produced zero symbols still
+  // get a row.
+  try {
+    const upsertHash = db.prepare(
+      'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)',
+    );
+    const writeHashes = db.transaction(() => {
+      for (let i = 0; i < missingRel.length; i++) {
+        const relPath = missingRel[i];
+        const absPath = missingAbs[i];
+        if (!relPath || !absPath) continue;
+        let code: string | null;
+        try {
+          code = readFileSafe(absPath);
+        } catch (e) {
+          debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`);
+          continue;
+        }
+        if (code === null) continue;
+        const stat = fileStat(absPath);
+        const mtime = stat ? Math.floor(stat.mtimeMs) : 0;
+        const size = stat ? stat.size : 0;
+        upsertHash.run(relPath, fileHash(code), mtime, size);
+      }
+    });
+    writeHashes();
+  } catch (e) {
+    debug(
+      `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`,
+    );
+  }
+
   // Free WASM parse trees from the inline backfill path (#1058).
   // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree
   // backed by WASM linear memory) on every result, but these symbols are

diff --git a/src/domain/graph/builder/stages/insert-nodes.ts b/src/domain/graph/builder/stages/insert-nodes.ts
@@ -12,10 +12,12 @@ import path from 'node:path';
 import { performance } from 'node:perf_hooks';
 import { bulkNodeIdsByFile } from '../../../../db/index.js';
 import { debug } from '../../../../infrastructure/logger.js';
+import { normalizePath } from '../../../../shared/constants.js';
 import { toErrorMessage } from '../../../../shared/errors.js';
 import type {
   BetterSqlite3Database,
   ExtractorOutput,
+  FileToParse,
   MetadataUpdate,
   SqliteStatement,
 } from '../../../../types.js';
@@ -90,16 +92,30 @@ function marshalSymbolBatches(allSymbols: Map<string, ExtractorOutput>): InsertN
   return batches;
 }
 
-/** Build file hash entries from parsed symbols and precomputed/metadata sources. */
-function buildFileHashes(
-  allSymbols: Map<string, ExtractorOutput>,
+/**
+ * Build file hash entries for every collected file, including those that
+ * produced zero symbols (empty files, parsers that silently no-op'd, or
+ * optional-language extensions whose grammar wasn't installed). Iterating the
+ * symbol map instead would skip such files and leave them missing from
+ * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on
+ * any subsequent no-op rebuild (#1068).
+ *
+ * Exported for unit testing.
+ */
+export function buildFileHashes(
+  filesToParse: FileToParse[],
   precomputedData: Map<string, PrecomputedFileData>,
   metadataUpdates: MetadataUpdate[],
   rootDir: string,
 ): Array<{ file: string; hash: string; mtime: number; size: number }> {
   const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = [];
+  const seen = new Set<string>();
+
+  for (const item of filesToParse) {
+    const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
+    if (seen.has(relPath)) continue;
+    seen.add(relPath);
 
-  for (const [relPath] of allSymbols) {
     const precomputed = precomputedData.get(relPath);
     if (precomputed?._reverseDepOnly) {
       continue; // file unchanged, hash already correct
@@ -157,7 +173,7 @@ function tryNativeInsert(ctx: PipelineContext): boolean {
   for (const item of filesToParse) {
     if (item.relPath) precomputedData.set(item.relPath, item as PrecomputedFileData);
   }
-  const fileHashes = buildFileHashes(allSymbols, precomputedData, metadataUpdates, rootDir);
+  const fileHashes = buildFileHashes(filesToParse, precomputedData, metadataUpdates, rootDir);
 
   // In native-first mode (single rusqlite connection), no WAL dance is needed.
   // In dual-connection mode, checkpoint JS side before native write, then
@@ -321,15 +337,23 @@ function insertChildrenAndEdges(
 
 function updateFileHashes(
   _db: BetterSqlite3Database,
-  allSymbols: Map<string, ExtractorOutput>,
+  filesToParse: FileToParse[],
   precomputedData: Map<string, PrecomputedFileData>,
   metadataUpdates: MetadataUpdate[],
   rootDir: string,
   upsertHash: SqliteStatement | null,
 ): void {
   if (!upsertHash) return;
 
-  for (const [relPath] of allSymbols) {
+  // Iterate every collected file (#1068): files that produced zero symbols
+  // (empty, parser no-op, or grammar-missing optional language) still need a
+  // hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects.
+  const seen = new Set<string>();
+  for (const item of filesToParse) {
+    const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
+    if (seen.has(relPath)) continue;
+    seen.add(relPath);
+
     const precomputed = precomputedData.get(relPath);
     if (precomputed?._reverseDepOnly) {
       // no-op: file unchanged, hash already correct
@@ -415,7 +439,7 @@ export async function insertNodes(ctx: PipelineContext): Promise<void> {
   const insertAll = ctx.db.transaction(() => {
     insertDefinitionsAndExports(ctx.db, allSymbols);
     insertChildrenAndEdges(ctx.db, allSymbols);
-    updateFileHashes(ctx.db, allSymbols, precomputedData, metadataUpdates, rootDir, upsertHash);
+    updateFileHashes(ctx.db, filesToParse, precomputedData, metadataUpdates, rootDir, upsertHash);
   });
 
   insertAll();

diff --git a/tests/builder/insert-nodes.test.ts b/tests/builder/insert-nodes.test.ts
@@ -0,0 +1,123 @@
+/**
+ * Unit tests for insertNodes helpers.
+ *
+ * Regression coverage for #1068: the file-hash builder must emit a row for
+ * every collected file, even those whose parser produced zero symbols (empty
+ * files, parser no-op, or optional-language grammar unavailable). Skipping
+ * symbol-less files would leave the next no-op rebuild's fast-skip pre-flight
+ * (#1054) rejecting on `collected file missing from file_hashes` and force
+ * the full ~2s native pipeline.
+ */
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import { afterAll, beforeAll, describe, expect, it } from 'vitest';
+import { fileHash } from '../../src/domain/graph/builder/helpers.js';
+import { buildFileHashes } from '../../src/domain/graph/builder/stages/insert-nodes.js';
+
+let tmpDir: string;
+
+beforeAll(() => {
+  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-insert-nodes-'));
+  fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 1;');
+  // Symbol-less file (e.g. registered extension whose grammar wasn't installed,
+  // or a file the parser silently no-op'd on). Content is arbitrary — the
+  // hash builder must not care whether parsing produced any symbols.
+  fs.writeFileSync(path.join(tmpDir, 'b.clj'), '(comment "no symbols")');
+});
+
+afterAll(() => {
+  if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true });
+});
+
+describe('buildFileHashes', () => {
+  it('emits a row for every collected file, including symbol-less ones (#1068)', () => {
+    const filesToParse = [
+      { file: path.join(tmpDir, 'a.js') },
+      { file: path.join(tmpDir, 'b.clj') },
+    ];
+    const result = buildFileHashes(filesToParse, new Map(), [], tmpDir);
+
+    const files = result.map((r) => r.file).sort();
+    expect(files).toEqual(['a.js', 'b.clj']);
+    for (const row of result) {
+      expect(row.hash).toMatch(/^[0-9a-f]+$/);
+      expect(row.size).toBeGreaterThan(0);
+      expect(row.mtime).toBeGreaterThan(0);
+    }
+  });
+
+  it('uses precomputed hash when present', () => {
+    const aPath = path.join(tmpDir, 'a.js');
+    const precomputedHash = 'deadbeef';
+    const precomputed = new Map([
+      [
+        'a.js',
+        {
+          file: aPath,
+          relPath: 'a.js',
+          hash: precomputedHash,
+          stat: { mtime: 12345, size: 99 },
+        },
+      ],
+    ]);
+    const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir);
+
+    expect(result).toEqual([{ file: 'a.js', hash: precomputedHash, mtime: 12345, size: 99 }]);
+  });
+
+  it('skips files marked _reverseDepOnly (hash already correct)', () => {
+    const aPath = path.join(tmpDir, 'a.js');
+    const precomputed = new Map([
+      [
+        'a.js',
+        {
+          file: aPath,
+          relPath: 'a.js',
+          hash: 'unused',
+          _reverseDepOnly: true,
+        },
+      ],
+    ]);
+    const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir);
+
+    expect(result).toEqual([]);
+  });
+
+  it('falls back to reading file from disk when no precomputed data exists', () => {
+    const aPath = path.join(tmpDir, 'a.js');
+    const result = buildFileHashes([{ file: aPath }], new Map(), [], tmpDir);
+
+    expect(result).toHaveLength(1);
+    const row = result[0]!;
+    expect(row.file).toBe('a.js');
+    expect(row.hash).toBe(fileHash(fs.readFileSync(aPath, 'utf-8')));
+  });
+
+  it('appends metadata-only updates after the file iteration', () => {
+    const result = buildFileHashes(
+      [],
+      new Map(),
+      [{ relPath: 'meta.js', hash: 'abc', stat: { mtime: 10, size: 20 } }],
+      tmpDir,
+    );
+
+    expect(result).toEqual([{ file: 'meta.js', hash: 'abc', mtime: 10, size: 20 }]);
+  });
+
+  it('deduplicates when filesToParse contains the same relPath twice', () => {
+    const aPath = path.join(tmpDir, 'a.js');
+    const result = buildFileHashes(
+      [
+        { file: aPath, relPath: 'a.js' },
+        { file: aPath, relPath: 'a.js' },
+      ],
+      new Map(),
+      [],
+      tmpDir,
+    );
+
+    expect(result).toHaveLength(1);
+    expect(result[0]!.file).toBe('a.js');
+  });
+});