diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index 767233058..6c37e3d10 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -43,7 +43,14 @@ import { import { writeJournalHeader } from '../journal.js'; import { setWorkspaces } from '../resolve.js'; import { PipelineContext } from './context.js'; -import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js'; +import { + batchInsertNodes, + collectFiles as collectFilesUtil, + fileHash, + fileStat, + loadPathAliases, + readFileSafe, +} from './helpers.js'; import { NativeDbProxy } from './native-db-proxy.js'; import { buildEdges } from './stages/build-edges.js'; import { buildStructure } from './stages/build-structure.js'; @@ -731,12 +738,15 @@ async function tryNativeOrchestrator( // stale native binaries). WASM handles those — backfill via WASM so both // engines process the same file set (#967). // - // Only runs on full builds: incremental builds only touch changed files, - // which are parsed through parseFilesAuto (which has its own per-file - // backfill), so a full filesystem scan here would be wasted work. - if (result.isFullBuild) { - await backfillNativeDroppedFiles(ctx); - } + // Runs on every successful orchestrator pass (not just full builds): on + // incrementals the orchestrator's change detection treats files outside + // Rust's narrower file_collector as `removed` and deletes their nodes + + // file_hashes rows. Without re-running the backfill we'd lose the symbols + // for those files and permanently break the JS-side fast-skip pre-flight + // (#1054, #1068). The function is cheap (single fs scan + DB query) when + // nothing is missing, and on no-op rebuilds the missing-set is re-derived + // from `nodes`, so it catches whatever Rust just deleted. + await backfillNativeDroppedFiles(ctx); closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); return formatNativeTimingResult(p, structurePatchMs, analysisTiming); @@ -747,22 +757,40 @@ async function tryNativeOrchestrator( * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). */ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { - // Needs a real better-sqlite3 connection for INSERT. - if (ctx.nativeFirstProxy) { - closeNativeDb(ctx, 'pre-parity-backfill'); - ctx.db = openDb(ctx.dbPath); - ctx.nativeFirstProxy = false; - } - + // Compute the missing-file set FIRST, before any expensive DB handoff. + // NativeDbProxy supports .prepare().all(), so the upfront query works + // whether ctx.db is a proxy or a real better-sqlite3 connection. On + // incremental no-op rebuilds nothing is missing, so we want to early-return + // without paying the close-native / reopen-better-sqlite3 cost. const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); const expected = new Set( collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), ); - const existingRows = ctx.db + const existingNodeRows = ctx.db .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") .all() as Array<{ file: string }>; - const existing = new Set(existingRows.map((r) => r.file)); + const existingNodes = new Set(existingNodeRows.map((r) => r.file)); + + // Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight + // (#1054) rejects on `file_hashes` gaps, and the two tables can diverge + // (e.g. a DB written by old code where `nodes` was populated but + // `file_hashes` was not). Treating "in nodes but not in file_hashes" as + // missing closes the gap so the backfill repairs the file_hashes row even + // when the node row already exists. + let existingHashes = new Set(); + try { + const existingHashRows = ctx.db + .prepare('SELECT DISTINCT file FROM file_hashes') + .all() as Array<{ file: string }>; + existingHashes = new Set(existingHashRows.map((r) => r.file)); + } catch (e) { + // file_hashes table may not exist on legacy DBs; treat as fully missing + // so the backfill writes rows on the upsert path below. + debug( + `backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, + ); + } // Restrict backfill to files with an installed WASM grammar. Extensions in // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on @@ -772,7 +800,9 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { const missingRel: string[] = []; const missingAbs: string[] = []; for (const rel of expected) { - if (existing.has(rel)) continue; + // A file is "missing" if it's absent from EITHER nodes OR file_hashes. + // Both must be present for fast-skip to work correctly. + if (existingNodes.has(rel) && existingHashes.has(rel)) continue; const ext = path.extname(rel).toLowerCase(); if (!installedExts.has(ext)) continue; missingRel.push(rel); @@ -780,6 +810,14 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { } if (missingAbs.length === 0) return; + // Now that we know there's work to do, hand off to better-sqlite3 (needed + // for the INSERT path below). + if (ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-parity-backfill'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } + // Classify drops so users see per-extension reasons instead of just a count // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust // extractor); `native-extractor-failure` indicates a real native bug since @@ -856,6 +894,47 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { } } + // Persist file_hashes rows for every backfilled file. The Rust orchestrator + // only hashes files it parsed itself, so without this step files in + // optional-language extensions (e.g. .clj when no Rust extractor exists) + // would be missing from `file_hashes` — permanently breaking the JS-side + // fast-skip pre-flight (#1054), which rejects on `collected file missing + // from file_hashes` and forces every no-op rebuild back through the full + // ~2s native pipeline (#1068). + // + // Iterates `missingRel` (every collected file the Rust orchestrator + // dropped), not `wasmResults`, so files that produced zero symbols still + // get a row. + try { + const upsertHash = db.prepare( + 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', + ); + const writeHashes = db.transaction(() => { + for (let i = 0; i < missingRel.length; i++) { + const relPath = missingRel[i]; + const absPath = missingAbs[i]; + if (!relPath || !absPath) continue; + let code: string | null; + try { + code = readFileSafe(absPath); + } catch (e) { + debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`); + continue; + } + if (code === null) continue; + const stat = fileStat(absPath); + const mtime = stat ? Math.floor(stat.mtimeMs) : 0; + const size = stat ? stat.size : 0; + upsertHash.run(relPath, fileHash(code), mtime, size); + } + }); + writeHashes(); + } catch (e) { + debug( + `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`, + ); + } + // Free WASM parse trees from the inline backfill path (#1058). // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree // backed by WASM linear memory) on every result, but these symbols are diff --git a/src/domain/graph/builder/stages/insert-nodes.ts b/src/domain/graph/builder/stages/insert-nodes.ts index f069411b5..339b11237 100644 --- a/src/domain/graph/builder/stages/insert-nodes.ts +++ b/src/domain/graph/builder/stages/insert-nodes.ts @@ -12,10 +12,12 @@ import path from 'node:path'; import { performance } from 'node:perf_hooks'; import { bulkNodeIdsByFile } from '../../../../db/index.js'; import { debug } from '../../../../infrastructure/logger.js'; +import { normalizePath } from '../../../../shared/constants.js'; import { toErrorMessage } from '../../../../shared/errors.js'; import type { BetterSqlite3Database, ExtractorOutput, + FileToParse, MetadataUpdate, SqliteStatement, } from '../../../../types.js'; @@ -90,16 +92,30 @@ function marshalSymbolBatches(allSymbols: Map): InsertN return batches; } -/** Build file hash entries from parsed symbols and precomputed/metadata sources. */ -function buildFileHashes( - allSymbols: Map, +/** + * Build file hash entries for every collected file, including those that + * produced zero symbols (empty files, parsers that silently no-op'd, or + * optional-language extensions whose grammar wasn't installed). Iterating the + * symbol map instead would skip such files and leave them missing from + * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on + * any subsequent no-op rebuild (#1068). + * + * Exported for unit testing. + */ +export function buildFileHashes( + filesToParse: FileToParse[], precomputedData: Map, metadataUpdates: MetadataUpdate[], rootDir: string, ): Array<{ file: string; hash: string; mtime: number; size: number }> { const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = []; + const seen = new Set(); + + for (const item of filesToParse) { + const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file)); + if (seen.has(relPath)) continue; + seen.add(relPath); - for (const [relPath] of allSymbols) { const precomputed = precomputedData.get(relPath); if (precomputed?._reverseDepOnly) { continue; // file unchanged, hash already correct @@ -157,7 +173,7 @@ function tryNativeInsert(ctx: PipelineContext): boolean { for (const item of filesToParse) { if (item.relPath) precomputedData.set(item.relPath, item as PrecomputedFileData); } - const fileHashes = buildFileHashes(allSymbols, precomputedData, metadataUpdates, rootDir); + const fileHashes = buildFileHashes(filesToParse, precomputedData, metadataUpdates, rootDir); // In native-first mode (single rusqlite connection), no WAL dance is needed. // In dual-connection mode, checkpoint JS side before native write, then @@ -321,7 +337,7 @@ function insertChildrenAndEdges( function updateFileHashes( _db: BetterSqlite3Database, - allSymbols: Map, + filesToParse: FileToParse[], precomputedData: Map, metadataUpdates: MetadataUpdate[], rootDir: string, @@ -329,7 +345,15 @@ function updateFileHashes( ): void { if (!upsertHash) return; - for (const [relPath] of allSymbols) { + // Iterate every collected file (#1068): files that produced zero symbols + // (empty, parser no-op, or grammar-missing optional language) still need a + // hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects. + const seen = new Set(); + for (const item of filesToParse) { + const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file)); + if (seen.has(relPath)) continue; + seen.add(relPath); + const precomputed = precomputedData.get(relPath); if (precomputed?._reverseDepOnly) { // no-op: file unchanged, hash already correct @@ -415,7 +439,7 @@ export async function insertNodes(ctx: PipelineContext): Promise { const insertAll = ctx.db.transaction(() => { insertDefinitionsAndExports(ctx.db, allSymbols); insertChildrenAndEdges(ctx.db, allSymbols); - updateFileHashes(ctx.db, allSymbols, precomputedData, metadataUpdates, rootDir, upsertHash); + updateFileHashes(ctx.db, filesToParse, precomputedData, metadataUpdates, rootDir, upsertHash); }); insertAll(); diff --git a/tests/builder/insert-nodes.test.ts b/tests/builder/insert-nodes.test.ts new file mode 100644 index 000000000..ec6338785 --- /dev/null +++ b/tests/builder/insert-nodes.test.ts @@ -0,0 +1,123 @@ +/** + * Unit tests for insertNodes helpers. + * + * Regression coverage for #1068: the file-hash builder must emit a row for + * every collected file, even those whose parser produced zero symbols (empty + * files, parser no-op, or optional-language grammar unavailable). Skipping + * symbol-less files would leave the next no-op rebuild's fast-skip pre-flight + * (#1054) rejecting on `collected file missing from file_hashes` and force + * the full ~2s native pipeline. + */ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { fileHash } from '../../src/domain/graph/builder/helpers.js'; +import { buildFileHashes } from '../../src/domain/graph/builder/stages/insert-nodes.js'; + +let tmpDir: string; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-insert-nodes-')); + fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 1;'); + // Symbol-less file (e.g. registered extension whose grammar wasn't installed, + // or a file the parser silently no-op'd on). Content is arbitrary — the + // hash builder must not care whether parsing produced any symbols. + fs.writeFileSync(path.join(tmpDir, 'b.clj'), '(comment "no symbols")'); +}); + +afterAll(() => { + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +describe('buildFileHashes', () => { + it('emits a row for every collected file, including symbol-less ones (#1068)', () => { + const filesToParse = [ + { file: path.join(tmpDir, 'a.js') }, + { file: path.join(tmpDir, 'b.clj') }, + ]; + const result = buildFileHashes(filesToParse, new Map(), [], tmpDir); + + const files = result.map((r) => r.file).sort(); + expect(files).toEqual(['a.js', 'b.clj']); + for (const row of result) { + expect(row.hash).toMatch(/^[0-9a-f]+$/); + expect(row.size).toBeGreaterThan(0); + expect(row.mtime).toBeGreaterThan(0); + } + }); + + it('uses precomputed hash when present', () => { + const aPath = path.join(tmpDir, 'a.js'); + const precomputedHash = 'deadbeef'; + const precomputed = new Map([ + [ + 'a.js', + { + file: aPath, + relPath: 'a.js', + hash: precomputedHash, + stat: { mtime: 12345, size: 99 }, + }, + ], + ]); + const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir); + + expect(result).toEqual([{ file: 'a.js', hash: precomputedHash, mtime: 12345, size: 99 }]); + }); + + it('skips files marked _reverseDepOnly (hash already correct)', () => { + const aPath = path.join(tmpDir, 'a.js'); + const precomputed = new Map([ + [ + 'a.js', + { + file: aPath, + relPath: 'a.js', + hash: 'unused', + _reverseDepOnly: true, + }, + ], + ]); + const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir); + + expect(result).toEqual([]); + }); + + it('falls back to reading file from disk when no precomputed data exists', () => { + const aPath = path.join(tmpDir, 'a.js'); + const result = buildFileHashes([{ file: aPath }], new Map(), [], tmpDir); + + expect(result).toHaveLength(1); + const row = result[0]!; + expect(row.file).toBe('a.js'); + expect(row.hash).toBe(fileHash(fs.readFileSync(aPath, 'utf-8'))); + }); + + it('appends metadata-only updates after the file iteration', () => { + const result = buildFileHashes( + [], + new Map(), + [{ relPath: 'meta.js', hash: 'abc', stat: { mtime: 10, size: 20 } }], + tmpDir, + ); + + expect(result).toEqual([{ file: 'meta.js', hash: 'abc', mtime: 10, size: 20 }]); + }); + + it('deduplicates when filesToParse contains the same relPath twice', () => { + const aPath = path.join(tmpDir, 'a.js'); + const result = buildFileHashes( + [ + { file: aPath, relPath: 'a.js' }, + { file: aPath, relPath: 'a.js' }, + ], + new Map(), + [], + tmpDir, + ); + + expect(result).toHaveLength(1); + expect(result[0]!.file).toBe('a.js'); + }); +});