Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 96 additions & 17 deletions src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,14 @@ import {
import { writeJournalHeader } from '../journal.js';
import { setWorkspaces } from '../resolve.js';
import { PipelineContext } from './context.js';
import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
import {
batchInsertNodes,
collectFiles as collectFilesUtil,
fileHash,
fileStat,
loadPathAliases,
readFileSafe,
} from './helpers.js';
import { NativeDbProxy } from './native-db-proxy.js';
import { buildEdges } from './stages/build-edges.js';
import { buildStructure } from './stages/build-structure.js';
Expand Down Expand Up @@ -731,12 +738,15 @@ async function tryNativeOrchestrator(
// stale native binaries). WASM handles those — backfill via WASM so both
// engines process the same file set (#967).
//
// Only runs on full builds: incremental builds only touch changed files,
// which are parsed through parseFilesAuto (which has its own per-file
// backfill), so a full filesystem scan here would be wasted work.
if (result.isFullBuild) {
await backfillNativeDroppedFiles(ctx);
}
// Runs on every successful orchestrator pass (not just full builds): on
// incrementals the orchestrator's change detection treats files outside
// Rust's narrower file_collector as `removed` and deletes their nodes +
// file_hashes rows. Without re-running the backfill we'd lose the symbols
// for those files and permanently break the JS-side fast-skip pre-flight
// (#1054, #1068). The function is cheap (one fs scan + two read-only DB
// queries) when nothing is missing, and on no-op rebuilds the missing-set is
// re-derived from `nodes` and `file_hashes`, so it catches whatever Rust just
// deleted.
await backfillNativeDroppedFiles(ctx);

closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
Expand All @@ -747,22 +757,40 @@ async function tryNativeOrchestrator(
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
*/
async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
// Needs a real better-sqlite3 connection for INSERT.
if (ctx.nativeFirstProxy) {
closeNativeDb(ctx, 'pre-parity-backfill');
ctx.db = openDb(ctx.dbPath);
ctx.nativeFirstProxy = false;
}

// Compute the missing-file set FIRST, before any expensive DB handoff.
// NativeDbProxy supports .prepare().all(), so the upfront query works
// whether ctx.db is a proxy or a real better-sqlite3 connection. On
// incremental no-op rebuilds nothing is missing, so we want to early-return
// without paying the close-native / reopen-better-sqlite3 cost.
const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
const expected = new Set(
collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
);

const existingRows = ctx.db
const existingNodeRows = ctx.db
.prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
.all() as Array<{ file: string }>;
const existing = new Set(existingRows.map((r) => r.file));
const existingNodes = new Set(existingNodeRows.map((r) => r.file));

// Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight
// (#1054) rejects on `file_hashes` gaps, and the two tables can diverge
// (e.g. a DB written by old code where `nodes` was populated but
// `file_hashes` was not). Treating "in nodes but not in file_hashes" as
// missing closes the gap so the backfill repairs the file_hashes row even
// when the node row already exists.
let existingHashes = new Set<string>();
try {
const existingHashRows = ctx.db
.prepare('SELECT DISTINCT file FROM file_hashes')
.all() as Array<{ file: string }>;
existingHashes = new Set(existingHashRows.map((r) => r.file));
} catch (e) {
// file_hashes table may not exist on legacy DBs; treat as fully missing
// so the backfill writes rows on the upsert path below.
debug(
`backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`,
);
}

// Restrict backfill to files with an installed WASM grammar. Extensions in
// LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
Expand All @@ -772,14 +800,24 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
const missingRel: string[] = [];
const missingAbs: string[] = [];
for (const rel of expected) {
if (existing.has(rel)) continue;
// A file is "missing" if it's absent from EITHER nodes OR file_hashes.
// Both must be present for fast-skip to work correctly.
if (existingNodes.has(rel) && existingHashes.has(rel)) continue;
const ext = path.extname(rel).toLowerCase();
if (!installedExts.has(ext)) continue;
missingRel.push(rel);
missingAbs.push(path.join(ctx.rootDir, rel));
}
if (missingAbs.length === 0) return;

// Now that we know there's work to do, hand off to better-sqlite3 (needed
// for the INSERT path below).
if (ctx.nativeFirstProxy) {
closeNativeDb(ctx, 'pre-parity-backfill');
ctx.db = openDb(ctx.dbPath);
ctx.nativeFirstProxy = false;
}

// Classify drops so users see per-extension reasons instead of just a count
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
// extractor); `native-extractor-failure` indicates a real native bug since
Expand Down Expand Up @@ -856,6 +894,47 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
}
}

// Persist file_hashes rows for every backfilled file. The Rust orchestrator
// only hashes files it parsed itself, so without this step files in
// optional-language extensions (e.g. .clj when no Rust extractor exists)
// would be missing from `file_hashes` — permanently breaking the JS-side
// fast-skip pre-flight (#1054), which rejects on `collected file missing
// from file_hashes` and forces every no-op rebuild back through the full
// ~2s native pipeline (#1068).
//
// Iterates `missingRel` (every collected file the Rust orchestrator
// dropped), not `wasmResults`, so files that produced zero symbols still
// get a row.
try {
const upsertHash = db.prepare(
'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)',
);
const writeHashes = db.transaction(() => {
for (let i = 0; i < missingRel.length; i++) {
const relPath = missingRel[i];
const absPath = missingAbs[i];
if (!relPath || !absPath) continue;
let code: string | null;
try {
code = readFileSafe(absPath);
} catch (e) {
debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`);
continue;
}
if (code === null) continue;
const stat = fileStat(absPath);
const mtime = stat ? Math.floor(stat.mtimeMs) : 0;
const size = stat ? stat.size : 0;
upsertHash.run(relPath, fileHash(code), mtime, size);
}
});
writeHashes();
} catch (e) {
debug(
`backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`,
);
}

// Free WASM parse trees from the inline backfill path (#1058).
// `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree
// backed by WASM linear memory) on every result, but these symbols are
Expand Down
40 changes: 32 additions & 8 deletions src/domain/graph/builder/stages/insert-nodes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ import path from 'node:path';
import { performance } from 'node:perf_hooks';
import { bulkNodeIdsByFile } from '../../../../db/index.js';
import { debug } from '../../../../infrastructure/logger.js';
import { normalizePath } from '../../../../shared/constants.js';
import { toErrorMessage } from '../../../../shared/errors.js';
import type {
BetterSqlite3Database,
ExtractorOutput,
FileToParse,
MetadataUpdate,
SqliteStatement,
} from '../../../../types.js';
Expand Down Expand Up @@ -90,16 +92,30 @@ function marshalSymbolBatches(allSymbols: Map<string, ExtractorOutput>): InsertN
return batches;
}

/** Build file hash entries from parsed symbols and precomputed/metadata sources. */
function buildFileHashes(
allSymbols: Map<string, ExtractorOutput>,
/**
* Build file hash entries for every collected file, including those that
* produced zero symbols (empty files, parsers that silently no-op'd, or
* optional-language extensions whose grammar wasn't installed). Iterating the
* symbol map instead would skip such files and leave them missing from
* `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on
* any subsequent no-op rebuild (#1068).
*
* Exported for unit testing.
*/
export function buildFileHashes(
filesToParse: FileToParse[],
precomputedData: Map<string, PrecomputedFileData>,
metadataUpdates: MetadataUpdate[],
rootDir: string,
): Array<{ file: string; hash: string; mtime: number; size: number }> {
const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = [];
const seen = new Set<string>();

for (const item of filesToParse) {
const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
if (seen.has(relPath)) continue;
seen.add(relPath);

for (const [relPath] of allSymbols) {
const precomputed = precomputedData.get(relPath);
if (precomputed?._reverseDepOnly) {
continue; // file unchanged, hash already correct
Expand Down Expand Up @@ -157,7 +173,7 @@ function tryNativeInsert(ctx: PipelineContext): boolean {
for (const item of filesToParse) {
if (item.relPath) precomputedData.set(item.relPath, item as PrecomputedFileData);
}
const fileHashes = buildFileHashes(allSymbols, precomputedData, metadataUpdates, rootDir);
const fileHashes = buildFileHashes(filesToParse, precomputedData, metadataUpdates, rootDir);

// In native-first mode (single rusqlite connection), no WAL dance is needed.
// In dual-connection mode, checkpoint JS side before native write, then
Expand Down Expand Up @@ -321,15 +337,23 @@ function insertChildrenAndEdges(

function updateFileHashes(
_db: BetterSqlite3Database,
allSymbols: Map<string, ExtractorOutput>,
filesToParse: FileToParse[],
precomputedData: Map<string, PrecomputedFileData>,
metadataUpdates: MetadataUpdate[],
rootDir: string,
upsertHash: SqliteStatement | null,
): void {
if (!upsertHash) return;

for (const [relPath] of allSymbols) {
// Iterate every collected file (#1068): files that produced zero symbols
// (empty, parser no-op, or grammar-missing optional language) still need a
// hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects.
const seen = new Set<string>();
for (const item of filesToParse) {
const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
if (seen.has(relPath)) continue;
seen.add(relPath);

const precomputed = precomputedData.get(relPath);
if (precomputed?._reverseDepOnly) {
// no-op: file unchanged, hash already correct
Expand Down Expand Up @@ -415,7 +439,7 @@ export async function insertNodes(ctx: PipelineContext): Promise<void> {
const insertAll = ctx.db.transaction(() => {
insertDefinitionsAndExports(ctx.db, allSymbols);
insertChildrenAndEdges(ctx.db, allSymbols);
updateFileHashes(ctx.db, allSymbols, precomputedData, metadataUpdates, rootDir, upsertHash);
updateFileHashes(ctx.db, filesToParse, precomputedData, metadataUpdates, rootDir, upsertHash);
});

insertAll();
Expand Down
123 changes: 123 additions & 0 deletions tests/builder/insert-nodes.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/**
* Unit tests for insertNodes helpers.
*
* Regression coverage for #1068: the file-hash builder must emit a row for
* every collected file, even those whose parser produced zero symbols (empty
* files, parser no-op, or optional-language grammar unavailable). Skipping
* symbol-less files would leave the next no-op rebuild's fast-skip pre-flight
* (#1054) rejecting on `collected file missing from file_hashes` and force
* the full ~2s native pipeline.
*/
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
import { fileHash } from '../../src/domain/graph/builder/helpers.js';
import { buildFileHashes } from '../../src/domain/graph/builder/stages/insert-nodes.js';

let tmpDir: string;

// Build the throwaway fixture directory shared by every test in this file.
beforeAll(() => {
  const prefix = path.join(os.tmpdir(), 'codegraph-insert-nodes-');
  tmpDir = fs.mkdtempSync(prefix);

  // `b.clj` plays the role of a symbol-less file (e.g. a registered extension
  // whose grammar wasn't installed, or a file the parser silently no-op'd on).
  // Its content is arbitrary: the hash builder must not care whether parsing
  // produced any symbols.
  const fixtures: ReadonlyArray<readonly [string, string]> = [
    ['a.js', 'export const a = 1;'],
    ['b.clj', '(comment "no symbols")'],
  ];
  for (const [name, content] of fixtures) {
    fs.writeFileSync(path.join(tmpDir, name), content);
  }
});

// Remove the fixture directory once the suite finishes. Cleanup is skipped
// when `tmpDir` is falsy (i.e. it was never assigned).
afterAll(() => {
  if (!tmpDir) return;
  fs.rmSync(tmpDir, { recursive: true, force: true });
});

describe('buildFileHashes', () => {
  /**
   * Absolute path of a fixture inside the temp directory. Must be called from
   * within a test body: `tmpDir` is only assigned once `beforeAll` has run.
   */
  const fixturePath = (name: string): string => path.join(tmpDir, name);

  it('emits a row for every collected file, including symbol-less ones (#1068)', () => {
    // No `relPath` is supplied, so the builder has to derive it from `rootDir`.
    const collected = ['a.js', 'b.clj'].map((name) => ({ file: fixturePath(name) }));
    const rows = buildFileHashes(collected, new Map(), [], tmpDir);

    const emitted = rows.map((row) => row.file).sort();
    expect(emitted).toEqual(['a.js', 'b.clj']);
    rows.forEach((row) => {
      expect(row.hash).toMatch(/^[0-9a-f]+$/);
      expect(row.size).toBeGreaterThan(0);
      expect(row.mtime).toBeGreaterThan(0);
    });
  });

  it('uses precomputed hash when present', () => {
    const sourcePath = fixturePath('a.js');
    const cannedHash = 'deadbeef';
    const precomputed = new Map([
      [
        'a.js',
        {
          file: sourcePath,
          relPath: 'a.js',
          hash: cannedHash,
          stat: { mtime: 12345, size: 99 },
        },
      ],
    ]);

    const rows = buildFileHashes(
      [{ file: sourcePath, relPath: 'a.js' }],
      precomputed,
      [],
      tmpDir,
    );

    // The canned hash and stat values must be passed through untouched.
    expect(rows).toEqual([{ file: 'a.js', hash: cannedHash, mtime: 12345, size: 99 }]);
  });

  it('skips files marked _reverseDepOnly (hash already correct)', () => {
    const sourcePath = fixturePath('a.js');
    const precomputed = new Map([
      [
        'a.js',
        {
          file: sourcePath,
          relPath: 'a.js',
          hash: 'unused',
          _reverseDepOnly: true,
        },
      ],
    ]);

    const rows = buildFileHashes(
      [{ file: sourcePath, relPath: 'a.js' }],
      precomputed,
      [],
      tmpDir,
    );

    expect(rows).toEqual([]);
  });

  it('falls back to reading file from disk when no precomputed data exists', () => {
    const sourcePath = fixturePath('a.js');
    const rows = buildFileHashes([{ file: sourcePath }], new Map(), [], tmpDir);

    expect(rows).toHaveLength(1);
    const only = rows[0]!;
    expect(only.file).toBe('a.js');
    // Expected hash is recomputed from the on-disk content with the same helper.
    expect(only.hash).toBe(fileHash(fs.readFileSync(sourcePath, 'utf-8')));
  });

  it('appends metadata-only updates after the file iteration', () => {
    const metadataOnly = [{ relPath: 'meta.js', hash: 'abc', stat: { mtime: 10, size: 20 } }];
    const rows = buildFileHashes([], new Map(), metadataOnly, tmpDir);

    expect(rows).toEqual([{ file: 'meta.js', hash: 'abc', mtime: 10, size: 20 }]);
  });

  it('deduplicates when filesToParse contains the same relPath twice', () => {
    const entry = { file: fixturePath('a.js'), relPath: 'a.js' };
    // Two distinct objects carrying the same relPath.
    const rows = buildFileHashes([entry, { ...entry }], new Map(), [], tmpDir);

    expect(rows).toHaveLength(1);
    expect(rows[0]!.file).toBe('a.js');
  });
});
Loading