diff --git a/src/domain/graph/builder/helpers.ts b/src/domain/graph/builder/helpers.ts index c6cbd4845..4b3665a5d 100644 --- a/src/domain/graph/builder/helpers.ts +++ b/src/domain/graph/builder/helpers.ts @@ -76,108 +76,117 @@ export function passesIncludeExclude( return true; } +/** Per-walk state computed once at the top-level invocation. */ +interface CollectContext { + readonly rootDir: string; + readonly includeRegexes: readonly RegExp[]; + readonly excludeRegexes: readonly RegExp[]; + readonly hasGlobFilters: boolean; + readonly extraIgnore: Set | null; + readonly visited: Set; +} + +/** Detect a symlink loop for `dir`. Returns true if `dir` was already visited. */ +function isSymlinkLoop(dir: string, visited: Set): boolean { + let realDir: string; + try { + realDir = fs.realpathSync(dir); + } catch { + return true; + } + if (visited.has(realDir)) { + warn(`Symlink loop detected, skipping: ${dir}`); + return true; + } + visited.add(realDir); + return false; +} + +/** Read directory entries, returning null on error (already logged). */ +function readDirSafe(dir: string): fs.Dirent[] | null { + try { + return fs.readdirSync(dir, { withFileTypes: true }); + } catch (err: unknown) { + warn(`Cannot read directory ${dir}: ${(err as Error).message}`); + return null; + } +} + +/** True if `entry` is a source file we should collect under `ctx`. */ +function isCollectableSourceFile(full: string, entry: fs.Dirent, ctx: CollectContext): boolean { + if (!EXTENSIONS.has(path.extname(entry.name))) return false; + if (!ctx.hasGlobFilters) return true; + const rel = normalizePath(path.relative(ctx.rootDir, full)); + return passesIncludeExclude(rel, ctx.includeRegexes, ctx.excludeRegexes); +} + +function walkCollect( + dir: string, + files: string[], + directories: Set | null, + ctx: CollectContext, +): void { + if (isSymlinkLoop(dir, ctx.visited)) return; + + const entries = readDirSafe(dir); + if (!entries) return; + + let hasFiles = false; + for (const entry of entries) { + if (shouldSkipEntry(entry, ctx.extraIgnore)) continue; + + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + walkCollect(full, files, directories, ctx); + } else if (isCollectableSourceFile(full, entry, ctx)) { + files.push(full); + hasFiles = true; + } + } + if (directories && hasFiles) { + directories.add(dir); + } +} + /** * Recursively collect all source files under `dir`. * When `directories` is a Set, also tracks which directories contain files. * - * The first invocation establishes `dir` as the project root against which - * `config.include` / `config.exclude` globs are matched. + * `dir` establishes the project root against which `config.include` / + * `config.exclude` globs are matched. */ export function collectFiles( dir: string, files: string[], config: Partial, directories: Set, - _visited?: Set, - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): { files: string[]; directories: Set }; export function collectFiles( dir: string, files?: string[], config?: Partial, directories?: null, - _visited?: Set, - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): string[]; export function collectFiles( dir: string, files: string[] = [], config: Partial = {}, directories: Set | null = null, - _visited: Set = new Set(), - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): string[] | { files: string[]; directories: Set } { const trackDirs = directories instanceof Set; - let hasFiles = false; - - // First call: compute root and compile include/exclude patterns once, - // then pass them down recursive calls so we don't recompile per directory. - const rootDir = _rootDir ?? dir; - const includeRegexes = _includeRegexes ?? compileGlobs(config.include); - const excludeRegexes = _excludeRegexes ?? compileGlobs(config.exclude); - const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; - - // Merge config ignoreDirs with defaults - const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null; - - // Detect symlink loops (before I/O to avoid wasted readdirSync) - let realDir: string; - try { - realDir = fs.realpathSync(dir); - } catch { - return trackDirs ? { files, directories: directories as Set } : files; - } - if (_visited.has(realDir)) { - warn(`Symlink loop detected, skipping: ${dir}`); - return trackDirs ? { files, directories: directories as Set } : files; - } - _visited.add(realDir); - - let entries: fs.Dirent[]; - try { - entries = fs.readdirSync(dir, { withFileTypes: true }); - } catch (err: unknown) { - warn(`Cannot read directory ${dir}: ${(err as Error).message}`); - return trackDirs ? { files, directories: directories as Set } : files; - } + const includeRegexes = compileGlobs(config.include); + const excludeRegexes = compileGlobs(config.exclude); + const ctx: CollectContext = { + rootDir: dir, + includeRegexes, + excludeRegexes, + hasGlobFilters: includeRegexes.length > 0 || excludeRegexes.length > 0, + extraIgnore: config.ignoreDirs ? new Set(config.ignoreDirs) : null, + visited: new Set(), + }; - for (const entry of entries) { - if (shouldSkipEntry(entry, extraIgnore)) continue; + walkCollect(dir, files, trackDirs ? (directories as Set) : null, ctx); - const full = path.join(dir, entry.name); - if (entry.isDirectory()) { - if (trackDirs) { - collectFiles( - full, - files, - config, - directories as Set, - _visited, - rootDir, - includeRegexes, - excludeRegexes, - ); - } else { - collectFiles(full, files, config, null, _visited, rootDir, includeRegexes, excludeRegexes); - } - } else if (EXTENSIONS.has(path.extname(entry.name))) { - if (hasGlobFilters) { - const rel = normalizePath(path.relative(rootDir, full)); - if (!passesIncludeExclude(rel, includeRegexes, excludeRegexes)) continue; - } - files.push(full); - hasFiles = true; - } - } - if (trackDirs && hasFiles) { - (directories as Set).add(dir); - } return trackDirs ? { files, directories: directories as Set } : files; } diff --git a/src/domain/graph/builder/incremental.ts b/src/domain/graph/builder/incremental.ts index 66853983e..d7aa488ed 100644 --- a/src/domain/graph/builder/incremental.ts +++ b/src/domain/graph/builder/incremental.ts @@ -307,6 +307,63 @@ function resolveBarrelImportEdges( return edgesAdded; } +/** Emit symbol-level `imports-type` edges for a single `import type` statement. */ +function emitTypeOnlySymbolEdges( + db: BetterSqlite3Database | null, + stmts: IncrementalStmts, + imp: ExtractorOutput['imports'][number], + resolvedPath: string, + fileNodeId: number, +): number { + let edgesAdded = 0; + for (const name of imp.names) { + const cleanName = name.replace(/^\*\s+as\s+/, ''); + let targetFile = resolvedPath; + if (db && isBarrelFile(db, resolvedPath)) { + const actual = resolveBarrelTarget(db, resolvedPath, cleanName); + if (actual) targetFile = actual; + } + const candidates = stmts.findNodeInFile.all(cleanName, targetFile) as Array<{ + id: number; + file: string; + }>; + if (candidates.length === 0) continue; + stmts.insertEdge.run(fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0); + edgesAdded++; + } + return edgesAdded; +} + +/** + * Process a single import statement: emit the file→file edge, any + * symbol-level type-only edges, and barrel re-export edges. + */ +function emitEdgesForImport( + stmts: IncrementalStmts, + imp: ExtractorOutput['imports'][number], + fileNodeId: number, + relPath: string, + rootDir: string, + aliases: PathAliases, + db: BetterSqlite3Database | null, +): number { + const resolvedPath = resolveImportPath(path.join(rootDir, relPath), imp.source, rootDir, aliases); + const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0); + if (!targetRow) return 0; + + const edgeKind = imp.reexport ? 'reexports' : imp.typeOnly ? 'imports-type' : 'imports'; + stmts.insertEdge.run(fileNodeId, targetRow.id, edgeKind, 1.0, 0); + let edgesAdded = 1; + + if (imp.typeOnly) { + edgesAdded += emitTypeOnlySymbolEdges(db, stmts, imp, resolvedPath, fileNodeId); + } + if (!imp.reexport && db) { + edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeId, resolvedPath, imp); + } + return edgesAdded; +} + function buildImportEdges( stmts: IncrementalStmts, relPath: string, @@ -318,44 +375,7 @@ function buildImportEdges( ): number { let edgesAdded = 0; for (const imp of symbols.imports) { - const resolvedPath = resolveImportPath( - path.join(rootDir, relPath), - imp.source, - rootDir, - aliases, - ); - const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0); - if (targetRow) { - const edgeKind = imp.reexport ? 'reexports' : imp.typeOnly ? 'imports-type' : 'imports'; - stmts.insertEdge.run(fileNodeId, targetRow.id, edgeKind, 1.0, 0); - edgesAdded++; - - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. - if (imp.typeOnly) { - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - let targetFile = resolvedPath; - if (db && isBarrelFile(db, resolvedPath)) { - const actual = resolveBarrelTarget(db, resolvedPath, cleanName); - if (actual) targetFile = actual; - } - const candidates = stmts.findNodeInFile.all(cleanName, targetFile) as Array<{ - id: number; - file: string; - }>; - if (candidates.length > 0) { - stmts.insertEdge.run(fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0); - edgesAdded++; - } - } - } - - // Barrel resolution: create edges through re-export chains - if (!imp.reexport && db) { - edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeId, resolvedPath, imp); - } - } + edgesAdded += emitEdgesForImport(stmts, imp, fileNodeId, relPath, rootDir, aliases, db); } return edgesAdded; } @@ -491,6 +511,122 @@ function buildCallEdges( // ── Main entry point ──────────────────────────────────────────────────── +/** Build the "this file was deleted" result returned by `rebuildFile`. */ +function buildDeletionResult( + relPath: string, + oldNodes: number, + oldSymbols: unknown[], + diffSymbols: ((old: unknown[], new_: unknown[]) => unknown) | undefined, +): RebuildResult { + const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, []) : null; + return { + file: relPath, + nodesAdded: 0, + nodesRemoved: oldNodes, + edgesAdded: 0, + deleted: true, + event: 'deleted', + symbolDiff, + nodesBefore: oldNodes, + nodesAfter: 0, + }; +} + +/** Rebuild all edges originating in the single (just-parsed) target file. */ +function rebuildEdgesForTargetFile( + db: BetterSqlite3Database, + stmts: IncrementalStmts, + relPath: string, + symbols: ExtractorOutput, + fileNodeRow: { id: number }, + rootDir: string, +): number { + const aliases: PathAliases = { baseUrl: null, paths: {} }; + let edgesAdded = buildContainmentEdges(db, stmts, relPath, symbols); + edgesAdded += rebuildDirContainment(db, stmts, relPath); + edgesAdded += buildImportEdges(stmts, relPath, symbols, rootDir, fileNodeRow.id, aliases, db); + const importedNames = buildImportedNamesMap(symbols, rootDir, relPath, aliases); + edgesAdded += buildCallEdges(stmts, relPath, symbols, fileNodeRow, importedNames); + return edgesAdded; +} + +/** + * Re-parse the reverse-deps and delete their outgoing edges so the cascade + * can rebuild them. + */ +async function parseReverseDeps( + db: BetterSqlite3Database, + rootDir: string, + reverseDeps: string[], + engineOpts: EngineOpts, + cache: unknown, +): Promise> { + const depSymbols = new Map(); + for (const depRelPath of reverseDeps) { + const symbols_ = await parseReverseDep(rootDir, depRelPath, engineOpts, cache); + if (symbols_) { + deleteOutgoingEdges(db, depRelPath); + depSymbols.set(depRelPath, symbols_); + } + } + return depSymbols; +} + +/** + * Pass 2 of the reverse-dep cascade: now that the changed file's `reexports` + * edges exist, resolve barrel imports for every reverse-dep so transitive + * call edges through the barrel still find their targets. + */ +function emitBarrelImportEdgesForReverseDeps( + db: BetterSqlite3Database, + stmts: IncrementalStmts, + depSymbols: Map, + rootDir: string, +): number { + let edgesAdded = 0; + for (const [depRelPath, symbols_] of depSymbols) { + const fileNodeRow_ = stmts.getNodeId.get(depRelPath, 'file', depRelPath, 0); + if (!fileNodeRow_) continue; + const aliases_: PathAliases = { baseUrl: null, paths: {} }; + for (const imp of symbols_.imports) { + if (imp.reexport) continue; + const resolvedPath = resolveImportPath( + path.join(rootDir, depRelPath), + imp.source, + rootDir, + aliases_, + ); + edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeRow_.id, resolvedPath, imp); + } + } + return edgesAdded; +} + +/** + * Two-pass reverse-dep cascade: + * 1. Rebuild direct edges (creating `reexports` edges for barrels). + * 2. Add barrel import edges (which need `reexports` edges to exist). + */ +async function runReverseDepCascade( + db: BetterSqlite3Database, + rootDir: string, + reverseDeps: string[], + stmts: IncrementalStmts, + engineOpts: EngineOpts, + cache: unknown, +): Promise { + const depSymbols = await parseReverseDeps(db, rootDir, reverseDeps, engineOpts, cache); + + let edgesAdded = 0; + // Pass 1: direct edges only (no barrel resolution) — creates reexports edges + for (const [depRelPath, symbols_] of depSymbols) { + edgesAdded += rebuildReverseDepEdges(db, rootDir, depRelPath, symbols_, stmts, true); + } + // Pass 2: add barrel import edges (reexports edges now exist) + edgesAdded += emitBarrelImportEdgesForReverseDeps(db, stmts, depSymbols, rootDir); + return edgesAdded; +} + /** * Parse a single file and update the database incrementally. */ @@ -519,18 +655,7 @@ export async function rebuildFile( if (!fs.existsSync(filePath)) { if (cache) (cache as { remove(p: string): void }).remove(filePath); - const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, []) : null; - return { - file: relPath, - nodesAdded: 0, - nodesRemoved: oldNodes, - edgesAdded: 0, - deleted: true, - event: 'deleted', - symbolDiff, - nodesBefore: oldNodes, - nodesAfter: 0, - }; + return buildDeletionResult(relPath, oldNodes, oldSymbols, diffSymbols); } let code: string; @@ -553,45 +678,8 @@ export async function rebuildFile( if (!fileNodeRow) return { file: relPath, nodesAdded: newNodes, nodesRemoved: oldNodes, edgesAdded: 0 }; - const aliases: PathAliases = { baseUrl: null, paths: {} }; - - let edgesAdded = buildContainmentEdges(db, stmts, relPath, symbols); - edgesAdded += rebuildDirContainment(db, stmts, relPath); - edgesAdded += buildImportEdges(stmts, relPath, symbols, rootDir, fileNodeRow.id, aliases, db); - const importedNames = buildImportedNamesMap(symbols, rootDir, relPath, aliases); - edgesAdded += buildCallEdges(stmts, relPath, symbols, fileNodeRow, importedNames); - - // Cascade: rebuild outgoing edges for reverse-dep files. - // Two-pass approach: first rebuild direct edges (creating reexports edges for barrels), - // then add barrel import edges (which need reexports edges to exist for resolution). - const depSymbols = new Map(); - for (const depRelPath of reverseDeps) { - const symbols_ = await parseReverseDep(rootDir, depRelPath, engineOpts, cache); - if (symbols_) { - deleteOutgoingEdges(db, depRelPath); - depSymbols.set(depRelPath, symbols_); - } - } - // Pass 1: direct edges only (no barrel resolution) — creates reexports edges - for (const [depRelPath, symbols_] of depSymbols) { - edgesAdded += rebuildReverseDepEdges(db, rootDir, depRelPath, symbols_, stmts, true); - } - // Pass 2: add barrel import edges (reexports edges now exist) - for (const [depRelPath, symbols_] of depSymbols) { - const fileNodeRow_ = stmts.getNodeId.get(depRelPath, 'file', depRelPath, 0); - if (!fileNodeRow_) continue; - const aliases_: PathAliases = { baseUrl: null, paths: {} }; - for (const imp of symbols_.imports) { - if (imp.reexport) continue; - const resolvedPath = resolveImportPath( - path.join(rootDir, depRelPath), - imp.source, - rootDir, - aliases_, - ); - edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeRow_.id, resolvedPath, imp); - } - } + let edgesAdded = rebuildEdgesForTargetFile(db, stmts, relPath, symbols, fileNodeRow, rootDir); + edgesAdded += await runReverseDepCascade(db, rootDir, reverseDeps, stmts, engineOpts, cache); const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, newSymbols) : null; const event = oldNodes === 0 ? 'added' : 'modified'; diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index b18d3c473..ff4ee5e5d 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -8,52 +8,24 @@ import fs from 'node:fs'; import path from 'node:path'; import { performance } from 'node:perf_hooks'; import { - acquireAdvisoryLock, closeDb, closeDbPair, getBuildMeta, initSchema, MIGRATIONS, openDb, - purgeFilesData, - releaseAdvisoryLock, - setBuildMeta, } from '../../../db/index.js'; import { detectWorkspaces, loadConfig } from '../../../infrastructure/config.js'; import { debug, info, warn } from '../../../infrastructure/logger.js'; import { loadNative } from '../../../infrastructure/native.js'; -import { semverCompare } from '../../../infrastructure/update-check.js'; -import { normalizePath } from '../../../shared/constants.js'; import { toErrorMessage } from '../../../shared/errors.js'; import { CODEGRAPH_VERSION } from '../../../shared/version.js'; -import type { - BetterSqlite3Database, - BuildGraphOpts, - BuildResult, - Definition, - ExtractorOutput, - SqliteStatement, -} from '../../../types.js'; -import { - classifyNativeDrops, - formatDropExtensionSummary, - getActiveEngine, - getInstalledWasmExtensions, - NATIVE_SUPPORTED_EXTENSIONS, - parseFilesWasmForBackfill, -} from '../../parser.js'; +import type { BuildGraphOpts, BuildResult } from '../../../types.js'; +import { getActiveEngine } from '../../parser.js'; import { writeJournalHeader } from '../journal.js'; import { setWorkspaces } from '../resolve.js'; import { PipelineContext } from './context.js'; -import { - batchInsertNodes, - collectFiles as collectFilesUtil, - fileHash, - fileStat, - loadPathAliases, - readFileSafe, -} from './helpers.js'; -import { NativeDbProxy } from './native-db-proxy.js'; +import { loadPathAliases } from './helpers.js'; import { buildEdges } from './stages/build-edges.js'; import { buildStructure } from './stages/build-structure.js'; // Pipeline stages @@ -61,10 +33,24 @@ import { collectFiles } from './stages/collect-files.js'; import { detectChanges, detectNoChanges } from './stages/detect-changes.js'; import { finalize } from './stages/finalize.js'; import { insertNodes } from './stages/insert-nodes.js'; +import { + closeNativeDb, + refreshJsDb, + reopenNativeDb, + suspendNativeDb, +} from './stages/native-db-lifecycle.js'; +import { tryNativeOrchestrator } from './stages/native-orchestrator.js'; import { parseFiles } from './stages/parse-files.js'; import { resolveImports } from './stages/resolve-imports.js'; import { runAnalyses } from './stages/run-analyses.js'; +// Re-export computeWasmOnlyStaleFiles for backward compatibility with tests +// that import from this module path (#1073 unit tests). +export { + computeWasmOnlyStaleFiles, + type WasmOnlyStaleFilesInput, +} from './stages/native-orchestrator.js'; + // ── Setup helpers ─────────────────────────────────────────────────────── function initializeEngine(ctx: PipelineContext): void { @@ -237,932 +223,8 @@ function formatTimingResult(ctx: PipelineContext): BuildResult { }; } -// ── NativeDb lifecycle helpers ────────────────────────────────────────── - -/** Checkpoint WAL through rusqlite and close the native connection. */ -function closeNativeDb(ctx: PipelineContext, label: string): void { - if (!ctx.nativeDb) return; - try { - ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); - } catch (e) { - debug(`${label} WAL checkpoint failed: ${toErrorMessage(e)}`); - } - try { - ctx.nativeDb.close(); - } catch (e) { - debug(`${label} nativeDb close failed: ${toErrorMessage(e)}`); - } - ctx.nativeDb = undefined; -} - -/** Try to reopen the native connection for a given pipeline phase. */ -function reopenNativeDb(ctx: PipelineContext, label: string): void { - if ((ctx.opts.engine ?? 'auto') === 'wasm') return; - const native = loadNative(); - if (!native?.NativeDatabase) return; - try { - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - } catch (e) { - debug(`reopen nativeDb for ${label} failed: ${toErrorMessage(e)}`); - ctx.nativeDb = undefined; - } -} - -/** Close nativeDb and clear stale references in engineOpts. */ -function suspendNativeDb(ctx: PipelineContext, label: string): void { - closeNativeDb(ctx, label); - if (ctx.engineOpts?.nativeDb) { - ctx.engineOpts.nativeDb = undefined; - } -} - -/** - * After native writes, reopen the JS db connection to get a fresh page cache. - * Rusqlite WAL truncation invalidates better-sqlite3's internal WAL index, - * causing SQLITE_CORRUPT on the next read (#715, #736). - */ -function refreshJsDb(ctx: PipelineContext): void { - try { - ctx.db.close(); - } catch (e) { - debug(`refreshJsDb close failed: ${toErrorMessage(e)}`); - } - ctx.db = openDb(ctx.dbPath); -} - -// ── Native orchestrator types ────────────────────────────────────────── - -interface NativeOrchestratorResult { - phases: Record; - earlyExit?: boolean; - nodeCount?: number; - edgeCount?: number; - fileCount?: number; - changedFiles?: string[]; - changedCount?: number; - removedCount?: number; - isFullBuild?: boolean; - /** Whether the Rust pipeline handled the structure phase (small-incremental fast path). */ - structureHandled?: boolean; - /** Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to DB. */ - analysisComplete?: boolean; -} - -// ── Native orchestrator helpers ─────────────────────────────────────── - -/** Determine whether the native orchestrator should be skipped. Returns a reason string, or null if it should run. */ -function shouldSkipNativeOrchestrator(ctx: PipelineContext): string | null { - if (ctx.forceFullRebuild) return 'forceFullRebuild'; - // v3.9.0 addon had buggy incremental purge (wrong SQL on analysis tables, - // scoped removal over-detection). Fixed in v3.9.1 by PR #865. Gate on - // < 3.9.1 so v3.9.1+ uses the fast Rust orchestrator path. - const orchestratorBuggy = !!ctx.engineVersion && semverCompare(ctx.engineVersion, '3.9.1') < 0; - if (orchestratorBuggy) return `buggy addon ${ctx.engineVersion}`; - if (ctx.engineName !== 'native') return `engine=${ctx.engineName}`; - return null; -} - -/** Checkpoint WAL through rusqlite, close nativeDb, and reopen better-sqlite3. - * Returns false if the DB reopen fails (caller should return partial result). */ -function handoffWalAfterNativeBuild(ctx: PipelineContext): boolean { - closeNativeDb(ctx, 'post-native-build'); - try { - ctx.db.close(); - } catch (e) { - debug(`handoffWal JS db close failed: ${toErrorMessage(e)}`); - } - try { - ctx.db = openDb(ctx.dbPath); - return true; - } catch (reopenErr) { - warn(`Failed to reopen DB after native build: ${(reopenErr as Error).message}`); - return false; - } -} - -/** - * Reconstruct fileSymbols from the DB after a native orchestrator build. - * When `scopeFiles` is provided, only loads those files (for analysis-only). - * When omitted, loads all files (needed for structure rebuilds). - */ -function reconstructFileSymbolsFromDb( - ctx: PipelineContext, - scopeFiles?: string[], -): Map { - let query = - 'SELECT file, name, kind, line, end_line as endLine FROM nodes WHERE file IS NOT NULL'; - const params: string[] = []; - if (scopeFiles && scopeFiles.length > 0) { - const placeholders = scopeFiles.map(() => '?').join(','); - query += ` AND file IN (${placeholders})`; - params.push(...scopeFiles); - } - query += ' ORDER BY file, line'; - - const rows = ctx.db.prepare(query).all(...params) as { - file: string; - name: string; - kind: string; - line: number; - endLine: number | null; - }[]; - - const fileSymbols = new Map(); - for (const row of rows) { - let entry = fileSymbols.get(row.file); - if (!entry) { - entry = { - definitions: [], - calls: [], - imports: [], - classes: [], - exports: [], - typeMap: new Map(), - }; - fileSymbols.set(row.file, entry); - } - entry.definitions.push({ - name: row.name, - kind: row.kind as Definition['kind'], - line: row.line, - endLine: row.endLine ?? undefined, - }); - } - - // Populate import/export counts from DB edges so buildStructure - // computes correct import_count/export_count in node_metrics. - // The extractor arrays aren't persisted to the DB, so we derive - // counts from edge data instead (#804). - const importCountRows = ctx.db - .prepare( - `SELECT n.file, COUNT(*) AS cnt - FROM edges e JOIN nodes n ON e.source_id = n.id - WHERE e.kind IN ('imports', 'imports-type', 'dynamic-imports') - AND n.file IS NOT NULL - GROUP BY n.file`, - ) - .all() as { file: string; cnt: number }[]; - for (const row of importCountRows) { - const entry = fileSymbols.get(row.file); - if (entry) entry.imports = new Array(row.cnt) as ExtractorOutput['imports']; - } - - const exportCountRows = ctx.db - .prepare( - `SELECT n_tgt.file, COUNT(DISTINCT n_tgt.id) AS cnt - FROM edges e - JOIN nodes n_tgt ON e.target_id = n_tgt.id - JOIN nodes n_src ON e.source_id = n_src.id - WHERE e.kind IN ('imports', 'imports-type', 'reexports') - AND n_tgt.file IS NOT NULL - AND n_src.file != n_tgt.file - GROUP BY n_tgt.file`, - ) - .all() as { file: string; cnt: number }[]; - for (const row of exportCountRows) { - const entry = fileSymbols.get(row.file); - if (entry) entry.exports = new Array(row.cnt) as ExtractorOutput['exports']; - } - - return fileSymbols; -} - -/** - * Run JS buildStructure() after native orchestrator to fill directory nodes + contains edges. - * For full builds, passes changedFiles=null (full rebuild). - * For incremental builds, passes the changed file list to scope the update. - */ -async function runPostNativeStructure( - ctx: PipelineContext, - allFileSymbols: Map, - isFullBuild: boolean, - changedFiles: string[] | undefined, -): Promise { - const structureStart = performance.now(); - try { - const directories = new Set(); - for (const relPath of allFileSymbols.keys()) { - const parts = relPath.split('/'); - for (let i = 1; i < parts.length; i++) { - directories.add(parts.slice(0, i).join('/')); - } - } - - const lineCountMap = new Map(); - const cachedLineCounts = ctx.db - .prepare( - `SELECT n.name AS file, m.line_count - FROM node_metrics m JOIN nodes n ON m.node_id = n.id - WHERE n.kind = 'file'`, - ) - .all() as Array<{ file: string; line_count: number }>; - for (const row of cachedLineCounts) { - lineCountMap.set(row.file, row.line_count); - } - - // Full builds need null (rebuild everything). Incremental builds pass the - // changed file list so buildStructure only updates those files' metrics - // and contains edges — matching the JS pipeline's medium-incremental path. - const changedFilePaths = isFullBuild || !changedFiles?.length ? null : changedFiles; - const { buildStructure: buildStructureFn } = (await import( - '../../../features/structure.js' - )) as { - buildStructure: ( - db: typeof ctx.db, - fileSymbols: Map, - rootDir: string, - lineCountMap: Map, - directories: Set, - changedFiles: string[] | null, - ) => void; - }; - buildStructureFn( - ctx.db, - allFileSymbols, - ctx.rootDir, - lineCountMap, - directories, - changedFilePaths, - ); - debug( - `Structure phase completed after native orchestrator${changedFilePaths ? ` (${changedFilePaths.length} files)` : ' (full)'}`, - ); - } catch (err) { - warn(`Structure phase failed after native build: ${toErrorMessage(err)}`); - } - return performance.now() - structureStart; -} - -/** - * JS fallback for AST/complexity/CFG/dataflow analysis after native orchestrator. - * Used when the Rust addon doesn't include analysis persistence (older addon - * version) or when analysis failed on the Rust side. - */ -async function runPostNativeAnalysis( - ctx: PipelineContext, - allFileSymbols: Map, - changedFiles: string[] | undefined, -): Promise<{ astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }> { - const timing = { astMs: 0, complexityMs: 0, cfgMs: 0, dataflowMs: 0 }; - - // Scope analysis fileSymbols to changed files only - let analysisFileSymbols: Map; - if (changedFiles && changedFiles.length > 0) { - analysisFileSymbols = new Map(); - for (const f of changedFiles) { - const entry = allFileSymbols.get(f); - if (entry) analysisFileSymbols.set(f, entry); - } - } else { - analysisFileSymbols = allFileSymbols; - } - - // Reopen nativeDb for analysis features (suspend/resume WAL pattern). - const native = loadNative(); - if (native?.NativeDatabase) { - try { - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - if (ctx.engineOpts) ctx.engineOpts.nativeDb = ctx.nativeDb; - } catch { - ctx.nativeDb = undefined; - if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; - } - } - - // Flush JS WAL pages once so Rust can see them, then no-op callbacks. - // Previously each feature called wal_checkpoint(TRUNCATE) individually - // (~68ms each × 3-4 features). One FULL checkpoint suffices. - if (ctx.nativeDb && ctx.engineOpts) { - ctx.db.pragma('wal_checkpoint(FULL)'); - ctx.engineOpts.suspendJsDb = () => {}; - ctx.engineOpts.resumeJsDb = () => {}; - } - - try { - const { runAnalyses: runAnalysesFn } = (await import('../../../ast-analysis/engine.js')) as { - runAnalyses: ( - db: BetterSqlite3Database, - fileSymbols: Map, - rootDir: string, - opts: Record, - engineOpts?: Record, - ) => Promise<{ astMs?: number; complexityMs?: number; cfgMs?: number; dataflowMs?: number }>; - }; - const result = await runAnalysesFn( - ctx.db, - analysisFileSymbols, - ctx.rootDir, - ctx.opts as Record, - ctx.engineOpts as unknown as Record | undefined, - ); - timing.astMs = result.astMs ?? 0; - timing.complexityMs = result.complexityMs ?? 0; - timing.cfgMs = result.cfgMs ?? 0; - timing.dataflowMs = result.dataflowMs ?? 0; - } catch (err) { - warn(`Analysis phases failed after native build: ${toErrorMessage(err)}`); - } - - // Close nativeDb after analyses — TRUNCATE checkpoint flushes all Rust - // WAL writes so JS and external readers can see them. Runs once after - // all analysis features complete (not per-feature). - if (ctx.nativeDb) { - try { - ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); - } catch { - /* ignore checkpoint errors */ - } - try { - ctx.nativeDb.close(); - } catch { - /* ignore close errors */ - } - ctx.nativeDb = undefined; - if (ctx.engineOpts) { - ctx.engineOpts.nativeDb = undefined; - ctx.engineOpts.suspendJsDb = undefined; - ctx.engineOpts.resumeJsDb = undefined; - } - } - - return timing; -} - -/** Format timing result from native orchestrator phases + JS post-processing. */ -function formatNativeTimingResult( - p: Record, - structurePatchMs: number, - analysisTiming: { astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }, -): BuildResult { - return { - phases: { - setupMs: +(p.setupMs ?? 0).toFixed(1), - collectMs: +(p.collectMs ?? 0).toFixed(1), - detectMs: +(p.detectMs ?? 0).toFixed(1), - parseMs: +(p.parseMs ?? 0).toFixed(1), - insertMs: +(p.insertMs ?? 0).toFixed(1), - resolveMs: +(p.resolveMs ?? 0).toFixed(1), - edgesMs: +(p.edgesMs ?? 0).toFixed(1), - structureMs: +((p.structureMs ?? 0) + structurePatchMs).toFixed(1), - rolesMs: +(p.rolesMs ?? 0).toFixed(1), - astMs: +(analysisTiming.astMs ?? 0).toFixed(1), - complexityMs: +(analysisTiming.complexityMs ?? 0).toFixed(1), - cfgMs: +(analysisTiming.cfgMs ?? 0).toFixed(1), - dataflowMs: +(analysisTiming.dataflowMs ?? 0).toFixed(1), - finalizeMs: +(p.finalizeMs ?? 0).toFixed(1), - }, - }; -} - -/** Try the native build orchestrator. Returns a BuildResult on success, undefined to fall through to JS pipeline. */ -async function tryNativeOrchestrator( - ctx: PipelineContext, -): Promise { - const skipReason = shouldSkipNativeOrchestrator(ctx); - if (skipReason) { - debug(`Skipping native orchestrator: ${skipReason}`); - return undefined; - } - - // Open NativeDatabase on demand — deferred from setupPipeline to skip the - // ~60ms cost on no-op/early-exit builds. Close the better-sqlite3 connection - // first to avoid dual-connection WAL corruption. - if (!ctx.nativeDb && ctx.nativeAvailable) { - const native = loadNative(); - if (native?.NativeDatabase) { - try { - // Close better-sqlite3 before opening rusqlite to avoid WAL conflicts. - // Uses raw close() instead of closeDb() intentionally — the advisory lock - // is kept and transferred to the NativeDbProxy below, not released here. - ctx.db.close(); - acquireAdvisoryLock(ctx.dbPath); - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - ctx.nativeDb.initSchema(); - // Replace ctx.db with a NativeDbProxy so post-native JS fallback - // (structure, analysis) can use it without reopening better-sqlite3. - const proxy = new NativeDbProxy(ctx.nativeDb); - proxy.__lockPath = `${ctx.dbPath}.lock`; - ctx.db = proxy as unknown as typeof ctx.db; - ctx.nativeFirstProxy = true; - } catch (err) { - warn(`NativeDatabase setup failed, falling back to JS: ${toErrorMessage(err)}`); - try { - ctx.nativeDb?.close(); - } catch (e) { - debug(`tryNativeOrchestrator: close failed during fallback: ${toErrorMessage(e)}`); - } - ctx.nativeDb = undefined; - ctx.nativeFirstProxy = false; // defensive: reset in case future refactors move the assignment above throwing lines - releaseAdvisoryLock(`${ctx.dbPath}.lock`); - // Reopen better-sqlite3 for JS pipeline fallback - ctx.db = openDb(ctx.dbPath); - } - } - } - - if (!ctx.nativeDb?.buildGraph) return undefined; - - const resultJson = ctx.nativeDb.buildGraph( - ctx.rootDir, - JSON.stringify(ctx.config), - JSON.stringify(ctx.aliases), - JSON.stringify(ctx.opts), - ); - const result = JSON.parse(resultJson) as NativeOrchestratorResult; - - if (result.earlyExit) { - info('No changes detected'); - // Even on no-op rebuilds, dropped-language files added since the last - // full build are still missing from `nodes`/`file_hashes` (#1083), and - // WASM-only files deleted from disk leave stale rows behind (#1073). - // The orchestrator's file_collector skipped them, so its earlyExit - // doesn't imply DB consistency. Run the gap repair before returning. - const gap = detectDroppedLanguageGap(ctx); - if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) { - await backfillNativeDroppedFiles(ctx, gap); - } - closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); - return 'early-exit'; - } - - // Log incremental status to match JS pipeline output - const changed = result.changedCount ?? 0; - const removed = result.removedCount ?? 0; - if (!result.isFullBuild && (changed > 0 || removed > 0)) { - info(`Incremental: ${changed} changed, ${removed} removed`); - } - - const p = result.phases; - - // Sync build_meta so JS-side version/engine checks work on next build. - // Use the binary's CARGO_PKG_VERSION (ctx.nativeBinaryVersion), not the - // platform package.json version (ctx.engineVersion). The Rust side's - // check_version_mismatch compares against CARGO_PKG_VERSION; writing - // the package.json value would create a permanent mismatch whenever - // the binary and platform package.json diverge — e.g., CI hot-swap - // via ci-install-native.mjs (#1066) — forcing every subsequent build - // to be a full rebuild. - // - // When the native addon doesn't expose engineVersion() (older addon), - // fall back to CODEGRAPH_VERSION — same fallback used by both - // checkEngineSchemaMismatch (read path) and persistBuildMetadata - // (the JS-pipeline write path in finalize.ts). Using ctx.engineVersion - // here would re-introduce the asymmetry this PR fixes for that case. - const nativeVersionForMeta = ctx.nativeBinaryVersion || CODEGRAPH_VERSION; - setBuildMeta(ctx.db, { - engine: ctx.engineName, - engine_version: nativeVersionForMeta, - codegraph_version: nativeVersionForMeta, - schema_version: String(ctx.schemaVersion), - built_at: new Date().toISOString(), - }); - - info( - `Native build orchestrator completed: ${result.nodeCount ?? 0} nodes, ${result.edgeCount ?? 0} edges, ${result.fileCount ?? 0} files`, - ); - - // ── Post-native structure + analysis ────────────────────────────── - let analysisTiming = { - astMs: +(p.astMs ?? 0), - complexityMs: +(p.complexityMs ?? 0), - cfgMs: +(p.cfgMs ?? 0), - dataflowMs: +(p.dataflowMs ?? 0), - }; - let structurePatchMs = 0; - // Skip JS structure when the Rust pipeline's small-incremental fast path - // already handled it. For full builds and large incrementals where Rust - // skipped structure, we must run the JS fallback. - const needsStructure = !result.structureHandled; - // When the Rust addon doesn't include analysis persistence (older addon - // version or analysis failed), fall back to JS-side analysis. - const needsAnalysisFallback = - !result.analysisComplete && - (ctx.opts.ast !== false || - ctx.opts.complexity !== false || - ctx.opts.cfg !== false || - ctx.opts.dataflow !== false); - - if (needsStructure || needsAnalysisFallback) { - // When analysis fallback is needed, handoff to better-sqlite3 — the - // analysis engine uses the suspend/resume WAL pattern that requires a - // real better-sqlite3 connection, not the NativeDbProxy. - if (needsAnalysisFallback && ctx.nativeFirstProxy) { - closeNativeDb(ctx, 'pre-analysis-fallback'); - ctx.db = openDb(ctx.dbPath); - ctx.nativeFirstProxy = false; - } else if (!ctx.nativeFirstProxy && !handoffWalAfterNativeBuild(ctx)) { - // DB reopen failed — return partial result - return formatNativeTimingResult(p, 0, analysisTiming); - } - - const fileSymbols = reconstructFileSymbolsFromDb(ctx); - - if (needsStructure) { - structurePatchMs = await runPostNativeStructure( - ctx, - fileSymbols, - !!result.isFullBuild, - result.changedFiles, - ); - } - - if (needsAnalysisFallback) { - analysisTiming = await runPostNativeAnalysis(ctx, fileSymbols, result.changedFiles); - } - } - - // Engine parity: the native orchestrator silently drops files whose - // Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on - // stale native binaries). WASM handles those — backfill via WASM so both - // engines process the same file set (#967). - // - // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for - // both gating and the backfill itself. On dirty incrementals/full builds - // the orchestrator signals trigger backfill, so the walk happens once - // (instead of redundantly inside backfill). On quiet incrementals we - // still pay the walk so we can detect brand-new files in dropped-language - // extensions — a gap that the orchestrator's `detect_removed_files` - // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap - // because the expensive part (WASM re-parse of the missing set) is - // gated below. - const removedCount = result.removedCount ?? 0; - const changedCount = result.changedCount ?? 0; - const gap = detectDroppedLanguageGap(ctx); - if ( - result.isFullBuild || - removedCount > 0 || - changedCount > 0 || - gap.missingAbs.length > 0 || - gap.staleRel.length > 0 - ) { - await backfillNativeDroppedFiles(ctx, gap); - } - - closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); - return formatNativeTimingResult(p, structurePatchMs, analysisTiming); -} - -/** Files the native orchestrator silently dropped — the working set for backfill. */ -interface DroppedLanguageGap { - /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ - missingRel: string[]; - /** Absolute paths, aligned by index with `missingRel`. */ - missingAbs: string[]; - /** - * Relative paths of WASM-only files present in DB but absent from disk (#1073). - * Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side - * backfill must purge them. Always disjoint from `missingRel`. - */ - staleRel: string[]; -} - -/** - * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper - * is pure and unit-testable independently of `getInstalledWasmExtensions` and - * the `NATIVE_SUPPORTED_EXTENSIONS` global state. - */ -export interface WasmOnlyStaleFilesInput { - /** Distinct `file` values from the `nodes` table. */ - existingNodes: ReadonlySet; - /** Distinct `file` values from the `file_hashes` table. */ - existingHashes: ReadonlySet; - /** Relative paths currently on disk (from `collectFilesUtil`). */ - expected: ReadonlySet; - /** Lowercased extensions whose WASM grammar is installed. */ - installedExts: ReadonlySet; - /** Extensions covered by the Rust addon — Rust owns deletion for these. */ - nativeSupported: ReadonlySet; -} - -/** - * Compute the WASM-only files present in the DB but missing from disk (#1073). - * - * Returns relative paths that: - * - appear in `existingNodes` or `existingHashes` (in DB), - * - are absent from `expected` (not on disk), - * - have an extension installed for WASM, AND - * - have an extension NOT covered by `nativeSupported` — Rust's - * `purge_changed_files` handles deletion for natively-supported extensions - * via its own `detect_removed_files`, so the caller must not double-purge. - * - * Extensions are lowercased before lookup to match the registry and Rust's - * `LanguageKind::from_extension` (which normalises case for the languages - * where both cases are conventional, e.g. R's `.r` / `.R`). - * - * DB paths are forced to forward slashes before comparison with `expected` - * (which is always normalised). The on-disk invariant is that DB rows are - * written with forward slashes, but a stale row written by older code on - * Windows could carry back-slashes — normalising here makes the comparison - * platform-safe and prevents false-positive purges of live rows. We replace - * `\\` explicitly (rather than calling `normalizePath`, which only touches - * `path.sep`) so the defence works when running on POSIX against a DB that - * was migrated from Windows. - * - * Exported for unit testing. - */ -export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { - const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; - const stale: string[] = []; - const seen = new Set(); - const consider = (rawRel: string): void => { - const rel = rawRel.replace(/\\/g, '/'); - if (expected.has(rel) || seen.has(rel)) return; - const ext = path.extname(rel).toLowerCase(); - if (nativeSupported.has(ext)) return; - if (!installedExts.has(ext)) return; - seen.add(rel); - // Push the ORIGINAL raw path (not the normalised form) so the eventual - // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` - // matches the actual stored row. The dedup `seen` set keeps the - // normalised form so a file written once with `\` and once with `/` - // is still treated as one entry — but the value the SQL sees has to - // be byte-identical to what's on disk in the DB. - stale.push(rawRel); - }; - for (const rel of existingNodes) consider(rel); - for (const rel of existingHashes) consider(rel); - return stale; -} - -/** - * Group relative paths by their lowercased extension. Shape matches the bucket - * type that `formatDropExtensionSummary` consumes, so callers can render a - * log-friendly per-extension summary without going through `classifyNativeDrops` - * when the reason is already known (e.g. the stale-purge path where every path - * is guaranteed `unsupported-by-native`). - */ -function groupByExtension(relPaths: Iterable): Map { - const buckets = new Map(); - for (const rel of relPaths) { - const ext = path.extname(rel).toLowerCase(); - let list = buckets.get(ext); - if (!list) { - list = []; - buckets.set(ext, list); - } - list.push(rel); - } - return buckets; -} - -/** - * Detect files the native orchestrator silently dropped. - * - * Walks the filesystem and compares against `nodes` + `file_hashes`. A file - * is "missing" if it's absent from EITHER table — both must be present for - * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. - * legacy DBs where `nodes` was populated but `file_hashes` was not). - * - * Restricted to files with an installed WASM grammar; extensions in - * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal - * installs) can't be parsed by either engine, so they're not a native - * regression — excluding them keeps the warn count in - * `backfillNativeDroppedFiles` meaningful. - * - * Also detects WASM-only files deleted from disk (#1073). Rust's - * `detect_removed_files` filter (#1070) skips files outside its supported - * extensions, so deletions of WASM-only languages don't reach the native - * purge path; the rest of the backfill only inserts rows, so without this - * step stale `nodes`/`file_hashes` rows would linger across incremental - * rebuilds until the next full rebuild. - * - * Cheap (no DB handoff, no parsing): used both to gate the backfill call - * and as its working set. NativeDbProxy supports `.prepare().all()`, so - * this works whether `ctx.db` is a proxy or a real better-sqlite3 - * connection — letting us skip the close-native / reopen-better-sqlite3 - * cost when there's nothing to backfill. - */ -function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { - const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); - const expected = new Set( - collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), - ); - - const existingNodeRows = ctx.db - .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") - .all() as Array<{ file: string }>; - const existingNodes = new Set(existingNodeRows.map((r) => r.file)); - - let existingHashes = new Set(); - try { - const existingHashRows = ctx.db - .prepare('SELECT DISTINCT file FROM file_hashes') - .all() as Array<{ file: string }>; - existingHashes = new Set(existingHashRows.map((r) => r.file)); - } catch (e) { - // file_hashes table may not exist on legacy DBs; treat as fully missing - // so the backfill writes rows on the upsert path below. - debug( - `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, - ); - } - - const installedExts = getInstalledWasmExtensions(); - const missingRel: string[] = []; - const missingAbs: string[] = []; - for (const rel of expected) { - if (existingNodes.has(rel) && existingHashes.has(rel)) continue; - const ext = path.extname(rel).toLowerCase(); - if (!installedExts.has(ext)) continue; - missingRel.push(rel); - missingAbs.push(path.join(ctx.rootDir, rel)); - } - - const staleRel = computeWasmOnlyStaleFiles({ - existingNodes, - existingHashes, - expected, - installedExts, - nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, - }); - - return { missingRel, missingAbs, staleRel }; -} - -/** - * Backfill files that the native orchestrator silently dropped during parse. - * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). - * - * Also purges stale rows for WASM-only files deleted from disk (#1073), which - * Rust's `detect_removed_files` filter (#1070) skips. - * - * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller - * can use the same scan for both gating and the actual backfill — avoiding - * a redundant fs walk when the orchestrator's signals already triggered. - */ -async function backfillNativeDroppedFiles( - ctx: PipelineContext, - gap: DroppedLanguageGap, -): Promise { - const { missingRel, missingAbs, staleRel } = gap; - if (missingAbs.length === 0 && staleRel.length === 0) return; - - // Now that we know there's work to do, hand off to better-sqlite3 (needed - // for the INSERT path below). - if (ctx.nativeFirstProxy) { - closeNativeDb(ctx, 'pre-parity-backfill'); - ctx.db = openDb(ctx.dbPath); - ctx.nativeFirstProxy = false; - } - - const dbConn = ctx.db as unknown as BetterSqlite3Database; - - // Purge WASM-only files that were deleted from disk (#1073). Rust's - // detect_removed_files skips them and the insert path below never visits - // them, so without this their rows would persist across rebuilds until the - // next full rebuild reset the DB. - if (staleRel.length > 0) { - // `computeWasmOnlyStaleFiles` guarantees every path here has an extension - // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would - // always bucket 100% into `unsupported-by-native`. Build the extension - // summary directly to avoid a redundant classification pass. - const staleByExt = groupByExtension(staleRel); - info( - `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`, - ); - purgeFilesData(dbConn, staleRel); - } - - if (missingAbs.length === 0) return; - - // Classify drops so users see per-extension reasons instead of just a count - // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust - // extractor); `native-extractor-failure` indicates a real native bug since - // the language IS supported by the addon yet the file was dropped anyway. - const { byReason, totals } = classifyNativeDrops(missingRel); - if (totals['unsupported-by-native'] > 0) { - info( - `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`, - ); - } - if (totals['native-extractor-failure'] > 0) { - warn( - `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`, - ); - } - const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir); - - const rows: unknown[][] = []; - const exportKeys: unknown[][] = []; - for (const [relPath, symbols] of wasmResults) { - // File row — mirrors insertDefinitionsAndExports: qualified_name is null. - rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]); - for (const def of symbols.definitions ?? []) { - // Populate qualified_name/scope the same way the JS fallback does so - // downstream queries (cross-file references, "go to definition") find - // these symbols. - const dotIdx = def.name.lastIndexOf('.'); - const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null; - rows.push([ - def.name, - def.kind, - relPath, - def.line, - def.endLine ?? null, - null, - def.name, - scope, - def.visibility ?? null, - ]); - } - // Exports: insert the row (INSERT OR IGNORE — a matching definition row - // is a no-op) and queue a key for the second-pass exported=1 update, so - // queries filtering on exported=1 find backfilled symbols (#970). - for (const exp of symbols.exports ?? []) { - rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]); - exportKeys.push([exp.name, exp.kind, relPath, exp.line]); - } - } - const db = dbConn; - batchInsertNodes(db, rows); - - // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. - if (exportKeys.length > 0) { - const EXPORT_CHUNK = 500; - const exportStmtCache = new Map(); - for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) { - const end = Math.min(i + EXPORT_CHUNK, exportKeys.length); - const chunkSize = end - i; - let updateStmt = exportStmtCache.get(chunkSize); - if (!updateStmt) { - const conditions = Array.from( - { length: chunkSize }, - () => '(name = ? AND kind = ? AND file = ? AND line = ?)', - ).join(' OR '); - updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`); - exportStmtCache.set(chunkSize, updateStmt); - } - const vals: unknown[] = []; - for (let j = i; j < end; j++) { - const k = exportKeys[j] as unknown[]; - vals.push(k[0], k[1], k[2], k[3]); - } - updateStmt.run(...vals); - } - } - - // Persist file_hashes rows for every backfilled file. The Rust orchestrator - // only hashes files it parsed itself, so without this step files in - // optional-language extensions (e.g. .clj when no Rust extractor exists) - // would be missing from `file_hashes` — permanently breaking the JS-side - // fast-skip pre-flight (#1054), which rejects on `collected file missing - // from file_hashes` and forces every no-op rebuild back through the full - // ~2s native pipeline (#1068). - // - // Iterates `missingRel` (every collected file the Rust orchestrator - // dropped), not `wasmResults`, so files that produced zero symbols still - // get a row. - try { - const upsertHash = db.prepare( - 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', - ); - const writeHashes = db.transaction(() => { - for (let i = 0; i < missingRel.length; i++) { - const relPath = missingRel[i]; - const absPath = missingAbs[i]; - if (!relPath || !absPath) continue; - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`); - continue; - } - if (code === null) continue; - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, fileHash(code), mtime, size); - } - }); - writeHashes(); - } catch (e) { - debug( - `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`, - ); - } - - // Free WASM parse trees from the inline backfill path (#1058). - // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree - // backed by WASM linear memory) on every result, but these symbols are - // consumed locally for DB row construction and never added to - // `ctx.allSymbols`, so the finalize-stage `releaseWasmTrees` sweep never - // sees them. Without this, trees leak WASM memory until process exit — - // bounded per run but cumulative across in-process integration tests. - // Mirrors the cleanup discipline established for #931. - for (const [, symbols] of wasmResults) { - const tree = (symbols as { _tree?: { delete?: () => void } })._tree; - if (tree && typeof tree.delete === 'function') { - try { - tree.delete(); - } catch { - /* ignore cleanup errors */ - } - } - (symbols as { _tree?: unknown; _langId?: unknown })._tree = undefined; - (symbols as { _tree?: unknown; _langId?: unknown })._langId = undefined; - } -} +// Native db lifecycle and orchestrator helpers live in dedicated stage +// modules — see `./stages/native-db-lifecycle.ts` and `./stages/native-orchestrator.ts`. // ── Pipeline stages execution ─────────────────────────────────────────── diff --git a/src/domain/graph/builder/stages/build-edges.ts b/src/domain/graph/builder/stages/build-edges.ts index fc08160b3..9a531ed5c 100644 --- a/src/domain/graph/builder/stages/build-edges.ts +++ b/src/domain/graph/builder/stages/build-edges.ts @@ -89,12 +89,74 @@ function setupNodeLookups(ctx: PipelineContext, allNodes: QueryNodeRow[]): void // ── Import edges ──────────────────────────────────────────────────────── +/** Pick the edge kind for an import statement based on its modifiers. */ +function importEdgeKind(imp: Import): string { + if (imp.reexport) return 'reexports'; + if (imp.typeOnly) return 'imports-type'; + if (imp.dynamicImport) return 'dynamic-imports'; + return 'imports'; +} + +/** + * For a `import type` statement, emit symbol-level `imports-type` edges so + * the target symbols get fan-in credit and aren't classified as dead code. + */ +function emitTypeOnlySymbolEdges( + ctx: PipelineContext, + imp: Import, + resolvedPath: string, + fileNodeId: number, + allEdgeRows: EdgeRowTuple[], +): void { + if (!ctx.nodesByNameAndFile) return; + for (const name of imp.names) { + const cleanName = name.replace(/^\*\s+as\s+/, ''); + let targetFile = resolvedPath; + if (isBarrelFile(ctx, resolvedPath)) { + const actual = resolveBarrelExport(ctx, resolvedPath, cleanName); + if (actual) targetFile = actual; + } + const candidates = ctx.nodesByNameAndFile.get(`${cleanName}|${targetFile}`); + if (candidates && candidates.length > 0) { + allEdgeRows.push([fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0]); + } + } +} + +/** + * Process a single import statement and emit all resulting edges (file→file, + * type-only symbol-level, and barrel re-export targets). + */ +function emitEdgesForImport( + ctx: PipelineContext, + imp: Import, + fileNodeId: number, + relPath: string, + getNodeIdStmt: NodeIdStmt, + allEdgeRows: EdgeRowTuple[], +): void { + const resolvedPath = getResolved(ctx, path.join(ctx.rootDir, relPath), imp.source); + const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0); + if (!targetRow) return; + + const edgeKind = importEdgeKind(imp); + allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]); + + if (imp.typeOnly) { + emitTypeOnlySymbolEdges(ctx, imp, resolvedPath, fileNodeId, allEdgeRows); + } + + if (!imp.reexport && isBarrelFile(ctx, resolvedPath)) { + buildBarrelEdges(ctx, imp, resolvedPath, fileNodeId, edgeKind, getNodeIdStmt, allEdgeRows); + } +} + function buildImportEdges( ctx: PipelineContext, getNodeIdStmt: NodeIdStmt, allEdgeRows: EdgeRowTuple[], ): void { - const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; + const { fileSymbols, barrelOnlyFiles } = ctx; for (const [relPath, symbols] of fileSymbols) { const isBarrelOnly = barrelOnlyFiles.has(relPath); @@ -105,40 +167,7 @@ function buildImportEdges( for (const imp of symbols.imports) { // Barrel-only files: only emit reexport edges, skip regular imports if (isBarrelOnly && !imp.reexport) continue; - - const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source); - const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0); - if (!targetRow) continue; - - const edgeKind = imp.reexport - ? 'reexports' - : imp.typeOnly - ? 'imports-type' - : imp.dynamicImport - ? 'dynamic-imports' - : 'imports'; - allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]); - - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. - if (imp.typeOnly && ctx.nodesByNameAndFile) { - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - let targetFile = resolvedPath; - if (isBarrelFile(ctx, resolvedPath)) { - const actual = resolveBarrelExport(ctx, resolvedPath, cleanName); - if (actual) targetFile = actual; - } - const candidates = ctx.nodesByNameAndFile.get(`${cleanName}|${targetFile}`); - if (candidates && candidates.length > 0) { - allEdgeRows.push([fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0]); - } - } - } - - if (!imp.reexport && isBarrelFile(ctx, resolvedPath)) { - buildBarrelEdges(ctx, imp, resolvedPath, fileNodeId, edgeKind, getNodeIdStmt, allEdgeRows); - } + emitEdgesForImport(ctx, imp, fileNodeId, relPath, getNodeIdStmt, allEdgeRows); } } } @@ -174,83 +203,98 @@ function buildBarrelEdges( // ── Import edges (native engine) ──────────────────────────────────────── -function buildImportEdgesNative( - ctx: PipelineContext, - getNodeIdStmt: NodeIdStmt, - allEdgeRows: EdgeRowTuple[], - native: NativeAddon, -): void { - const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; +/** Native FFI input shape for a single import statement. */ +interface NativeImportInfo { + source: string; + names: string[]; + reexport: boolean; + typeOnly: boolean; + dynamicImport: boolean; + wildcardReexport: boolean; +} - // 1. Build per-file input data - const files: Array<{ - file: string; - fileNodeId: number; - isBarrelOnly: boolean; - imports: Array<{ - source: string; - names: string[]; - reexport: boolean; - typeOnly: boolean; - dynamicImport: boolean; - wildcardReexport: boolean; - }>; - definitionNames: string[]; - }> = []; - - // Collect all file node IDs we'll need (sources + targets) - const fileNodeIds: Array<{ file: string; nodeId: number }> = []; - const seenNodeFiles = new Set(); - - const addFileNodeId = (relPath: string): { id: number } | undefined => { - if (seenNodeFiles.has(relPath)) return fileNodeRowCache.get(relPath); - const row = getNodeIdStmt.get(relPath, 'file', relPath, 0); - if (row) { - seenNodeFiles.add(relPath); - fileNodeIds.push({ file: relPath, nodeId: row.id }); - fileNodeRowCache.set(relPath, row); - } - return row; +/** Native FFI input shape for a single file. */ +interface NativeFileInput { + file: string; + fileNodeId: number; + isBarrelOnly: boolean; + imports: NativeImportInfo[]; + definitionNames: string[]; +} + +/** Native FFI input shape for re-exports of a single file. */ +interface NativeReexportInput { + file: string; + reexports: Array<{ source: string; names: string[]; wildcardReexport: boolean }>; +} + +/** Lazily-resolving cache of file-node rows for the native input arrays. */ +interface FileNodeIdRegistry { + ids: Array<{ file: string; nodeId: number }>; + add(relPath: string): { id: number } | undefined; +} + +function createFileNodeIdRegistry(getNodeIdStmt: NodeIdStmt): FileNodeIdRegistry { + const ids: Array<{ file: string; nodeId: number }> = []; + const seen = new Set(); + const cache = new Map(); + return { + ids, + add(relPath: string) { + if (seen.has(relPath)) return cache.get(relPath); + const row = getNodeIdStmt.get(relPath, 'file', relPath, 0); + if (row) { + seen.add(relPath); + ids.push({ file: relPath, nodeId: row.id }); + cache.set(relPath, row); + } + return row; + }, }; - const fileNodeRowCache = new Map(); +} - // 2. Pre-resolve all imports and build resolved imports array. - // Keys use forward-slash-normalized rootDir + "/" + relPath to match the Rust - // lookup format (format!("{}/{}", root_dir.replace('\\', "/"), file)). - // On Windows, rootDir has backslashes but Rust normalizes them — the JS side - // must do the same or every resolve key lookup misses (#750). - const resolvedImports: Array<{ key: string; resolvedPath: string }> = []; +function toNativeImportInfo(imp: Import): NativeImportInfo { + return { + source: imp.source, + names: imp.names, + reexport: !!imp.reexport, + typeOnly: !!imp.typeOnly, + dynamicImport: !!imp.dynamicImport, + wildcardReexport: !!imp.wildcardReexport, + }; +} + +/** + * Pre-resolve every import for the given files, registering each resolved + * target with the registry so the native side has full node-id coverage. + * + * Resolved-import keys use forward-slash-normalized rootDir + "/" + relPath to + * match the Rust lookup format. On Windows, rootDir has backslashes but Rust + * normalizes them — the JS side must do the same or every key lookup misses + * (#750). + */ +function buildNativeFileInputs( + ctx: PipelineContext, + registry: FileNodeIdRegistry, +): { + files: NativeFileInput[]; + resolvedImports: Array<{ key: string; resolvedPath: string }>; +} { + const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; const fwdRootDir = rootDir.replace(/\\/g, '/'); + const files: NativeFileInput[] = []; + const resolvedImports: Array<{ key: string; resolvedPath: string }> = []; for (const [relPath, symbols] of fileSymbols) { - const fileNodeRow = addFileNodeId(relPath); + const fileNodeRow = registry.add(relPath); if (!fileNodeRow) continue; - const importInfos: Array<{ - source: string; - names: string[]; - reexport: boolean; - typeOnly: boolean; - dynamicImport: boolean; - wildcardReexport: boolean; - }> = []; - + const importInfos: NativeImportInfo[] = []; for (const imp of symbols.imports) { - // Pre-resolve and register target file node const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source); - addFileNodeId(resolvedPath); - - // Key matches Rust's format!("{}/{}", root_dir.replace('\\', "/"), file_input.file) + registry.add(resolvedPath); resolvedImports.push({ key: `${fwdRootDir}/${relPath}|${imp.source}`, resolvedPath }); - - importInfos.push({ - source: imp.source, - names: imp.names, - reexport: !!imp.reexport, - typeOnly: !!imp.typeOnly, - dynamicImport: !!imp.dynamicImport, - wildcardReexport: !!imp.wildcardReexport, - }); + importInfos.push(toNativeImportInfo(imp)); } files.push({ @@ -261,61 +305,75 @@ function buildImportEdgesNative( definitionNames: symbols.definitions.map((d) => d.name), }); } + return { files, resolvedImports }; +} - // 4. Flatten reexportMap - const fileReexports: Array<{ - file: string; - reexports: Array<{ - source: string; - names: string[]; - wildcardReexport: boolean; - }>; - }> = []; - if (ctx.reexportMap) { - for (const [file, entries] of ctx.reexportMap) { - const reexports = ( - entries as Array<{ source: string; names: string[]; wildcardReexport: boolean }> - ).map((re) => ({ - source: re.source, - names: re.names, - wildcardReexport: !!re.wildcardReexport, - })); - fileReexports.push({ file, reexports }); +/** Flatten `ctx.reexportMap` into the array shape the native side expects. */ +function buildNativeReexports( + ctx: PipelineContext, + registry: FileNodeIdRegistry, +): NativeReexportInput[] { + const fileReexports: NativeReexportInput[] = []; + if (!ctx.reexportMap) return fileReexports; + + for (const [file, entries] of ctx.reexportMap) { + const reexports = ( + entries as Array<{ source: string; names: string[]; wildcardReexport: boolean }> + ).map((re) => ({ + source: re.source, + names: re.names, + wildcardReexport: !!re.wildcardReexport, + })); + fileReexports.push({ file, reexports }); - // Register reexport target files for node ID lookup - for (const re of reexports) { - addFileNodeId(re.source); - } + for (const re of reexports) { + registry.add(re.source); } } + return fileReexports; +} - // 5. Compute barrel file list +function collectBarrelFiles(ctx: PipelineContext): string[] { const barrelFiles: string[] = []; - for (const [relPath] of fileSymbols) { - if (isBarrelFile(ctx, relPath)) { - barrelFiles.push(relPath); - } + for (const [relPath] of ctx.fileSymbols) { + if (isBarrelFile(ctx, relPath)) barrelFiles.push(relPath); } + return barrelFiles; +} - // 6. Build symbol node entries for type-only import resolution +function collectSymbolNodes( + ctx: PipelineContext, +): Array<{ name: string; file: string; nodeId: number }> { const symbolNodes: Array<{ name: string; file: string; nodeId: number }> = []; - if (ctx.nodesByNameAndFile) { - for (const [key, nodes] of ctx.nodesByNameAndFile) { - if (nodes.length > 0) { - const [name, file] = key.split('|'); - symbolNodes.push({ name: name!, file: file!, nodeId: nodes[0]!.id }); - } - } + if (!ctx.nodesByNameAndFile) return symbolNodes; + for (const [key, nodes] of ctx.nodesByNameAndFile) { + if (nodes.length === 0) continue; + const [name, file] = key.split('|'); + symbolNodes.push({ name: name!, file: file!, nodeId: nodes[0]!.id }); } + return symbolNodes; +} + +function buildImportEdgesNative( + ctx: PipelineContext, + getNodeIdStmt: NodeIdStmt, + allEdgeRows: EdgeRowTuple[], + native: NativeAddon, +): void { + const registry = createFileNodeIdRegistry(getNodeIdStmt); + + const { files, resolvedImports } = buildNativeFileInputs(ctx, registry); + const fileReexports = buildNativeReexports(ctx, registry); + const barrelFiles = collectBarrelFiles(ctx); + const symbolNodes = collectSymbolNodes(ctx); - // 7. Call native const nativeEdges = native.buildImportEdges!( files, resolvedImports, fileReexports, - fileNodeIds, + registry.ids, barrelFiles, - rootDir, + ctx.rootDir, symbolNodes, ) as NativeEdge[]; diff --git a/src/domain/graph/builder/stages/build-structure.ts b/src/domain/graph/builder/stages/build-structure.ts index 1a59353be..144537dfe 100644 --- a/src/domain/graph/builder/stages/build-structure.ts +++ b/src/domain/graph/builder/stages/build-structure.ts @@ -11,87 +11,104 @@ import type { ExtractorOutput } from '../../../../types.js'; import type { PipelineContext } from '../context.js'; import { readFileSafe } from '../helpers.js'; -export async function buildStructure(ctx: PipelineContext): Promise { - const { db, fileSymbols, rootDir, discoveredDirs, allSymbols, isFullBuild } = ctx; - - // Build line count map (prefer cached _lineCount from parser) +/** Populate `ctx.lineCountMap` from cached parser results, falling back to disk. */ +function populateLineCountMap(ctx: PipelineContext): void { + const { fileSymbols, rootDir } = ctx; ctx.lineCountMap = new Map(); for (const [relPath, symbols] of fileSymbols) { const lineCount = (symbols as ExtractorOutput & { lineCount?: number }).lineCount ?? symbols._lineCount; if (lineCount) { ctx.lineCountMap.set(relPath, lineCount); - } else { - const absPath = path.join(rootDir, relPath); - try { - const content = readFileSafe(absPath); - ctx.lineCountMap.set(relPath, content.split('\n').length); - } catch { - ctx.lineCountMap.set(relPath, 0); - } + continue; + } + const absPath = path.join(rootDir, relPath); + try { + const content = readFileSafe(absPath); + ctx.lineCountMap.set(relPath, content.split('\n').length); + } catch { + ctx.lineCountMap.set(relPath, 0); } } +} - const changedFileList = isFullBuild ? null : [...allSymbols.keys()]; - - // For small incremental builds on large codebases, use a fast path that - // updates only the changed files' metrics via targeted SQL instead of - // loading ALL definitions from DB (~8ms) and recomputing ALL metrics (~15ms). - // Gate: ≤smallFilesThreshold changed files AND significantly more existing files (>20) to - // avoid triggering on small test fixtures where directory metrics matter. +/** Count file-kind nodes already in the DB, preferring the native connection. */ +function countExistingFiles(ctx: PipelineContext): number { const useNativeReads = ctx.engineName === 'native' && !!ctx.nativeDb; - const existingFileCount = !isFullBuild - ? ( - (useNativeReads - ? ctx.nativeDb!.queryGet("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'", []) - : db.prepare("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'").get()) as { - c: number; - } - ).c - : 0; - const useSmallIncrementalFastPath = - !isFullBuild && - changedFileList != null && - changedFileList.length <= ctx.config.build.smallFilesThreshold && - existingFileCount > 20; - - if (!isFullBuild && !useSmallIncrementalFastPath) { - // Medium/large incremental: load unchanged files from DB for complete structure - loadUnchangedFilesFromDb(ctx); - } + const row = ( + useNativeReads + ? ctx.nativeDb!.queryGet("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'", []) + : ctx.db.prepare("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'").get() + ) as { c: number }; + return row.c; +} - // Build directory structure - const t0 = performance.now(); +/** + * Build directory structure + metrics. Chooses between the fast incremental + * path (a handful of files changed on a large codebase) and the full path + * (delegated to `features/structure`). + */ +async function buildDirectoryStructure( + ctx: PipelineContext, + changedFileList: string[] | null, + useSmallIncrementalFastPath: boolean, +): Promise { if (useSmallIncrementalFastPath) { updateChangedFileMetrics(ctx, changedFileList!); - } else { - const relDirs = new Set(); - for (const absDir of discoveredDirs) { - relDirs.add(normalizePath(path.relative(rootDir, absDir))); - } - try { - const { buildStructure: buildStructureFn } = (await import( - '../../../../features/structure.js' - )) as { - buildStructure: ( - db: PipelineContext['db'], - fileSymbols: Map, - rootDir: string, - lineCountMap: Map, - directories: Set, - changedFiles: string[] | null, - ) => void; - }; - const changedFilePaths = isFullBuild ? null : [...allSymbols.keys()]; - buildStructureFn(db, fileSymbols, rootDir, ctx.lineCountMap, relDirs, changedFilePaths); - } catch (err) { - debug(`Structure analysis failed: ${(err as Error).message}`); - } + return; } - ctx.timing.structureMs = performance.now() - t0; - // Classify node roles (incremental: only reclassify changed files' nodes) - const t1 = performance.now(); + const { db, fileSymbols, rootDir, discoveredDirs, allSymbols, isFullBuild } = ctx; + const relDirs = new Set(); + for (const absDir of discoveredDirs) { + relDirs.add(normalizePath(path.relative(rootDir, absDir))); + } + try { + const { buildStructure: buildStructureFn } = (await import( + '../../../../features/structure.js' + )) as { + buildStructure: ( + db: PipelineContext['db'], + fileSymbols: Map, + rootDir: string, + lineCountMap: Map, + directories: Set, + changedFiles: string[] | null, + ) => void; + }; + const changedFilePaths = isFullBuild ? null : [...allSymbols.keys()]; + buildStructureFn(db, fileSymbols, rootDir, ctx.lineCountMap, relDirs, changedFilePaths); + } catch (err) { + debug(`Structure analysis failed: ${(err as Error).message}`); + } +} + +/** Convert a `NativeDatabase.classifyRoles*` result into the JS summary shape. */ +function nativeRoleSummaryToRecord( + nativeResult: NonNullable< + ReturnType['classifyRolesFull']> + >, +): Record { + return { + entry: nativeResult.entry, + core: nativeResult.core, + utility: nativeResult.utility, + adapter: nativeResult.adapter, + dead: nativeResult.dead, + 'dead-leaf': nativeResult.deadLeaf, + 'dead-entry': nativeResult.deadEntry, + 'dead-ffi': nativeResult.deadFfi, + 'dead-unresolved': nativeResult.deadUnresolved, + 'test-only': nativeResult.testOnly, + leaf: nativeResult.leaf, + }; +} + +async function classifyRoles( + ctx: PipelineContext, + changedFileList: string[] | null, +): Promise { + const useNativeReads = ctx.engineName === 'native' && !!ctx.nativeDb; try { let roleSummary: Record | null = null; @@ -103,24 +120,9 @@ export async function buildStructure(ctx: PipelineContext): Promise { changedFileList && changedFileList.length > 0 ? ctx.nativeDb.classifyRolesIncremental(changedFileList) : ctx.nativeDb.classifyRolesFull(); - if (nativeResult) { - roleSummary = { - entry: nativeResult.entry, - core: nativeResult.core, - utility: nativeResult.utility, - adapter: nativeResult.adapter, - dead: nativeResult.dead, - 'dead-leaf': nativeResult.deadLeaf, - 'dead-entry': nativeResult.deadEntry, - 'dead-ffi': nativeResult.deadFfi, - 'dead-unresolved': nativeResult.deadUnresolved, - 'test-only': nativeResult.testOnly, - leaf: nativeResult.leaf, - }; - } + if (nativeResult) roleSummary = nativeRoleSummaryToRecord(nativeResult); } - // Fall back to JS path if (!roleSummary) { const { classifyNodeRoles } = (await import('../../../../features/structure.js')) as { classifyNodeRoles: ( @@ -141,6 +143,37 @@ export async function buildStructure(ctx: PipelineContext): Promise { } catch (err) { debug(`Role classification failed: ${(err as Error).message}`); } +} + +export async function buildStructure(ctx: PipelineContext): Promise { + const { allSymbols, isFullBuild } = ctx; + + populateLineCountMap(ctx); + + const changedFileList = isFullBuild ? null : [...allSymbols.keys()]; + + // For small incremental builds on large codebases, use a fast path that + // updates only the changed files' metrics via targeted SQL instead of + // loading ALL definitions from DB (~8ms) and recomputing ALL metrics (~15ms). + // Gate: ≤smallFilesThreshold changed files AND significantly more existing files (>20) to + // avoid triggering on small test fixtures where directory metrics matter. + const existingFileCount = !isFullBuild ? countExistingFiles(ctx) : 0; + const useSmallIncrementalFastPath = + !isFullBuild && + changedFileList != null && + changedFileList.length <= ctx.config.build.smallFilesThreshold && + existingFileCount > 20; + + if (!isFullBuild && !useSmallIncrementalFastPath) { + loadUnchangedFilesFromDb(ctx); + } + + const t0 = performance.now(); + await buildDirectoryStructure(ctx, changedFileList, useSmallIncrementalFastPath); + ctx.timing.structureMs = performance.now() - t0; + + const t1 = performance.now(); + await classifyRoles(ctx, changedFileList); ctx.timing.rolesMs = performance.now() - t1; } diff --git a/src/domain/graph/builder/stages/detect-changes.ts b/src/domain/graph/builder/stages/detect-changes.ts index cc51155dc..222d92e42 100644 --- a/src/domain/graph/builder/stages/detect-changes.ts +++ b/src/domain/graph/builder/stages/detect-changes.ts @@ -162,14 +162,14 @@ function tryJournalTier( return { changed, removed: [...removedSet], isFullBuild: false }; } -function mtimeAndHashTiers( +/** Tier 1: mtime+size triage. Returns the files that still need hashing. */ +function tierMtimeSize( existing: Map, allFiles: string[], rootDir: string, - removed: string[], -): ChangeResult { +): { needsHash: NeedsHashItem[]; skipped: number } { const needsHash: NeedsHashItem[] = []; - const skipped: string[] = []; + let skipped = 0; for (const file of allFiles) { const relPath = normalizePath(path.relative(rootDir, file)); @@ -183,16 +183,17 @@ function mtimeAndHashTiers( const storedMtime = record.mtime || 0; const storedSize = record.size || 0; if (storedSize > 0 && stat.mtime === storedMtime && stat.size === storedSize) { - skipped.push(relPath); + skipped++; continue; } needsHash.push({ file, relPath, stat }); } - if (needsHash.length > 0) { - debug(`Tier 1: ${skipped.length} skipped by mtime+size, ${needsHash.length} need hash check`); - } + return { needsHash, skipped }; +} +/** Tier 2: hash candidates from tier 1, classifying changed vs metadata-only. */ +function tierHash(existing: Map, needsHash: NeedsHashItem[]): ChangedFile[] { const changed: ChangedFile[] = []; for (const item of needsHash) { let content: string | undefined; @@ -217,11 +218,26 @@ function mtimeAndHashTiers( }); } } + return changed; +} + +function mtimeAndHashTiers( + existing: Map, + allFiles: string[], + rootDir: string, + removed: string[], +): ChangeResult { + const { needsHash, skipped } = tierMtimeSize(existing, allFiles, rootDir); + if (needsHash.length > 0) { + debug(`Tier 1: ${skipped} skipped by mtime+size, ${needsHash.length} need hash check`); + } + + const changed = tierHash(existing, needsHash); - const parseChanged = changed.filter((c) => !c.metadataOnly); if (needsHash.length > 0) { + const parseChangedLen = changed.filter((c) => !c.metadataOnly).length; debug( - `Tier 2: ${parseChanged.length} actually changed, ${changed.length - parseChanged.length} metadata-only`, + `Tier 2: ${parseChangedLen} actually changed, ${changed.length - parseChangedLen} metadata-only`, ); } @@ -512,61 +528,43 @@ function handleIncrementalBuild(ctx: PipelineContext): void { purgeAndAddReverseDeps(ctx, changePaths, reverseDeps); } -/** - * Read-only pre-flight check for the native orchestrator. - * - * Returns true iff every collected source file has matching mtime+size in - * `file_hashes` and no DB-tracked file has been removed. When true, the - * caller can short-circuit before invoking the native orchestrator — - * matching WASM's ~20 ms early-exit path and avoiding the ~2s flat - * per-call native rebuild overhead seen in CI (#1054). - * - * Intentionally Tier-0/Tier-1 only (journal + mtime/size). Tier-2 content - * hashing is left to the native side: when this returns false the caller - * falls through to the orchestrator, which performs its own complete - * detection and is the source of truth. - * - * Conservatively returns false when CFG or dataflow analysis is enabled - * but the corresponding tables are empty — otherwise the fast-skip would - * silently suppress the pending-analysis pass that the JS path runs via - * `runPendingAnalysis`, and CFG/dataflow data would never populate on - * repos where source files don't change between builds. - * - * Pure read of `db` and the filesystem — never mutates either. - */ -export function detectNoChanges( - db: BetterSqlite3Database, - allFiles: string[], - rootDir: string, - opts?: Record, -): boolean { - // Diagnostic logging gated by env var — used by the bench gate to surface - // why the fast-skip is not firing on CI runners (#1066). Off by default to - // avoid noise on every regular incremental build. +/** Diagnostic logger gated by env var, used by both `detectNoChanges` branches. */ +function makeFastSkipLogger(): (reason: string) => void { const diag = process.env.CODEGRAPH_FAST_SKIP_DIAG === '1'; - const log = (reason: string): void => { + return (reason: string): void => { if (diag) info(`[fast-skip] ${reason}`); }; +} - let hasTable = false; +/** + * Load the `file_hashes` table for the no-change pre-flight. Returns null + * if the table is missing or empty (both → caller must fall through). + */ +function loadFileHashesForPreflight( + db: BetterSqlite3Database, + log: (reason: string) => void, +): Map | null { try { db.prepare('SELECT 1 FROM file_hashes LIMIT 1').get(); - hasTable = true; } catch { - /* table missing — first build */ - } - if (!hasTable) { log('false: file_hashes table missing'); - return false; + return null; } - const rows = db.prepare('SELECT file, hash, mtime, size FROM file_hashes').all() as FileHashRow[]; if (rows.length === 0) { log('false: file_hashes table empty'); - return false; + return null; } - const existing = new Map(rows.map((r) => [r.file, r])); + return new Map(rows.map((r) => [r.file, r])); +} +/** Returns true iff every file in `allFiles` matches a stored mtime+size record. */ +function allFilesMatchStoredStat( + existing: Map, + allFiles: string[], + rootDir: string, + log: (reason: string) => void, +): boolean { const currentFiles = new Set(); for (const file of allFiles) { currentFiles.add(normalizePath(path.relative(rootDir, file))); @@ -603,21 +601,66 @@ export function detectNoChanges( return false; } } + return true; +} - // Pending-analysis guard: if CFG/dataflow is enabled but the corresponding - // table is empty (analysis newly enabled, or tables wiped between builds), - // fall through so the orchestrator / JS pipeline can run runPendingAnalysis. - // Mirrors the check at the top of runPendingAnalysis (see line ~244). - if (opts) { - if (opts.cfg !== false && hasEmptyAnalysisTable(db, 'cfg_blocks')) { - log('false: pending-analysis guard — cfg_blocks is empty'); - return false; - } - if (opts.dataflow !== false && hasEmptyAnalysisTable(db, 'dataflow')) { - log('false: pending-analysis guard — dataflow is empty'); - return false; - } +/** + * Pending-analysis guard: if CFG/dataflow is enabled but the corresponding + * table is empty (analysis newly enabled, or tables wiped between builds), + * fall through so the orchestrator / JS pipeline can run runPendingAnalysis. + * Mirrors the check at the top of runPendingAnalysis. + */ +function passesPendingAnalysisGuard( + db: BetterSqlite3Database, + opts: Record | undefined, + log: (reason: string) => void, +): boolean { + if (!opts) return true; + if (opts.cfg !== false && hasEmptyAnalysisTable(db, 'cfg_blocks')) { + log('false: pending-analysis guard — cfg_blocks is empty'); + return false; } + if (opts.dataflow !== false && hasEmptyAnalysisTable(db, 'dataflow')) { + log('false: pending-analysis guard — dataflow is empty'); + return false; + } + return true; +} + +/** + * Read-only pre-flight check for the native orchestrator. + * + * Returns true iff every collected source file has matching mtime+size in + * `file_hashes` and no DB-tracked file has been removed. When true, the + * caller can short-circuit before invoking the native orchestrator — + * matching WASM's ~20 ms early-exit path and avoiding the ~2s flat + * per-call native rebuild overhead seen in CI (#1054). + * + * Intentionally Tier-0/Tier-1 only (journal + mtime/size). Tier-2 content + * hashing is left to the native side: when this returns false the caller + * falls through to the orchestrator, which performs its own complete + * detection and is the source of truth. + * + * Conservatively returns false when CFG or dataflow analysis is enabled + * but the corresponding tables are empty — otherwise the fast-skip would + * silently suppress the pending-analysis pass that the JS path runs via + * `runPendingAnalysis`, and CFG/dataflow data would never populate on + * repos where source files don't change between builds. + * + * Pure read of `db` and the filesystem — never mutates either. + */ +export function detectNoChanges( + db: BetterSqlite3Database, + allFiles: string[], + rootDir: string, + opts?: Record, +): boolean { + const log = makeFastSkipLogger(); + const existing = loadFileHashesForPreflight(db, log); + if (!existing) return false; + + if (!allFilesMatchStoredStat(existing, allFiles, rootDir, log)) return false; + if (!passesPendingAnalysisGuard(db, opts, log)) return false; log(`true: all checks passed (${allFiles.length} files)`); return true; diff --git a/src/domain/graph/builder/stages/finalize.ts b/src/domain/graph/builder/stages/finalize.ts index d59fe016a..ab2e1d429 100644 --- a/src/domain/graph/builder/stages/finalize.ts +++ b/src/domain/graph/builder/stages/finalize.ts @@ -136,82 +136,72 @@ function persistBuildMetadata( } } -/** - * Run advisory checks on full builds: orphaned embeddings, stale embeddings, - * and unused exports. Informational only — does not affect correctness. - */ -function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNow: Date): void { - // Batched native path: single napi call for all 3 advisory checks - if (ctx.engineName === 'native' && ctx.nativeDb?.runAdvisoryChecks) { - const result = ctx.nativeDb.runAdvisoryChecks(hasEmbeddings); - if (result.orphanedEmbeddings > 0) { - warn( - `${result.orphanedEmbeddings} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, - ); - } - if (result.embedBuiltAt) { - const embedTime = new Date(result.embedBuiltAt).getTime(); - if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { - warn( - 'Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.', - ); - } - } - if (result.unusedExports > 0) { - warn( - `${result.unusedExports} exported symbol${result.unusedExports > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`, - ); +/** Format the "X exports have zero consumers" warning, with correct plural agreement. */ +function unusedExportsMessage(count: number): string { + return `${count} exported symbol${count > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`; +} + +/** Run all three advisory checks via the batched native FFI. */ +function runAdvisoryChecksNative( + ctx: PipelineContext, + hasEmbeddings: boolean, + buildNow: Date, +): void { + const result = ctx.nativeDb!.runAdvisoryChecks!(hasEmbeddings); + if (result.orphanedEmbeddings > 0) { + warn( + `${result.orphanedEmbeddings} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, + ); + } + if (result.embedBuiltAt) { + const embedTime = new Date(result.embedBuiltAt).getTime(); + if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { + warn('Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.'); } - return; } + if (result.unusedExports > 0) { + warn(unusedExportsMessage(result.unusedExports)); + } +} - const { db } = ctx; - - // Orphaned embeddings warning - if (hasEmbeddings) { - try { - const orphaned = ( - db - .prepare( - 'SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)', - ) - .get() as { c: number } - ).c; - if (orphaned > 0) { - warn( - `${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, - ); - } - } catch { - /* ignore - embeddings table may have been dropped */ +function checkOrphanedEmbeddings(ctx: PipelineContext): void { + try { + const orphaned = ( + ctx.db + .prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)') + .get() as { c: number } + ).c; + if (orphaned > 0) { + warn( + `${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, + ); } + } catch { + /* ignore - embeddings table may have been dropped */ } +} - // Stale embeddings warning (built before current graph rebuild) - if (hasEmbeddings) { - try { - const embedBuiltAt = ( - db.prepare("SELECT value FROM embedding_meta WHERE key = 'built_at'").get() as - | { value: string } - | undefined - )?.value; - if (embedBuiltAt) { - const embedTime = new Date(embedBuiltAt).getTime(); - if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { - warn( - 'Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.', - ); - } - } - } catch { - /* ignore - embedding_meta table may not exist */ +function checkStaleEmbeddings(ctx: PipelineContext, buildNow: Date): void { + try { + const embedBuiltAt = ( + ctx.db.prepare("SELECT value FROM embedding_meta WHERE key = 'built_at'").get() as + | { value: string } + | undefined + )?.value; + if (!embedBuiltAt) return; + const embedTime = new Date(embedBuiltAt).getTime(); + if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { + warn('Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.'); } + } catch { + /* ignore - embedding_meta table may not exist */ } +} - // Unused exports warning +function checkUnusedExports(ctx: PipelineContext): void { try { const unusedCount = ( - db + ctx.db .prepare( `SELECT COUNT(*) as c FROM nodes WHERE exported = 1 AND kind != 'file' @@ -224,16 +214,28 @@ function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNo ) .get() as { c: number } ).c; - if (unusedCount > 0) { - warn( - `${unusedCount} exported symbol${unusedCount > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`, - ); - } + if (unusedCount > 0) warn(unusedExportsMessage(unusedCount)); } catch { /* exported column may not exist on older DBs */ } } +/** + * Run advisory checks on full builds: orphaned embeddings, stale embeddings, + * and unused exports. Informational only — does not affect correctness. + */ +function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNow: Date): void { + if (ctx.engineName === 'native' && ctx.nativeDb?.runAdvisoryChecks) { + runAdvisoryChecksNative(ctx, hasEmbeddings, buildNow); + return; + } + if (hasEmbeddings) { + checkOrphanedEmbeddings(ctx); + checkStaleEmbeddings(ctx, buildNow); + } + checkUnusedExports(ctx); +} + export async function finalize(ctx: PipelineContext): Promise { const { allSymbols, rootDir, isFullBuild, hasEmbeddings, opts } = ctx; diff --git a/src/domain/graph/builder/stages/insert-nodes.ts b/src/domain/graph/builder/stages/insert-nodes.ts index 88e403ec9..09aad25d8 100644 --- a/src/domain/graph/builder/stages/insert-nodes.ts +++ b/src/domain/graph/builder/stages/insert-nodes.ts @@ -92,23 +92,69 @@ function marshalSymbolBatches(allSymbols: Map): InsertN return batches; } +/** A single file_hashes row. */ +interface FileHashRecord { + file: string; + hash: string; + mtime: number; + size: number; +} + +/** Resolve the (hash, mtime, size) tuple for a relPath, reading from disk if needed. */ +function resolveHashFromPrecomputed( + relPath: string, + precomputed: PrecomputedFileData, + rootDir: string, + caller: string, +): FileHashRecord | null { + if (precomputed.hash) { + let mtime: number; + let size: number; + if (precomputed.stat) { + mtime = precomputed.stat.mtime; + size = precomputed.stat.size; + } else { + const rawStat = fileStat(path.join(rootDir, relPath)); + mtime = rawStat ? rawStat.mtime : 0; + size = rawStat ? rawStat.size : 0; + } + return { file: relPath, hash: precomputed.hash, mtime, size }; + } + + const absPath = path.join(rootDir, relPath); + let code: string | null; + try { + code = readFileSafe(absPath); + } catch (e) { + debug(`${caller}: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); + code = null; + } + if (code === null) return null; + const stat = fileStat(absPath); + return { + file: relPath, + hash: fileHash(code), + mtime: stat ? stat.mtime : 0, + size: stat ? stat.size : 0, + }; +} + /** - * Build file hash entries for every collected file, including those that - * produced zero symbols (empty files, parsers that silently no-op'd, or - * optional-language extensions whose grammar wasn't installed). Iterating the - * symbol map instead would skip such files and leave them missing from - * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on - * any subsequent no-op rebuild (#1068). + * Walk every collected file once and yield a `FileHashRecord` for it, plus one + * record per metadata-only update. Shared by `buildFileHashes` (native path) + * and `updateFileHashes` (JS fallback) so the iteration and hash-resolution + * logic stays in one place. * - * Exported for unit testing. + * Files marked `_reverseDepOnly` are skipped — their hashes are already + * correct in the DB. */ -export function buildFileHashes( +function* iterFileHashRecords( filesToParse: FileToParse[], precomputedData: Map, metadataUpdates: MetadataUpdate[], rootDir: string, -): Array<{ file: string; hash: string; mtime: number; size: number }> { - const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = []; + caller: string, +): Generator { const seen = new Set(); for (const item of filesToParse) { @@ -117,47 +163,53 @@ export function buildFileHashes( seen.add(relPath); const precomputed = precomputedData.get(relPath); - if (precomputed?._reverseDepOnly) { - continue; // file unchanged, hash already correct - } - if (precomputed?.hash) { - let mtime: number; - let size: number; - if (precomputed.stat) { - mtime = precomputed.stat.mtime; - size = precomputed.stat.size; - } else { - const rawStat = fileStat(path.join(rootDir, relPath)); - mtime = rawStat ? rawStat.mtime : 0; - size = rawStat ? rawStat.size : 0; - } - fileHashes.push({ file: relPath, hash: precomputed.hash, mtime, size }); - } else { - const absPath = path.join(rootDir, relPath); - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`buildFileHashes: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); - code = null; - } - if (code !== null) { - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - fileHashes.push({ file: relPath, hash: fileHash(code), mtime, size }); - } - } + if (precomputed?._reverseDepOnly) continue; + + const record = resolveHashFromPrecomputed( + relPath, + precomputed ?? ({} as PrecomputedFileData), + rootDir, + caller, + ); + if (record) yield record; } - // Also include metadata-only updates (self-heal mtime/size without re-parse) + // Metadata-only updates (self-heal mtime/size without re-parse) for (const item of metadataUpdates) { - const mtime = item.stat ? item.stat.mtime : 0; - const size = item.stat ? item.stat.size : 0; - fileHashes.push({ file: item.relPath, hash: item.hash, mtime, size }); + yield { + file: item.relPath, + hash: item.hash, + mtime: item.stat ? item.stat.mtime : 0, + size: item.stat ? item.stat.size : 0, + }; } +} - return fileHashes; +/** + * Build file hash entries for every collected file, including those that + * produced zero symbols (empty files, parsers that silently no-op'd, or + * optional-language extensions whose grammar wasn't installed). Iterating the + * symbol map instead would skip such files and leave them missing from + * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on + * any subsequent no-op rebuild (#1068). + * + * Exported for unit testing. + */ +export function buildFileHashes( + filesToParse: FileToParse[], + precomputedData: Map, + metadataUpdates: MetadataUpdate[], + rootDir: string, +): FileHashRecord[] { + return [ + ...iterFileHashRecords( + filesToParse, + precomputedData, + metadataUpdates, + rootDir, + 'buildFileHashes', + ), + ]; } // ── Native fast-path ───────────────────────────────────────────────── @@ -260,36 +312,38 @@ function insertDefinitionsAndExports( // ── JS fallback: Phase 2+3 ────────────────────────────────────────── -function insertChildrenAndEdges( +/** Build the in-memory `name|kind|line` → node-id map for a single file. */ +function loadFileNodeIdMap(db: BetterSqlite3Database, relPath: string): Map { + const map = new Map(); + for (const row of bulkNodeIdsByFile(db, relPath)) { + map.set(`${row.name}|${row.kind}|${row.line}`, row.id); + } + return map; +} + +/** + * First pass: for every file, emit file→def containment edges and collect + * the child-node insertion rows. + */ +function collectChildRowsAndFileEdges( db: BetterSqlite3Database, allSymbols: Map, + childRows: unknown[][], + edgeRows: unknown[][], ): void { - const childRows: unknown[][] = []; - const edgeRows: unknown[][] = []; - for (const [relPath, symbols] of allSymbols) { - // First pass: collect file→def edges and child rows - const nodeIdMap = new Map(); - for (const row of bulkNodeIdsByFile(db, relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } - + const nodeIdMap = loadFileNodeIdMap(db, relPath); const fileId = nodeIdMap.get(`${relPath}|file|0`); for (const def of symbols.definitions) { const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); - // Containment edge: file -> definition if (fileId && defId) { edgeRows.push([fileId, defId, 'contains', 1.0, 0]); } - - if (!def.children?.length) continue; - if (!defId) continue; + if (!def.children?.length || !defId) continue; for (const child of def.children) { - // Child node - const qualifiedName = `${def.name}.${child.name}`; childRows.push([ child.name, child.kind, @@ -297,39 +351,55 @@ function insertChildrenAndEdges( child.line, child.endLine || null, defId, - qualifiedName, + `${def.name}.${child.name}`, def.name, child.visibility || null, ]); } } } +} - // Insert children first (so they exist for edge lookup) - batchInsertNodes(db, childRows); - - // Now re-fetch IDs to include newly-inserted children, then add child edges +/** + * Second pass (after child nodes have been inserted): emit def→child + * containment edges and child→def `parameter_of` edges. + */ +function collectChildEdges( + db: BetterSqlite3Database, + allSymbols: Map, + edgeRows: unknown[][], +): void { for (const [relPath, symbols] of allSymbols) { - const nodeIdMap = new Map(); - for (const row of bulkNodeIdsByFile(db, relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } + const nodeIdMap = loadFileNodeIdMap(db, relPath); for (const def of symbols.definitions) { if (!def.children?.length) continue; const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); if (!defId) continue; for (const child of def.children) { const childId = nodeIdMap.get(`${child.name}|${child.kind}|${child.line}`); - if (childId) { - edgeRows.push([defId, childId, 'contains', 1.0, 0]); - if (child.kind === 'parameter') { - edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]); - } + if (!childId) continue; + edgeRows.push([defId, childId, 'contains', 1.0, 0]); + if (child.kind === 'parameter') { + edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]); } } } } +} + +function insertChildrenAndEdges( + db: BetterSqlite3Database, + allSymbols: Map, +): void { + const childRows: unknown[][] = []; + const edgeRows: unknown[][] = []; + collectChildRowsAndFileEdges(db, allSymbols, childRows, edgeRows); + + // Insert children first (so they exist for edge lookup) + batchInsertNodes(db, childRows); + + collectChildEdges(db, allSymbols, edgeRows); batchInsertEdges(db, edgeRows); } @@ -348,50 +418,14 @@ function updateFileHashes( // Iterate every collected file (#1068): files that produced zero symbols // (empty, parser no-op, or grammar-missing optional language) still need a // hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects. - const seen = new Set(); - for (const item of filesToParse) { - const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file)); - if (seen.has(relPath)) continue; - seen.add(relPath); - - const precomputed = precomputedData.get(relPath); - if (precomputed?._reverseDepOnly) { - // no-op: file unchanged, hash already correct - } else if (precomputed?.hash) { - let mtime: number; - let size: number; - if (precomputed.stat) { - mtime = precomputed.stat.mtime; - size = precomputed.stat.size; - } else { - const rawStat = fileStat(path.join(rootDir, relPath)); - mtime = rawStat ? rawStat.mtime : 0; - size = rawStat ? rawStat.size : 0; - } - upsertHash.run(relPath, precomputed.hash, mtime, size); - } else { - const absPath = path.join(rootDir, relPath); - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`updateFileHashes: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); - code = null; - } - if (code !== null) { - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, fileHash(code), mtime, size); - } - } - } - - // Also update metadata-only entries (self-heal mtime/size without re-parse) - for (const item of metadataUpdates) { - const mtime = item.stat ? item.stat.mtime : 0; - const size = item.stat ? item.stat.size : 0; - upsertHash.run(item.relPath, item.hash, mtime, size); + for (const record of iterFileHashRecords( + filesToParse, + precomputedData, + metadataUpdates, + rootDir, + 'updateFileHashes', + )) { + upsertHash.run(record.file, record.hash, record.mtime, record.size); } } diff --git a/src/domain/graph/builder/stages/native-db-lifecycle.ts b/src/domain/graph/builder/stages/native-db-lifecycle.ts new file mode 100644 index 000000000..ac9e2568f --- /dev/null +++ b/src/domain/graph/builder/stages/native-db-lifecycle.ts @@ -0,0 +1,74 @@ +/** + * NativeDatabase connection lifecycle helpers. + * + * The Rust orchestrator and the JS pipeline stages both juggle the same + * `nativeDb` handle (rusqlite) alongside `ctx.db` (better-sqlite3). These + * helpers centralise the open/close/reopen sequence so both call sites + * preserve the same WAL-safety invariants: + * + * - Always checkpoint WAL before closing rusqlite — otherwise better-sqlite3's + * internal WAL index can drift and surface as SQLITE_CORRUPT on the next + * read (#715, #736). + * - Always reopen better-sqlite3 after rusqlite writes to drop the stale + * page cache. + * + * Lives in its own module so `tryNativeOrchestrator` (in `native-orchestrator.ts`) + * and the JS pipeline stages driver (in `pipeline.ts`) can share the helpers + * without either file importing the other. + */ +import { openDb } from '../../../../db/index.js'; +import { debug } from '../../../../infrastructure/logger.js'; +import { loadNative } from '../../../../infrastructure/native.js'; +import { toErrorMessage } from '../../../../shared/errors.js'; +import type { PipelineContext } from '../context.js'; + +/** Checkpoint WAL through rusqlite and close the native connection. */ +export function closeNativeDb(ctx: PipelineContext, label: string): void { + if (!ctx.nativeDb) return; + try { + ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch (e) { + debug(`${label} WAL checkpoint failed: ${toErrorMessage(e)}`); + } + try { + ctx.nativeDb.close(); + } catch (e) { + debug(`${label} nativeDb close failed: ${toErrorMessage(e)}`); + } + ctx.nativeDb = undefined; +} + +/** Try to reopen the native connection for a given pipeline phase. */ +export function reopenNativeDb(ctx: PipelineContext, label: string): void { + if ((ctx.opts.engine ?? 'auto') === 'wasm') return; + const native = loadNative(); + if (!native?.NativeDatabase) return; + try { + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + } catch (e) { + debug(`reopen nativeDb for ${label} failed: ${toErrorMessage(e)}`); + ctx.nativeDb = undefined; + } +} + +/** Close nativeDb and clear stale references in engineOpts. */ +export function suspendNativeDb(ctx: PipelineContext, label: string): void { + closeNativeDb(ctx, label); + if (ctx.engineOpts?.nativeDb) { + ctx.engineOpts.nativeDb = undefined; + } +} + +/** + * After native writes, reopen the JS db connection to get a fresh page cache. + * Rusqlite WAL truncation invalidates better-sqlite3's internal WAL index, + * causing SQLITE_CORRUPT on the next read (#715, #736). + */ +export function refreshJsDb(ctx: PipelineContext): void { + try { + ctx.db.close(); + } catch (e) { + debug(`refreshJsDb close failed: ${toErrorMessage(e)}`); + } + ctx.db = openDb(ctx.dbPath); +} diff --git a/src/domain/graph/builder/stages/native-orchestrator.ts b/src/domain/graph/builder/stages/native-orchestrator.ts new file mode 100644 index 000000000..934dd8d05 --- /dev/null +++ b/src/domain/graph/builder/stages/native-orchestrator.ts @@ -0,0 +1,942 @@ +/** + * Native build orchestrator stage — runs the full Rust pipeline when available, + * with WASM fallback for files the native engine drops. + * + * Extracted from `pipeline.ts` to break the name-collision cycle between + * `buildGraph()` (this module's caller) and `ctx.nativeDb.buildGraph()` (the + * Rust orchestrator entry point invoked here). Codegraph's name-based call + * resolver previously conflated the two and reported a false-positive + * function-level cycle (`buildGraph ↔ tryNativeOrchestrator`). + * + * The orchestrator-selection strategy lives here so `pipeline.ts` stays a thin + * top-level controller: detect changes, try native, fall back to JS stages. + */ +import path from 'node:path'; +import { performance } from 'node:perf_hooks'; +import { + acquireAdvisoryLock, + closeDbPair, + openDb, + purgeFilesData, + releaseAdvisoryLock, + setBuildMeta, +} from '../../../../db/index.js'; +import { debug, info, warn } from '../../../../infrastructure/logger.js'; +import { loadNative } from '../../../../infrastructure/native.js'; +import { semverCompare } from '../../../../infrastructure/update-check.js'; +import { normalizePath } from '../../../../shared/constants.js'; +import { toErrorMessage } from '../../../../shared/errors.js'; +import { CODEGRAPH_VERSION } from '../../../../shared/version.js'; +import type { + BetterSqlite3Database, + BuildResult, + Definition, + ExtractorOutput, + SqliteStatement, +} from '../../../../types.js'; +import { + classifyNativeDrops, + formatDropExtensionSummary, + getInstalledWasmExtensions, + NATIVE_SUPPORTED_EXTENSIONS, + parseFilesWasmForBackfill, +} from '../../../parser.js'; +import type { PipelineContext } from '../context.js'; +import { + batchInsertNodes, + collectFiles as collectFilesUtil, + fileHash, + fileStat, + readFileSafe, +} from '../helpers.js'; +import { NativeDbProxy } from '../native-db-proxy.js'; +import { closeNativeDb } from './native-db-lifecycle.js'; + +// ── Native orchestrator types ────────────────────────────────────────── + +interface NativeOrchestratorResult { + phases: Record; + earlyExit?: boolean; + nodeCount?: number; + edgeCount?: number; + fileCount?: number; + changedFiles?: string[]; + changedCount?: number; + removedCount?: number; + isFullBuild?: boolean; + /** Whether the Rust pipeline handled the structure phase (small-incremental fast path). */ + structureHandled?: boolean; + /** Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to DB. */ + analysisComplete?: boolean; +} + +/** Files the native orchestrator silently dropped — the working set for backfill. */ +interface DroppedLanguageGap { + /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ + missingRel: string[]; + /** Absolute paths, aligned by index with `missingRel`. */ + missingAbs: string[]; + /** + * Relative paths of WASM-only files present in DB but absent from disk (#1073). + * Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side + * backfill must purge them. Always disjoint from `missingRel`. + */ + staleRel: string[]; +} + +/** + * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper + * is pure and unit-testable independently of `getInstalledWasmExtensions` and + * the `NATIVE_SUPPORTED_EXTENSIONS` global state. + */ +export interface WasmOnlyStaleFilesInput { + /** Distinct `file` values from the `nodes` table. */ + existingNodes: ReadonlySet; + /** Distinct `file` values from the `file_hashes` table. */ + existingHashes: ReadonlySet; + /** Relative paths currently on disk (from `collectFilesUtil`). */ + expected: ReadonlySet; + /** Lowercased extensions whose WASM grammar is installed. */ + installedExts: ReadonlySet; + /** Extensions covered by the Rust addon — Rust owns deletion for these. */ + nativeSupported: ReadonlySet; +} + +// ── Native orchestrator helpers ─────────────────────────────────────── + +/** Determine whether the native orchestrator should be skipped. Returns a reason string, or null if it should run. */ +function shouldSkipNativeOrchestrator(ctx: PipelineContext): string | null { + if (ctx.forceFullRebuild) return 'forceFullRebuild'; + // v3.9.0 addon had buggy incremental purge (wrong SQL on analysis tables, + // scoped removal over-detection). Fixed in v3.9.1 by PR #865. Gate on + // < 3.9.1 so v3.9.1+ uses the fast Rust orchestrator path. + const orchestratorBuggy = !!ctx.engineVersion && semverCompare(ctx.engineVersion, '3.9.1') < 0; + if (orchestratorBuggy) return `buggy addon ${ctx.engineVersion}`; + if (ctx.engineName !== 'native') return `engine=${ctx.engineName}`; + return null; +} + +/** Checkpoint WAL through rusqlite, close nativeDb, and reopen better-sqlite3. + * Returns false if the DB reopen fails (caller should return partial result). */ +function handoffWalAfterNativeBuild(ctx: PipelineContext): boolean { + closeNativeDb(ctx, 'post-native-build'); + try { + ctx.db.close(); + } catch (e) { + debug(`handoffWal JS db close failed: ${toErrorMessage(e)}`); + } + try { + ctx.db = openDb(ctx.dbPath); + return true; + } catch (reopenErr) { + warn(`Failed to reopen DB after native build: ${(reopenErr as Error).message}`); + return false; + } +} + +/** + * Reconstruct fileSymbols from the DB after a native orchestrator build. + * When `scopeFiles` is provided, only loads those files (for analysis-only). + * When omitted, loads all files (needed for structure rebuilds). + */ +function reconstructFileSymbolsFromDb( + ctx: PipelineContext, + scopeFiles?: string[], +): Map { + let query = + 'SELECT file, name, kind, line, end_line as endLine FROM nodes WHERE file IS NOT NULL'; + const params: string[] = []; + if (scopeFiles && scopeFiles.length > 0) { + const placeholders = scopeFiles.map(() => '?').join(','); + query += ` AND file IN (${placeholders})`; + params.push(...scopeFiles); + } + query += ' ORDER BY file, line'; + + const rows = ctx.db.prepare(query).all(...params) as { + file: string; + name: string; + kind: string; + line: number; + endLine: number | null; + }[]; + + const fileSymbols = new Map(); + for (const row of rows) { + let entry = fileSymbols.get(row.file); + if (!entry) { + entry = { + definitions: [], + calls: [], + imports: [], + classes: [], + exports: [], + typeMap: new Map(), + }; + fileSymbols.set(row.file, entry); + } + entry.definitions.push({ + name: row.name, + kind: row.kind as Definition['kind'], + line: row.line, + endLine: row.endLine ?? undefined, + }); + } + + // Populate import/export counts from DB edges so buildStructure + // computes correct import_count/export_count in node_metrics. + // The extractor arrays aren't persisted to the DB, so we derive + // counts from edge data instead (#804). + const importCountRows = ctx.db + .prepare( + `SELECT n.file, COUNT(*) AS cnt + FROM edges e JOIN nodes n ON e.source_id = n.id + WHERE e.kind IN ('imports', 'imports-type', 'dynamic-imports') + AND n.file IS NOT NULL + GROUP BY n.file`, + ) + .all() as { file: string; cnt: number }[]; + for (const row of importCountRows) { + const entry = fileSymbols.get(row.file); + if (entry) entry.imports = new Array(row.cnt) as ExtractorOutput['imports']; + } + + const exportCountRows = ctx.db + .prepare( + `SELECT n_tgt.file, COUNT(DISTINCT n_tgt.id) AS cnt + FROM edges e + JOIN nodes n_tgt ON e.target_id = n_tgt.id + JOIN nodes n_src ON e.source_id = n_src.id + WHERE e.kind IN ('imports', 'imports-type', 'reexports') + AND n_tgt.file IS NOT NULL + AND n_src.file != n_tgt.file + GROUP BY n_tgt.file`, + ) + .all() as { file: string; cnt: number }[]; + for (const row of exportCountRows) { + const entry = fileSymbols.get(row.file); + if (entry) entry.exports = new Array(row.cnt) as ExtractorOutput['exports']; + } + + return fileSymbols; +} + +/** + * Run JS buildStructure() after native orchestrator to fill directory nodes + contains edges. + * For full builds, passes changedFiles=null (full rebuild). + * For incremental builds, passes the changed file list to scope the update. + */ +async function runPostNativeStructure( + ctx: PipelineContext, + allFileSymbols: Map, + isFullBuild: boolean, + changedFiles: string[] | undefined, +): Promise { + const structureStart = performance.now(); + try { + const directories = new Set(); + for (const relPath of allFileSymbols.keys()) { + const parts = relPath.split('/'); + for (let i = 1; i < parts.length; i++) { + directories.add(parts.slice(0, i).join('/')); + } + } + + const lineCountMap = new Map(); + const cachedLineCounts = ctx.db + .prepare( + `SELECT n.name AS file, m.line_count + FROM node_metrics m JOIN nodes n ON m.node_id = n.id + WHERE n.kind = 'file'`, + ) + .all() as Array<{ file: string; line_count: number }>; + for (const row of cachedLineCounts) { + lineCountMap.set(row.file, row.line_count); + } + + // Full builds need null (rebuild everything). Incremental builds pass the + // changed file list so buildStructure only updates those files' metrics + // and contains edges — matching the JS pipeline's medium-incremental path. + const changedFilePaths = isFullBuild || !changedFiles?.length ? null : changedFiles; + const { buildStructure: buildStructureFn } = (await import( + '../../../../features/structure.js' + )) as { + buildStructure: ( + db: typeof ctx.db, + fileSymbols: Map, + rootDir: string, + lineCountMap: Map, + directories: Set, + changedFiles: string[] | null, + ) => void; + }; + buildStructureFn( + ctx.db, + allFileSymbols, + ctx.rootDir, + lineCountMap, + directories, + changedFilePaths, + ); + debug( + `Structure phase completed after native orchestrator${changedFilePaths ? ` (${changedFilePaths.length} files)` : ' (full)'}`, + ); + } catch (err) { + warn(`Structure phase failed after native build: ${toErrorMessage(err)}`); + } + return performance.now() - structureStart; +} + +/** + * JS fallback for AST/complexity/CFG/dataflow analysis after native orchestrator. + * Used when the Rust addon doesn't include analysis persistence (older addon + * version) or when analysis failed on the Rust side. + */ +async function runPostNativeAnalysis( + ctx: PipelineContext, + allFileSymbols: Map, + changedFiles: string[] | undefined, +): Promise<{ astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }> { + const timing = { astMs: 0, complexityMs: 0, cfgMs: 0, dataflowMs: 0 }; + + // Scope analysis fileSymbols to changed files only + let analysisFileSymbols: Map; + if (changedFiles && changedFiles.length > 0) { + analysisFileSymbols = new Map(); + for (const f of changedFiles) { + const entry = allFileSymbols.get(f); + if (entry) analysisFileSymbols.set(f, entry); + } + } else { + analysisFileSymbols = allFileSymbols; + } + + // Reopen nativeDb for analysis features (suspend/resume WAL pattern). + const native = loadNative(); + if (native?.NativeDatabase) { + try { + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + if (ctx.engineOpts) ctx.engineOpts.nativeDb = ctx.nativeDb; + } catch { + ctx.nativeDb = undefined; + if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; + } + } + + // Flush JS WAL pages once so Rust can see them, then no-op callbacks. + // Previously each feature called wal_checkpoint(TRUNCATE) individually + // (~68ms each × 3-4 features). One FULL checkpoint suffices. + if (ctx.nativeDb && ctx.engineOpts) { + ctx.db.pragma('wal_checkpoint(FULL)'); + ctx.engineOpts.suspendJsDb = () => {}; + ctx.engineOpts.resumeJsDb = () => {}; + } + + try { + const { runAnalyses: runAnalysesFn } = (await import('../../../../ast-analysis/engine.js')) as { + runAnalyses: ( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + opts: Record, + engineOpts?: Record, + ) => Promise<{ astMs?: number; complexityMs?: number; cfgMs?: number; dataflowMs?: number }>; + }; + const result = await runAnalysesFn( + ctx.db, + analysisFileSymbols, + ctx.rootDir, + ctx.opts as Record, + ctx.engineOpts as unknown as Record | undefined, + ); + timing.astMs = result.astMs ?? 0; + timing.complexityMs = result.complexityMs ?? 0; + timing.cfgMs = result.cfgMs ?? 0; + timing.dataflowMs = result.dataflowMs ?? 0; + } catch (err) { + warn(`Analysis phases failed after native build: ${toErrorMessage(err)}`); + } + + // Close nativeDb after analyses — TRUNCATE checkpoint flushes all Rust + // WAL writes so JS and external readers can see them. Runs once after + // all analysis features complete (not per-feature). + if (ctx.nativeDb) { + try { + ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch { + /* ignore checkpoint errors */ + } + try { + ctx.nativeDb.close(); + } catch { + /* ignore close errors */ + } + ctx.nativeDb = undefined; + if (ctx.engineOpts) { + ctx.engineOpts.nativeDb = undefined; + ctx.engineOpts.suspendJsDb = undefined; + ctx.engineOpts.resumeJsDb = undefined; + } + } + + return timing; +} + +/** Format timing result from native orchestrator phases + JS post-processing. */ +function formatNativeTimingResult( + p: Record, + structurePatchMs: number, + analysisTiming: { astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }, +): BuildResult { + return { + phases: { + setupMs: +(p.setupMs ?? 0).toFixed(1), + collectMs: +(p.collectMs ?? 0).toFixed(1), + detectMs: +(p.detectMs ?? 0).toFixed(1), + parseMs: +(p.parseMs ?? 0).toFixed(1), + insertMs: +(p.insertMs ?? 0).toFixed(1), + resolveMs: +(p.resolveMs ?? 0).toFixed(1), + edgesMs: +(p.edgesMs ?? 0).toFixed(1), + structureMs: +((p.structureMs ?? 0) + structurePatchMs).toFixed(1), + rolesMs: +(p.rolesMs ?? 0).toFixed(1), + astMs: +(analysisTiming.astMs ?? 0).toFixed(1), + complexityMs: +(analysisTiming.complexityMs ?? 0).toFixed(1), + cfgMs: +(analysisTiming.cfgMs ?? 0).toFixed(1), + dataflowMs: +(analysisTiming.dataflowMs ?? 0).toFixed(1), + finalizeMs: +(p.finalizeMs ?? 0).toFixed(1), + }, + }; +} + +/** + * Compute the WASM-only files present in the DB but missing from disk (#1073). + * + * Returns relative paths that: + * - appear in `existingNodes` or `existingHashes` (in DB), + * - are absent from `expected` (not on disk), + * - have an extension installed for WASM, AND + * - have an extension NOT covered by `nativeSupported` — Rust's + * `purge_changed_files` handles deletion for natively-supported extensions + * via its own `detect_removed_files`, so the caller must not double-purge. + * + * Extensions are lowercased before lookup to match the registry and Rust's + * `LanguageKind::from_extension` (which normalises case for the languages + * where both cases are conventional, e.g. R's `.r` / `.R`). + * + * DB paths are forced to forward slashes before comparison with `expected` + * (which is always normalised). The on-disk invariant is that DB rows are + * written with forward slashes, but a stale row written by older code on + * Windows could carry back-slashes — normalising here makes the comparison + * platform-safe and prevents false-positive purges of live rows. We replace + * `\\` explicitly (rather than calling `normalizePath`, which only touches + * `path.sep`) so the defence works when running on POSIX against a DB that + * was migrated from Windows. + * + * Exported for unit testing. + */ +export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { + const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; + const stale: string[] = []; + const seen = new Set(); + const consider = (rawRel: string): void => { + const rel = rawRel.replace(/\\/g, '/'); + if (expected.has(rel) || seen.has(rel)) return; + const ext = path.extname(rel).toLowerCase(); + if (nativeSupported.has(ext)) return; + if (!installedExts.has(ext)) return; + seen.add(rel); + // Push the ORIGINAL raw path (not the normalised form) so the eventual + // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` + // matches the actual stored row. The dedup `seen` set keeps the + // normalised form so a file written once with `\` and once with `/` + // is still treated as one entry — but the value the SQL sees has to + // be byte-identical to what's on disk in the DB. + stale.push(rawRel); + }; + for (const rel of existingNodes) consider(rel); + for (const rel of existingHashes) consider(rel); + return stale; +} + +/** + * Group relative paths by their lowercased extension. Shape matches the bucket + * type that `formatDropExtensionSummary` consumes, so callers can render a + * log-friendly per-extension summary without going through `classifyNativeDrops` + * when the reason is already known (e.g. the stale-purge path where every path + * is guaranteed `unsupported-by-native`). + */ +function groupByExtension(relPaths: Iterable): Map { + const buckets = new Map(); + for (const rel of relPaths) { + const ext = path.extname(rel).toLowerCase(); + let list = buckets.get(ext); + if (!list) { + list = []; + buckets.set(ext, list); + } + list.push(rel); + } + return buckets; +} + +/** + * Detect files the native orchestrator silently dropped. + * + * Walks the filesystem and compares against `nodes` + `file_hashes`. A file + * is "missing" if it's absent from EITHER table — both must be present for + * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. + * legacy DBs where `nodes` was populated but `file_hashes` was not). + * + * Restricted to files with an installed WASM grammar; extensions in + * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal + * installs) can't be parsed by either engine, so they're not a native + * regression — excluding them keeps the warn count in + * `backfillNativeDroppedFiles` meaningful. + * + * Also detects WASM-only files deleted from disk (#1073). Rust's + * `detect_removed_files` filter (#1070) skips files outside its supported + * extensions, so deletions of WASM-only languages don't reach the native + * purge path; the rest of the backfill only inserts rows, so without this + * step stale `nodes`/`file_hashes` rows would linger across incremental + * rebuilds until the next full rebuild. + * + * Cheap (no DB handoff, no parsing): used both to gate the backfill call + * and as its working set. NativeDbProxy supports `.prepare().all()`, so + * this works whether `ctx.db` is a proxy or a real better-sqlite3 + * connection — letting us skip the close-native / reopen-better-sqlite3 + * cost when there's nothing to backfill. + */ +function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { + const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); + const expected = new Set( + collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), + ); + + const existingNodeRows = ctx.db + .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") + .all() as Array<{ file: string }>; + const existingNodes = new Set(existingNodeRows.map((r) => r.file)); + + let existingHashes = new Set(); + try { + const existingHashRows = ctx.db + .prepare('SELECT DISTINCT file FROM file_hashes') + .all() as Array<{ file: string }>; + existingHashes = new Set(existingHashRows.map((r) => r.file)); + } catch (e) { + // file_hashes table may not exist on legacy DBs; treat as fully missing + // so the backfill writes rows on the upsert path below. + debug( + `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, + ); + } + + const installedExts = getInstalledWasmExtensions(); + const missingRel: string[] = []; + const missingAbs: string[] = []; + for (const rel of expected) { + if (existingNodes.has(rel) && existingHashes.has(rel)) continue; + const ext = path.extname(rel).toLowerCase(); + if (!installedExts.has(ext)) continue; + missingRel.push(rel); + missingAbs.push(path.join(ctx.rootDir, rel)); + } + + const staleRel = computeWasmOnlyStaleFiles({ + existingNodes, + existingHashes, + expected, + installedExts, + nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, + }); + + return { missingRel, missingAbs, staleRel }; +} + +/** + * Backfill files that the native orchestrator silently dropped during parse. + * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * + * Also purges stale rows for WASM-only files deleted from disk (#1073), which + * Rust's `detect_removed_files` filter (#1070) skips. + * + * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller + * can use the same scan for both gating and the actual backfill — avoiding + * a redundant fs walk when the orchestrator's signals already triggered. + */ +async function backfillNativeDroppedFiles( + ctx: PipelineContext, + gap: DroppedLanguageGap, +): Promise { + const { missingRel, missingAbs, staleRel } = gap; + if (missingAbs.length === 0 && staleRel.length === 0) return; + + // Now that we know there's work to do, hand off to better-sqlite3 (needed + // for the INSERT path below). + if (ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-parity-backfill'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } + + const dbConn = ctx.db as unknown as BetterSqlite3Database; + + // Purge WASM-only files that were deleted from disk (#1073). Rust's + // detect_removed_files skips them and the insert path below never visits + // them, so without this their rows would persist across rebuilds until the + // next full rebuild reset the DB. + if (staleRel.length > 0) { + // `computeWasmOnlyStaleFiles` guarantees every path here has an extension + // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would + // always bucket 100% into `unsupported-by-native`. Build the extension + // summary directly to avoid a redundant classification pass. + const staleByExt = groupByExtension(staleRel); + info( + `Detected ${staleRel.length} deleted WASM-only file(s) across ${staleByExt.size} extension(s) the native orchestrator skipped; purging stale rows:${formatDropExtensionSummary(staleByExt)}`, + ); + purgeFilesData(dbConn, staleRel); + } + + if (missingAbs.length === 0) return; + + // Classify drops so users see per-extension reasons instead of just a count + // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust + // extractor); `native-extractor-failure` indicates a real native bug since + // the language IS supported by the addon yet the file was dropped anyway. + const { byReason, totals } = classifyNativeDrops(missingRel); + if (totals['unsupported-by-native'] > 0) { + const buckets = byReason['unsupported-by-native']; + info( + `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) across ${buckets.size} extension(s) in languages without a Rust extractor; backfilling via WASM:${formatDropExtensionSummary(buckets)}`, + ); + } + if (totals['native-extractor-failure'] > 0) { + const buckets = byReason['native-extractor-failure']; + warn( + `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) across ${buckets.size} extension(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM:${formatDropExtensionSummary(buckets)}`, + ); + } + const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir); + + const rows: unknown[][] = []; + const exportKeys: unknown[][] = []; + for (const [relPath, symbols] of wasmResults) { + // File row — mirrors insertDefinitionsAndExports: qualified_name is null. + rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]); + for (const def of symbols.definitions ?? []) { + // Populate qualified_name/scope the same way the JS fallback does so + // downstream queries (cross-file references, "go to definition") find + // these symbols. + const dotIdx = def.name.lastIndexOf('.'); + const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null; + rows.push([ + def.name, + def.kind, + relPath, + def.line, + def.endLine ?? null, + null, + def.name, + scope, + def.visibility ?? null, + ]); + } + // Exports: insert the row (INSERT OR IGNORE — a matching definition row + // is a no-op) and queue a key for the second-pass exported=1 update, so + // queries filtering on exported=1 find backfilled symbols (#970). + for (const exp of symbols.exports ?? []) { + rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]); + exportKeys.push([exp.name, exp.kind, relPath, exp.line]); + } + } + const db = dbConn; + batchInsertNodes(db, rows); + + // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. + if (exportKeys.length > 0) { + const EXPORT_CHUNK = 500; + const exportStmtCache = new Map(); + for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) { + const end = Math.min(i + EXPORT_CHUNK, exportKeys.length); + const chunkSize = end - i; + let updateStmt = exportStmtCache.get(chunkSize); + if (!updateStmt) { + const conditions = Array.from( + { length: chunkSize }, + () => '(name = ? AND kind = ? AND file = ? AND line = ?)', + ).join(' OR '); + updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`); + exportStmtCache.set(chunkSize, updateStmt); + } + const vals: unknown[] = []; + for (let j = i; j < end; j++) { + const k = exportKeys[j] as unknown[]; + vals.push(k[0], k[1], k[2], k[3]); + } + updateStmt.run(...vals); + } + } + + // Persist file_hashes rows for every backfilled file. The Rust orchestrator + // only hashes files it parsed itself, so without this step files in + // optional-language extensions (e.g. .clj when no Rust extractor exists) + // would be missing from `file_hashes` — permanently breaking the JS-side + // fast-skip pre-flight (#1054), which rejects on `collected file missing + // from file_hashes` and forces every no-op rebuild back through the full + // ~2s native pipeline (#1068). + // + // Iterates `missingRel` (every collected file the Rust orchestrator + // dropped), not `wasmResults`, so files that produced zero symbols still + // get a row. + try { + const upsertHash = db.prepare( + 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', + ); + const writeHashes = db.transaction(() => { + for (let i = 0; i < missingRel.length; i++) { + const relPath = missingRel[i]; + const absPath = missingAbs[i]; + if (!relPath || !absPath) continue; + let code: string | null; + try { + code = readFileSafe(absPath); + } catch (e) { + debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`); + continue; + } + if (code === null) continue; + const stat = fileStat(absPath); + const mtime = stat ? stat.mtime : 0; + const size = stat ? stat.size : 0; + upsertHash.run(relPath, fileHash(code), mtime, size); + } + }); + writeHashes(); + } catch (e) { + debug( + `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`, + ); + } + + // Free WASM parse trees from the inline backfill path (#1058). + // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree + // backed by WASM linear memory) on every result, but these symbols are + // consumed locally for DB row construction and never added to + // `ctx.allSymbols`, so the finalize-stage `releaseWasmTrees` sweep never + // sees them. Without this, trees leak WASM memory until process exit — + // bounded per run but cumulative across in-process integration tests. + // Mirrors the cleanup discipline established for #931. + for (const [, symbols] of wasmResults) { + const tree = (symbols as { _tree?: { delete?: () => void } })._tree; + if (tree && typeof tree.delete === 'function') { + try { + tree.delete(); + } catch { + /* ignore cleanup errors */ + } + } + (symbols as { _tree?: unknown; _langId?: unknown })._tree = undefined; + (symbols as { _tree?: unknown; _langId?: unknown })._langId = undefined; + } +} + +/** + * Try the native build orchestrator. + * + * Returns: + * - `BuildResult` on success (caller should return it directly). + * - `'early-exit'` when the orchestrator detected no changes (caller should return undefined). + * - `undefined` when native is unavailable or skipped (caller should fall through to the JS pipeline). + * + * Encapsulates the orchestrator-selection strategy: open `NativeDatabase`, + * invoke `nativeDb.buildGraph()` (the Rust pipeline), and run post-native + * structure + analysis fallbacks. Lives in its own file to keep the Rust + * orchestrator entry point separated from the JS-side `buildGraph()` driver + * in `pipeline.ts`. + */ +export async function tryNativeOrchestrator( + ctx: PipelineContext, +): Promise { + const skipReason = shouldSkipNativeOrchestrator(ctx); + if (skipReason) { + debug(`Skipping native orchestrator: ${skipReason}`); + return undefined; + } + + // Open NativeDatabase on demand — deferred from setupPipeline to skip the + // ~60ms cost on no-op/early-exit builds. Close the better-sqlite3 connection + // first to avoid dual-connection WAL corruption. + if (!ctx.nativeDb && ctx.nativeAvailable) { + const native = loadNative(); + if (native?.NativeDatabase) { + try { + // Close better-sqlite3 before opening rusqlite to avoid WAL conflicts. + // Uses raw close() instead of closeDb() intentionally — the advisory lock + // is kept and transferred to the NativeDbProxy below, not released here. + ctx.db.close(); + acquireAdvisoryLock(ctx.dbPath); + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + ctx.nativeDb.initSchema(); + // Replace ctx.db with a NativeDbProxy so post-native JS fallback + // (structure, analysis) can use it without reopening better-sqlite3. + const proxy = new NativeDbProxy(ctx.nativeDb); + proxy.__lockPath = `${ctx.dbPath}.lock`; + ctx.db = proxy as unknown as typeof ctx.db; + ctx.nativeFirstProxy = true; + } catch (err) { + warn(`NativeDatabase setup failed, falling back to JS: ${toErrorMessage(err)}`); + try { + ctx.nativeDb?.close(); + } catch (e) { + debug(`tryNativeOrchestrator: close failed during fallback: ${toErrorMessage(e)}`); + } + ctx.nativeDb = undefined; + ctx.nativeFirstProxy = false; // defensive: reset in case future refactors move the assignment above throwing lines + releaseAdvisoryLock(`${ctx.dbPath}.lock`); + // Reopen better-sqlite3 for JS pipeline fallback + ctx.db = openDb(ctx.dbPath); + } + } + } + + if (!ctx.nativeDb?.buildGraph) return undefined; + + const resultJson = ctx.nativeDb.buildGraph( + ctx.rootDir, + JSON.stringify(ctx.config), + JSON.stringify(ctx.aliases), + JSON.stringify(ctx.opts), + ); + const result = JSON.parse(resultJson) as NativeOrchestratorResult; + + if (result.earlyExit) { + info('No changes detected'); + // Even on no-op rebuilds, dropped-language files added since the last + // full build are still missing from `nodes`/`file_hashes` (#1083), and + // WASM-only files deleted from disk leave stale rows behind (#1073). + // The orchestrator's file_collector skipped them, so its earlyExit + // doesn't imply DB consistency. Run the gap repair before returning. + const gap = detectDroppedLanguageGap(ctx); + if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); + } + closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); + return 'early-exit'; + } + + // Log incremental status to match JS pipeline output + const changed = result.changedCount ?? 0; + const removed = result.removedCount ?? 0; + if (!result.isFullBuild && (changed > 0 || removed > 0)) { + info(`Incremental: ${changed} changed, ${removed} removed`); + } + + const p = result.phases; + + // Sync build_meta so JS-side version/engine checks work on next build. + // Use the binary's CARGO_PKG_VERSION (ctx.nativeBinaryVersion), not the + // platform package.json version (ctx.engineVersion). The Rust side's + // check_version_mismatch compares against CARGO_PKG_VERSION; writing + // the package.json value would create a permanent mismatch whenever + // the binary and platform package.json diverge — e.g., CI hot-swap + // via ci-install-native.mjs (#1066) — forcing every subsequent build + // to be a full rebuild. + // + // When the native addon doesn't expose engineVersion() (older addon), + // fall back to CODEGRAPH_VERSION — same fallback used by both + // checkEngineSchemaMismatch (read path) and persistBuildMetadata + // (the JS-pipeline write path in finalize.ts). Using ctx.engineVersion + // here would re-introduce the asymmetry this PR fixes for that case. + const nativeVersionForMeta = ctx.nativeBinaryVersion || CODEGRAPH_VERSION; + setBuildMeta(ctx.db, { + engine: ctx.engineName, + engine_version: nativeVersionForMeta, + codegraph_version: nativeVersionForMeta, + schema_version: String(ctx.schemaVersion), + built_at: new Date().toISOString(), + }); + + info( + `Native build orchestrator completed: ${result.nodeCount ?? 0} nodes, ${result.edgeCount ?? 0} edges, ${result.fileCount ?? 0} files`, + ); + + // ── Post-native structure + analysis ────────────────────────────── + let analysisTiming = { + astMs: +(p.astMs ?? 0), + complexityMs: +(p.complexityMs ?? 0), + cfgMs: +(p.cfgMs ?? 0), + dataflowMs: +(p.dataflowMs ?? 0), + }; + let structurePatchMs = 0; + // Skip JS structure when the Rust pipeline's small-incremental fast path + // already handled it. For full builds and large incrementals where Rust + // skipped structure, we must run the JS fallback. + const needsStructure = !result.structureHandled; + // When the Rust addon doesn't include analysis persistence (older addon + // version or analysis failed), fall back to JS-side analysis. + const needsAnalysisFallback = + !result.analysisComplete && + (ctx.opts.ast !== false || + ctx.opts.complexity !== false || + ctx.opts.cfg !== false || + ctx.opts.dataflow !== false); + + if (needsStructure || needsAnalysisFallback) { + // When analysis fallback is needed, handoff to better-sqlite3 — the + // analysis engine uses the suspend/resume WAL pattern that requires a + // real better-sqlite3 connection, not the NativeDbProxy. + if (needsAnalysisFallback && ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-analysis-fallback'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } else if (!ctx.nativeFirstProxy && !handoffWalAfterNativeBuild(ctx)) { + // DB reopen failed — return partial result + return formatNativeTimingResult(p, 0, analysisTiming); + } + + const fileSymbols = reconstructFileSymbolsFromDb(ctx); + + if (needsStructure) { + structurePatchMs = await runPostNativeStructure( + ctx, + fileSymbols, + !!result.isFullBuild, + result.changedFiles, + ); + } + + if (needsAnalysisFallback) { + analysisTiming = await runPostNativeAnalysis(ctx, fileSymbols, result.changedFiles); + } + } + + // Engine parity: the native orchestrator silently drops files whose + // Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on + // stale native binaries). WASM handles those — backfill via WASM so both + // engines process the same file set (#967). + // + // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for + // both gating and the backfill itself. On dirty incrementals/full builds + // the orchestrator signals trigger backfill, so the walk happens once + // (instead of redundantly inside backfill). On quiet incrementals we + // still pay the walk so we can detect brand-new files in dropped-language + // extensions — a gap that the orchestrator's `detect_removed_files` + // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap + // because the expensive part (WASM re-parse of the missing set) is + // gated below. + const removedCount = result.removedCount ?? 0; + const changedCount = result.changedCount ?? 0; + const gap = detectDroppedLanguageGap(ctx); + if ( + result.isFullBuild || + removedCount > 0 || + changedCount > 0 || + gap.missingAbs.length > 0 || + gap.staleRel.length > 0 + ) { + await backfillNativeDroppedFiles(ctx, gap); + } + + closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); + return formatNativeTimingResult(p, structurePatchMs, analysisTiming); +} diff --git a/src/domain/graph/cycles.ts b/src/domain/graph/cycles.ts index 4ccc872f2..bb4d61168 100644 --- a/src/domain/graph/cycles.ts +++ b/src/domain/graph/cycles.ts @@ -3,6 +3,45 @@ import { loadNative } from '../../infrastructure/native.js'; import { isTestFile } from '../../infrastructure/test-filter.js'; import type { BetterSqlite3Database } from '../../types.js'; +type Edge = { source: string; target: string }; +type DbEdge = { source_id: number; target_id: number }; + +/** + * Build a label-based edge list from DB rows, filtering to known nodes and + * deduplicating. Self-loops are skipped (Tarjan treats them as trivial SCCs). + */ +function buildLabelEdges(dbEdges: DbEdge[], idToLabel: Map): Edge[] { + const edges: Edge[] = []; + const seen = new Set(); + for (const e of dbEdges) { + if (e.source_id === e.target_id) continue; + const src = idToLabel.get(e.source_id); + const tgt = idToLabel.get(e.target_id); + if (src === undefined || tgt === undefined) continue; + const key = `${src}\0${tgt}`; + if (seen.has(key)) continue; + seen.add(key); + edges.push({ source: src, target: tgt }); + } + return edges; +} + +function buildFileLevelEdges(db: BetterSqlite3Database, noTests: boolean): Edge[] { + let nodes = getFileNodesAll(db); + if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); + const idToLabel = new Map(); + for (const n of nodes) idToLabel.set(n.id, n.file); + return buildLabelEdges(getImportEdges(db), idToLabel); +} + +function buildCallableEdges(db: BetterSqlite3Database, noTests: boolean): Edge[] { + let nodes = getCallableNodes(db); + if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); + const idToLabel = new Map(); + for (const n of nodes) idToLabel.set(n.id, `${n.name}|${n.file}`); + return buildLabelEdges(getCallEdges(db), idToLabel); +} + /** * Find cycles using Tarjan's SCC algorithm. * @@ -16,66 +55,20 @@ export function findCycles( const fileLevel = opts.fileLevel !== false; const noTests = opts.noTests || false; - const edges: Array<{ source: string; target: string }> = []; - const seen = new Set(); - - if (fileLevel) { - let nodes = getFileNodesAll(db); - if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); - const nodeIds = new Set(); - const idToFile = new Map(); - for (const n of nodes) { - nodeIds.add(n.id); - idToFile.set(n.id, n.file); - } - for (const e of getImportEdges(db)) { - if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue; - if (e.source_id === e.target_id) continue; - const src = idToFile.get(e.source_id)!; - const tgt = idToFile.get(e.target_id)!; - const key = `${src}\0${tgt}`; - if (seen.has(key)) continue; - seen.add(key); - edges.push({ source: src, target: tgt }); - } - } else { - let nodes = getCallableNodes(db); - if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); - const nodeIds = new Set(); - const idToLabel = new Map(); - for (const n of nodes) { - nodeIds.add(n.id); - idToLabel.set(n.id, `${n.name}|${n.file}`); - } - for (const e of getCallEdges(db)) { - if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue; - if (e.source_id === e.target_id) continue; - const src = idToLabel.get(e.source_id)!; - const tgt = idToLabel.get(e.target_id)!; - const key = `${src}\0${tgt}`; - if (seen.has(key)) continue; - seen.add(key); - edges.push({ source: src, target: tgt }); - } - } + const edges = fileLevel ? buildFileLevelEdges(db, noTests) : buildCallableEdges(db, noTests); const native = loadNative(); if (native) { return native.detectCycles(edges) as string[][]; } - return tarjanFromEdges(edges); } -export function findCyclesJS(edges: Array<{ source: string; target: string }>): string[][] { +export function findCyclesJS(edges: Edge[]): string[][] { return tarjanFromEdges(edges); } -/** - * Run Tarjan's SCC on a flat edge list. Returns SCCs with length > 1 (cycles). - * Uses a simple adjacency-list Map instead of a full CodeGraph. - */ -function tarjanFromEdges(edges: Array<{ source: string; target: string }>): string[][] { +function buildAdjacency(edges: Edge[]): { adj: Map; allNodes: Set } { const adj = new Map(); const allNodes = new Set(); for (const { source, target } of edges) { @@ -88,6 +81,15 @@ function tarjanFromEdges(edges: Array<{ source: string; target: string }>): stri } list.push(target); } + return { adj, allNodes }; +} + +/** + * Run Tarjan's SCC on a flat edge list. Returns SCCs with length > 1 (cycles). + * Uses a simple adjacency-list Map instead of a full CodeGraph. + */ +function tarjanFromEdges(edges: Edge[]): string[][] { + const { adj, allNodes } = buildAdjacency(edges); let index = 0; const stack: string[] = []; diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index 900e33546..d20c7dab9 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -91,62 +91,69 @@ function trySteal(lockPath: string): AcquiredLock | null { return { fd, nonce }; } -function acquireJournalLock(lockPath: string): AcquiredLock { - const start = Date.now(); - for (;;) { - const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; +/** + * Try to create the lockfile fresh via `wx`. Returns the acquired lock on + * success, `null` if another holder exists, or throws on unexpected errors. + * + * If the stamp write fails (ENOSPC, I/O error) we release the empty file — + * leaving it would look stale to concurrent waiters and admit double-acquire. + */ +function tryFreshAcquire(lockPath: string): AcquiredLock | null { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; + let fd: number; + try { + fd = fs.openSync(lockPath, 'wx'); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === 'EEXIST') return null; + throw e; + } + try { + fs.writeSync(fd, `${process.pid}\n${nonce}\n`); + } catch { try { - const fd = fs.openSync(lockPath, 'wx'); - try { - fs.writeSync(fd, `${process.pid}\n${nonce}\n`); - } catch { - // Stamp write failed (ENOSPC, I/O error). An empty lockfile would - // look stale to concurrent waiters (Number('') === 0, isPidAlive(0) - // returns false), so they'd steal our live lock. Release and retry. - try { - fs.closeSync(fd); - } catch { - /* ignore */ - } - try { - fs.unlinkSync(lockPath); - } catch { - /* ignore */ - } - if (Date.now() - start > LOCK_TIMEOUT_MS) { - throw new Error( - `Failed to acquire journal lock at ${lockPath} within ${LOCK_TIMEOUT_MS}ms`, - ); - } - sleepSync(LOCK_RETRY_MS); - continue; - } - return { fd, nonce }; - } catch (e) { - if ((e as NodeJS.ErrnoException).code !== 'EEXIST') throw e; + fs.closeSync(fd); + } catch { + /* ignore */ } - - let holderAlive = true; try { - const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); - holderAlive = isPidAlive(Number(pidContent)); + fs.unlinkSync(lockPath); } catch { - /* unreadable — fall through to age check */ + /* ignore */ } + return null; + } + return { fd, nonce }; +} - let shouldSteal = !holderAlive; - if (holderAlive) { - try { - const stat = fs.statSync(lockPath); - if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { - shouldSteal = true; - } - } catch { - /* stat failed — keep retrying */ - } - } +/** + * Decide whether the current lock holder is stale and should be stolen. + * Returns true if the PID is dead, or if the lockfile mtime exceeds the + * staleness threshold. + */ +function isLockStale(lockPath: string): boolean { + let holderAlive = true; + try { + const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); + holderAlive = isPidAlive(Number(pidContent)); + } catch { + /* unreadable — fall through to age check */ + } + if (!holderAlive) return true; + try { + const stat = fs.statSync(lockPath); + return Date.now() - stat.mtimeMs > LOCK_STALE_MS; + } catch { + return false; + } +} - if (shouldSteal) { +function acquireJournalLock(lockPath: string): AcquiredLock { + const start = Date.now(); + for (;;) { + const fresh = tryFreshAcquire(lockPath); + if (fresh) return fresh; + + if (isLockStale(lockPath)) { const stolen = trySteal(lockPath); if (stolen) return stolen; // Steal failed or lost the race — fall through to timeout check & retry. @@ -227,27 +234,20 @@ interface JournalResult { removed?: string[]; } -export function readJournal(rootDir: string): JournalResult { - const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); - let content: string; - try { - content = fs.readFileSync(journalPath, 'utf-8'); - } catch { - return { valid: false }; - } - - const lines = content.split('\n'); - if (lines.length === 0 || !lines[0]!.startsWith(HEADER_PREFIX)) { +function parseJournalHeader(firstLine: string | undefined): number | null { + if (!firstLine || !firstLine.startsWith(HEADER_PREFIX)) { debug('Journal has malformed or missing header'); - return { valid: false }; + return null; } - - const timestamp = Number(lines[0]!.slice(HEADER_PREFIX.length).trim()); + const timestamp = Number(firstLine.slice(HEADER_PREFIX.length).trim()); if (!Number.isFinite(timestamp) || timestamp <= 0) { debug('Journal has invalid timestamp'); - return { valid: false }; + return null; } + return timestamp; +} +function parseJournalBody(lines: string[]): { changed: string[]; removed: string[] } { const changed: string[] = []; const removed: string[] = []; const seenChanged = new Set(); @@ -263,14 +263,29 @@ export function readJournal(rootDir: string): JournalResult { seenRemoved.add(filePath); removed.push(filePath); } - } else { - if (!seenChanged.has(line)) { - seenChanged.add(line); - changed.push(line); - } + } else if (!seenChanged.has(line)) { + seenChanged.add(line); + changed.push(line); } } + return { changed, removed }; +} + +export function readJournal(rootDir: string): JournalResult { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); + let content: string; + try { + content = fs.readFileSync(journalPath, 'utf-8'); + } catch { + return { valid: false }; + } + + const lines = content.split('\n'); + const timestamp = parseJournalHeader(lines[0]); + if (timestamp === null) return { valid: false }; + + const { changed, removed } = parseJournalBody(lines); return { valid: true, timestamp, changed, removed }; } diff --git a/src/domain/parser.ts b/src/domain/parser.ts index acf4e9c04..38ebc035a 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -571,25 +571,36 @@ export function classifyNativeDrops(relPaths: Iterable): NativeDropClass } /** - * Render `{ ext → paths[] }` as `ext (n: sample.ext, ...)` slices for log lines. - * Caps at 3 sample paths per extension and 6 extensions total to keep warnings - * readable when many languages are dropped at once. Extensions are sorted by - * descending file count so the loudest offender shows up first; ties keep - * insertion order. Pure function — safe to unit-test independently. + * Render `{ ext → paths[] }` as a multi-line tabular breakdown for log lines. + * Each extension occupies its own line so a long warning scans like a table + * instead of a wall of semicolon-separated slices. Caps at 3 sample paths per + * extension and 6 extensions total to keep output bounded when many languages + * are dropped at once. Extensions are sorted by descending file count so the + * loudest offender shows up first; ties keep insertion order. + * + * Returns the empty string for empty input, and otherwise a string that + * begins with `\n` so callers can append it directly after the header line + * (`"Backfilling via WASM:" + formatDropExtensionSummary(...)`). + * + * Pure function — safe to unit-test independently. */ export function formatDropExtensionSummary(buckets: Map): string { const MAX_EXTS = 6; const MAX_SAMPLES = 3; const entries = Array.from(buckets.entries()).sort((a, b) => b[1].length - a[1].length); - const shown = entries.slice(0, MAX_EXTS).map(([ext, paths]) => { + if (entries.length === 0) return ''; + const shown = entries.slice(0, MAX_EXTS); + const extWidth = Math.max(...shown.map(([ext]) => ext.length)); + const countWidth = Math.max(...shown.map(([, paths]) => String(paths.length).length)); + const lines = shown.map(([ext, paths]) => { const sample = paths.slice(0, MAX_SAMPLES).join(', '); - const more = paths.length > MAX_SAMPLES ? `, +${paths.length - MAX_SAMPLES} more` : ''; - return `${ext} (${paths.length}: ${sample}${more})`; + const more = paths.length > MAX_SAMPLES ? ` (+${paths.length - MAX_SAMPLES} more)` : ''; + return ` ${ext.padEnd(extWidth)} ${String(paths.length).padStart(countWidth)} ${sample}${more}`; }); if (entries.length > MAX_EXTS) { - shown.push(`+${entries.length - MAX_EXTS} more extension(s)`); + lines.push(` (+${entries.length - MAX_EXTS} more extension(s))`); } - return shown.join('; '); + return `\n${lines.join('\n')}`; } // ── Unified API ────────────────────────────────────────────────────────────── diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index 9c380870b..0eb89c854 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -89,25 +89,36 @@ describe('formatDropExtensionSummary', () => { expect(formatDropExtensionSummary(new Map())).toBe(''); }); - it('lists every extension when under the cap', () => { + it('renders one indented row per extension prefixed with a leading newline', () => { const buckets = new Map([ ['.ts', ['a.ts', 'b.ts']], ['.py', ['c.py']], ]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (2: a.ts, b.ts); .py (1: c.py)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 2 a.ts, b.ts\n .py 1 c.py'); }); it('caps samples per extension at 3 and renders +N more', () => { const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts', 'd.ts', 'e.ts']]]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (5: a.ts, b.ts, c.ts, +2 more)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 5 a.ts, b.ts, c.ts (+2 more)'); }); it('shows exactly MAX_SAMPLES samples without a +N suffix when count equals the cap', () => { const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts']]]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (3: a.ts, b.ts, c.ts)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 3 a.ts, b.ts, c.ts'); }); - it('caps extensions at 6 and renders +N more extension(s)', () => { + it('right-pads the extension column and right-aligns the count column for tabular layout', () => { + const buckets = new Map([ + ['.kt', ['a.kt']], + ['.tsx', new Array(100).fill('x.tsx')], // 100 files — sets wider count column + ]); + const out = formatDropExtensionSummary(buckets); + // `.tsx` (4 chars) sets the ext width; `.kt` is padded to 4 chars. + // 100 (3 chars) sets the count width; 1 is right-aligned to 3 chars. + expect(out).toBe('\n .tsx 100 x.tsx, x.tsx, x.tsx (+97 more)\n .kt 1 a.kt'); + }); + + it('caps extensions at 6 and renders +N more extension(s) on its own row', () => { // 8 extensions, all with 1 file — sorted by count is a stable tie so insertion // order wins, and the first 6 are shown. const buckets = new Map([ @@ -121,12 +132,12 @@ describe('formatDropExtensionSummary', () => { ['.h', ['1.h']], ]); const out = formatDropExtensionSummary(buckets); - expect(out.endsWith('; +2 more extension(s)')).toBe(true); + expect(out.endsWith('\n (+2 more extension(s))')).toBe(true); // First 6 extensions are present, the last 2 (.g, .h) are not. - expect(out).toContain('.a (1: 1.a)'); - expect(out).toContain('.f (1: 1.f)'); - expect(out).not.toContain('.g ('); - expect(out).not.toContain('.h ('); + expect(out).toContain('\n .a 1 1.a'); + expect(out).toContain('\n .f 1 1.f'); + expect(out).not.toContain(' .g '); + expect(out).not.toContain(' .h '); }); it('sorts by descending file count so the loudest offender is first', () => {