From 08d254f0392acf7d95fa5790c07679feebbc69ba Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 25 May 2026 22:59:24 -0600 Subject: [PATCH 01/27] refactor(parity): render orchestrator-drop summary as a per-extension table The native-orchestrator drop warning lived in a single wall-of-text WARN line that grew unreadable when 30+ extensions were dropped at once (easy to trigger via journal-vs-fresh-build collisions). Make the per-extension breakdown scan like a table: header line keeps the count and now also reports the extension total; each extension occupies its own indented row with a right-aligned count column. Co-Authored-By: Claude Opus 4.7 --- src/domain/graph/builder/pipeline.ts | 8 +++-- src/domain/parser.ts | 31 +++++++++++++------ .../native-drop-classification.test.ts | 31 +++++++++++++------ 3 files changed, 47 insertions(+), 23 deletions(-) diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index b18d3c473..4dce8aa3d 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -1018,7 +1018,7 @@ async function backfillNativeDroppedFiles( // summary directly to avoid a redundant classification pass. const staleByExt = groupByExtension(staleRel); info( - `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`, + `Detected ${staleRel.length} deleted WASM-only file(s) across ${staleByExt.size} extension(s) the native orchestrator skipped; purging stale rows:${formatDropExtensionSummary(staleByExt)}`, ); purgeFilesData(dbConn, staleRel); } @@ -1031,13 +1031,15 @@ async function backfillNativeDroppedFiles( // the language IS supported by the addon yet the file was dropped anyway. 
const { byReason, totals } = classifyNativeDrops(missingRel); if (totals['unsupported-by-native'] > 0) { + const buckets = byReason['unsupported-by-native']; info( - `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) in languages without a Rust extractor; backfilling via WASM: ${formatDropExtensionSummary(byReason['unsupported-by-native'])}`, + `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) across ${buckets.size} extension(s) in languages without a Rust extractor; backfilling via WASM:${formatDropExtensionSummary(buckets)}`, ); } if (totals['native-extractor-failure'] > 0) { + const buckets = byReason['native-extractor-failure']; warn( - `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM: ${formatDropExtensionSummary(byReason['native-extractor-failure'])}`, + `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) across ${buckets.size} extension(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM:${formatDropExtensionSummary(buckets)}`, ); } const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir); diff --git a/src/domain/parser.ts b/src/domain/parser.ts index b4aaa366b..bb53192c9 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -539,25 +539,36 @@ export function classifyNativeDrops(relPaths: Iterable): NativeDropClass } /** - * Render `{ ext → paths[] }` as `ext (n: sample.ext, ...)` slices for log lines. - * Caps at 3 sample paths per extension and 6 extensions total to keep warnings - * readable when many languages are dropped at once. Extensions are sorted by - * descending file count so the loudest offender shows up first; ties keep - * insertion order. Pure function — safe to unit-test independently. + * Render `{ ext → paths[] }` as a multi-line tabular breakdown for log lines. 
+ * Each extension occupies its own line so a long warning scans like a table + * instead of a wall of semicolon-separated slices. Caps at 3 sample paths per + * extension and 6 extensions total to keep output bounded when many languages + * are dropped at once. Extensions are sorted by descending file count so the + * loudest offender shows up first; ties keep insertion order. + * + * Returns the empty string for empty input, and otherwise a string that + * begins with `\n` so callers can append it directly after the header line + * (`"Backfilling via WASM:" + formatDropExtensionSummary(...)`). + * + * Pure function — safe to unit-test independently. */ export function formatDropExtensionSummary(buckets: Map): string { const MAX_EXTS = 6; const MAX_SAMPLES = 3; const entries = Array.from(buckets.entries()).sort((a, b) => b[1].length - a[1].length); - const shown = entries.slice(0, MAX_EXTS).map(([ext, paths]) => { + if (entries.length === 0) return ''; + const shown = entries.slice(0, MAX_EXTS); + const extWidth = Math.max(...shown.map(([ext]) => ext.length)); + const countWidth = Math.max(...shown.map(([, paths]) => String(paths.length).length)); + const lines = shown.map(([ext, paths]) => { const sample = paths.slice(0, MAX_SAMPLES).join(', '); - const more = paths.length > MAX_SAMPLES ? `, +${paths.length - MAX_SAMPLES} more` : ''; - return `${ext} (${paths.length}: ${sample}${more})`; + const more = paths.length > MAX_SAMPLES ? 
` (+${paths.length - MAX_SAMPLES} more)` : ''; + return ` ${ext.padEnd(extWidth)} ${String(paths.length).padStart(countWidth)} ${sample}${more}`; }); if (entries.length > MAX_EXTS) { - shown.push(`+${entries.length - MAX_EXTS} more extension(s)`); + lines.push(` (+${entries.length - MAX_EXTS} more extension(s))`); } - return shown.join('; '); + return `\n${lines.join('\n')}`; } // ── Unified API ────────────────────────────────────────────────────────────── diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index 9c380870b..d617d4757 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -89,25 +89,36 @@ describe('formatDropExtensionSummary', () => { expect(formatDropExtensionSummary(new Map())).toBe(''); }); - it('lists every extension when under the cap', () => { + it('renders one indented row per extension prefixed with a leading newline', () => { const buckets = new Map([ ['.ts', ['a.ts', 'b.ts']], ['.py', ['c.py']], ]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (2: a.ts, b.ts); .py (1: c.py)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 2 a.ts, b.ts\n .py 1 c.py'); }); it('caps samples per extension at 3 and renders +N more', () => { const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts', 'd.ts', 'e.ts']]]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (5: a.ts, b.ts, c.ts, +2 more)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 5 a.ts, b.ts, c.ts (+2 more)'); }); it('shows exactly MAX_SAMPLES samples without a +N suffix when count equals the cap', () => { const buckets = new Map([['.ts', ['a.ts', 'b.ts', 'c.ts']]]); - expect(formatDropExtensionSummary(buckets)).toBe('.ts (3: a.ts, b.ts, c.ts)'); + expect(formatDropExtensionSummary(buckets)).toBe('\n .ts 3 a.ts, b.ts, c.ts'); }); - it('caps extensions at 6 and renders +N more extension(s)', () => { + it('right-pads the extension 
column and right-aligns the count column for tabular layout', () => { + const buckets = new Map([ + ['.kt', ['a.kt']], // 100 files later — wider count column + ['.tsx', new Array(100).fill('x.tsx')], + ]); + const out = formatDropExtensionSummary(buckets); + // `.tsx` (4 chars) sets the ext width; `.kt` is padded to 4 chars. + // 100 (3 chars) sets the count width; 1 is right-aligned to 3 chars. + expect(out).toBe('\n .tsx 100 x.tsx, x.tsx, x.tsx (+97 more)\n .kt 1 a.kt'); + }); + + it('caps extensions at 6 and renders +N more extension(s) on its own row', () => { // 8 extensions, all with 1 file — sorted by count is a stable tie so insertion // order wins, and the first 6 are shown. const buckets = new Map([ @@ -121,12 +132,12 @@ describe('formatDropExtensionSummary', () => { ['.h', ['1.h']], ]); const out = formatDropExtensionSummary(buckets); - expect(out.endsWith('; +2 more extension(s)')).toBe(true); + expect(out.endsWith('\n (+2 more extension(s))')).toBe(true); // First 6 extensions are present, the last 2 (.g, .h) are not. 
- expect(out).toContain('.a (1: 1.a)'); - expect(out).toContain('.f (1: 1.f)'); - expect(out).not.toContain('.g ('); - expect(out).not.toContain('.h ('); + expect(out).toContain('\n .a 1 1.a'); + expect(out).toContain('\n .f 1 1.f'); + expect(out).not.toContain(' .g '); + expect(out).not.toContain(' .h '); }); it('sorts by descending file count so the loudest offender is first', () => { From 9c8be552935310520d478a0adeefad801c940bf9 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 00:37:34 -0600 Subject: [PATCH 02/27] refactor(extractors): extend shared helpers for identifier and symbol collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds shared utilities to src/extractors/helpers.ts in preparation for adoption across language extractors (phase 2): - nodeStartLine: companion to nodeEndLine for the ~108 hand-rolled startPosition.row + 1 literals scattered across extractors - findFirstChildOfTypes: find first child matching any of N types (useful for grammar variants like string vs string_literal) - iterChildren / PUNCTUATION_TOKENS: generator-based child iteration with punctuation skipping, used in elixir/gleam destructuring walks - pushCall / pushImport: centralise Call/Import construction so line derivation stays consistent across extractors - extractSimpleParameters / resolveParamName: uniform parameter extraction with optional type-map sink — collapses boilerplate in the ~16 per-language extractParams helpers Phase 1 of the TS extractor refactor plan (sync.json cluster 1). Additive only — no consumer adoption yet; existing helpers and extractor behaviour unchanged. Consumers updated in phase 2. docs check acknowledged: internal refactor, no doc updates needed. 
--- src/extractors/helpers.ts | 206 +++++++++++++++++++++++++++++++++++++- 1 file changed, 205 insertions(+), 1 deletion(-) diff --git a/src/extractors/helpers.ts b/src/extractors/helpers.ts index 6a3e129d4..62edc22ea 100644 --- a/src/extractors/helpers.ts +++ b/src/extractors/helpers.ts @@ -1,4 +1,11 @@ -import type { SubDeclaration, TreeSitterNode, TypeMapEntry } from '../types.js'; +import type { + Call, + ExtractorOutput, + Import, + SubDeclaration, + TreeSitterNode, + TypeMapEntry, +} from '../types.js'; /** * Maximum recursion depth for tree-sitter AST walkers. @@ -6,6 +13,11 @@ import type { SubDeclaration, TreeSitterNode, TypeMapEntry } from '../types.js'; */ export const MAX_WALK_DEPTH = 200; +/** Convert a tree-sitter node's start row to a 1-based source line. */ +export function nodeStartLine(node: TreeSitterNode): number { + return node.startPosition.row + 1; +} + export function nodeEndLine(node: TreeSitterNode): number { return node.endPosition.row + 1; } @@ -18,6 +30,56 @@ export function findChild(node: TreeSitterNode, type: string): TreeSitterNode | return null; } +/** + * Find the first child whose type is in `types`. Useful when several grammar + * variants name the same conceptual node differently (e.g. `string` vs + * `string_literal`). Returns the first match in document order, or null. + */ +export function findFirstChildOfTypes( + node: TreeSitterNode, + types: readonly string[], +): TreeSitterNode | null { + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (child && types.includes(child.type)) return child; + } + return null; +} + +/** + * Iterate the direct children of `node` in document order, skipping nulls and + * tokens whose type appears in `skipTypes`. Mirrors the common + * `for (let i = 0; i < node.childCount; i++) { const c = node.child(i); if (...) continue; ... }` + * idiom while letting callers filter out grammar punctuation (`,`, `(`, `{`, etc.). 
+ */ +export function* iterChildren( + node: TreeSitterNode, + skipTypes: ReadonlySet = EMPTY_SKIP_SET, +): Generator { + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (!child) continue; + if (skipTypes.has(child.type)) continue; + yield child; + } +} + +const EMPTY_SKIP_SET: ReadonlySet = new Set(); + +/** Common punctuation tokens — handy as a `skipTypes` set for `iterChildren`. */ +export const PUNCTUATION_TOKENS: ReadonlySet = new Set([ + ',', + ';', + '(', + ')', + '[', + ']', + '{', + '}', + ':', + '.', +]); + /** * Merge a type-map entry, keeping the higher-confidence one. * Shared across all language extractors that build type maps for call resolution. @@ -197,3 +259,145 @@ export function extractModifierVisibility( } return undefined; } + +// ── Output-push helpers ──────────────────────────────────────────────────── +// +// Most extractors finish with `ctx.calls.push({ name, line: node.startPosition.row + 1 })` +// or `ctx.imports.push({ source, names, line: node.startPosition.row + 1 })`. +// Centralising the construction keeps `line` derivation consistent and removes +// the ~108 hand-rolled `startPosition.row + 1` literals scattered across +// language extractors. + +/** + * Append a `Call` to the extractor output. `line` defaults to the start line of + * `node`; pass `extra` for `receiver` / `dynamic` flags. + */ +export function pushCall( + ctx: ExtractorOutput, + node: TreeSitterNode, + name: string, + extra: { receiver?: string; dynamic?: boolean } = {}, +): void { + if (!name) return; + const call: Call = { name, line: nodeStartLine(node) }; + if (extra.receiver !== undefined) call.receiver = extra.receiver; + if (extra.dynamic !== undefined) call.dynamic = extra.dynamic; + ctx.calls.push(call); +} + +/** + * Append an `Import` to the extractor output. `line` defaults to the start + * line of `node`. 
If `names` is empty, the source basename (split on `/`) is + * used as a single-name fallback — matching the convention in gleam, julia, + * and similar module-path imports. + */ +export function pushImport( + ctx: ExtractorOutput, + node: TreeSitterNode, + source: string, + names: string[], + flags: Partial> = {}, +): void { + if (!source) return; + const resolved = names.length > 0 ? names : [lastPathSegment(source, '/') || source]; + const entry: Import = { source, names: resolved, line: nodeStartLine(node) }; + Object.assign(entry, flags); + ctx.imports.push(entry); +} + +// ── Parameter extraction ─────────────────────────────────────────────────── + +/** + * Options for {@link extractSimpleParameters}. + */ +export interface ExtractParametersOptions { + /** Tree-sitter types that mark a single parameter node (e.g. `formal_parameter`). */ + paramTypes: readonly string[]; + /** + * Field name on each parameter that holds the bound identifier. Defaults to + * `'name'`. Pass `null` to use the parameter node itself when its type is in + * `paramTypes` and it has no `name` field (e.g. R's bare `identifier`). + */ + nameField?: string | null; + /** + * If true, when `nameField` lookup fails fall back to the first `identifier` + * child of the parameter. Useful for Gleam / Solidity-style grammars. + */ + fallbackToIdentifier?: boolean; + /** + * Optional type-map sink. When provided, the parameter's `type` field text + * (if present) is recorded with the given confidence. + */ + typeMap?: Map; + /** Confidence used when writing into `typeMap`. Defaults to `0.9`. */ + typeMapConfidence?: number; + /** + * Optional callback to derive the type text from the parameter's `type` + * field node. Defaults to `node.text`. Use this for languages where the + * `type` field is wrapped (e.g. Java `generic_type` → first child). + */ + resolveType?: (typeNode: TreeSitterNode) => string | undefined; +} + +/** + * Extract parameters from a parameter-list node using a uniform pattern. 
+ * + * This collapses the boilerplate in `extract*Params` helpers across + * Java/Julia/Gleam/Solidity/R/etc. — each one walks the parameter list, + * matches a parameter node type, reads the `name` field, and pushes a + * `SubDeclaration` with `kind: 'parameter'`. + */ +export function extractSimpleParameters( + paramListNode: TreeSitterNode | null, + options: ExtractParametersOptions, +): SubDeclaration[] { + const params: SubDeclaration[] = []; + if (!paramListNode) return params; + const { paramTypes, nameField = 'name', fallbackToIdentifier = false } = options; + + for (let i = 0; i < paramListNode.childCount; i++) { + const param = paramListNode.child(i); + if (!param || !paramTypes.includes(param.type)) continue; + const nameNode = resolveParamName(param, nameField, fallbackToIdentifier); + if (!nameNode) continue; + params.push({ name: nameNode.text, kind: 'parameter', line: nodeStartLine(param) }); + recordParamType(param, nameNode.text, options); + } + return params; +} + +/** Record a parameter's declared type into the type-map sink, if configured. */ +function recordParamType( + param: TreeSitterNode, + paramName: string, + options: ExtractParametersOptions, +): void { + const { typeMap, resolveType, typeMapConfidence = 0.9 } = options; + if (!typeMap) return; + const typeNode = param.childForFieldName('type'); + if (!typeNode) return; + const typeText = resolveType ? resolveType(typeNode) : typeNode.text; + if (!typeText) return; + setTypeMapEntry(typeMap, paramName, typeText, typeMapConfidence); +} + +/** + * Resolve the identifier node that names a parameter. Used by + * {@link extractSimpleParameters}; exposed so language-specific extractors + * can reuse the same lookup logic in custom loops. 
+ */ +export function resolveParamName( + paramNode: TreeSitterNode, + nameField: string | null, + fallbackToIdentifier: boolean, +): TreeSitterNode | null { + if (nameField === null) { + return paramNode; + } + const named = paramNode.childForFieldName(nameField); + if (named) return named; + if (fallbackToIdentifier) { + return findChild(paramNode, 'identifier'); + } + return null; +} From 9c3d016b859de1c5ee8f58b3ed0f3d1badd79b52 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 11:31:52 -0600 Subject: [PATCH 03/27] refactor(extractors): adopt shared helpers across language extractors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the TS extractor refactor plan (sync.json cluster 1). Adopts the helpers extended in 9c8be55 (nodeStartLine, findFirstChildOfTypes, pushCall, pushImport, extractSimpleParameters, stripQuotes) across six language extractors: - r.ts: drop local stripQuotes; use shared stripQuotes/pushCall/ pushImport/findFirstChildOfTypes/nodeStartLine - gleam.ts: use pushCall/pushImport/findFirstChildOfTypes/nodeStartLine; extract pushConstructor helper for the dual-branch data-constructor walk - julia.ts: use pushCall/pushImport/nodeStartLine; collapse Julia param wrapper-type branches via JULIA_PARAM_WRAPPER_TYPES set - java.ts: use pushCall/pushImport/nodeStartLine; collapse extractJavaParameters via extractSimpleParameters with typeMap sink; extract resolveJavaTypeText for the generic_type unwrap pattern - solidity.ts: extract qualifyWithParent helper to collapse 6 duplicated `parent ? \`\${parent}.\${name}\` : name` blocks - solidity.ts: use pushCall/pushImport/findFirstChildOfTypes/ nodeStartLine; collapse extractSolParams via extractSimpleParameters - javascript.ts: bulk-replace 43 inline `XXX.startPosition.row + 1` literals with nodeStartLine() calls; replace one stray endPosition literal with nodeEndLine Net -65 lines. 
No behaviour changes — only call-site collapsing onto the shared helpers (semantics verified by careful inspection of each replacement; pushImport's empty-names fallback matches the previous ad-hoc defaults in each extractor). docs check acknowledged: internal refactor, no doc updates needed. --- src/extractors/gleam.ts | 70 ++++++++++----------- src/extractors/java.ts | 79 +++++++++++------------- src/extractors/javascript.ts | 87 +++++++++++++------------- src/extractors/julia.ts | 63 +++++++++---------- src/extractors/r.ts | 94 ++++++++++++----------------- src/extractors/solidity.ts | 114 +++++++++++++---------------------- 6 files changed, 221 insertions(+), 286 deletions(-) diff --git a/src/extractors/gleam.ts b/src/extractors/gleam.ts index 45f8bd2b8..244b036d4 100644 --- a/src/extractors/gleam.ts +++ b/src/extractors/gleam.ts @@ -1,11 +1,13 @@ -import type { - Call, - ExtractorOutput, - SubDeclaration, - TreeSitterNode, - TreeSitterTree, -} from '../types.js'; -import { findChild, nodeEndLine, stripQuotes } from './helpers.js'; +import type { ExtractorOutput, SubDeclaration, TreeSitterNode, TreeSitterTree } from '../types.js'; +import { + findChild, + findFirstChildOfTypes, + nodeEndLine, + nodeStartLine, + pushCall, + pushImport, + stripQuotes, +} from './helpers.js'; /** * Extract symbols from Gleam files. @@ -74,7 +76,7 @@ function handleFunction(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility, children: params.length > 0 ? params : undefined, @@ -90,7 +92,7 @@ function handleExternalFunction(node: TreeSitterNode, ctx: ExtractorOutput): voi ctx.definitions.push({ name: nameNode.text, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility: isPublic(node) ? 'public' : 'private', children: params.length > 0 ? 
params : undefined, @@ -107,10 +109,7 @@ function handleTypeDef(node: TreeSitterNode, ctx: ExtractorOutput): void { const child = node.child(i); if (!child) continue; if (child.type === 'data_constructor' || child.type === 'type_constructor') { - const ctorName = child.childForFieldName('name') || findChild(child, 'constructor_name'); - if (ctorName) { - children.push({ name: ctorName.text, kind: 'property', line: child.startPosition.row + 1 }); - } + pushConstructor(child, children); } // Recurse into constructors block if (child.type === 'data_constructors' || child.type === 'type_constructors') { @@ -118,14 +117,7 @@ function handleTypeDef(node: TreeSitterNode, ctx: ExtractorOutput): void { const ctor = child.child(j); if (!ctor) continue; if (ctor.type === 'data_constructor' || ctor.type === 'type_constructor') { - const ctorName = ctor.childForFieldName('name') || findChild(ctor, 'constructor_name'); - if (ctorName) { - children.push({ - name: ctorName.text, - kind: 'property', - line: ctor.startPosition.row + 1, - }); - } + pushConstructor(ctor, children); } } } @@ -134,13 +126,20 @@ function handleTypeDef(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility: isPublic(node) ? 'public' : 'private', children: children.length > 0 ? 
children : undefined, }); } +function pushConstructor(ctorNode: TreeSitterNode, out: SubDeclaration[]): void { + const ctorName = ctorNode.childForFieldName('name') || findChild(ctorNode, 'constructor_name'); + if (ctorName) { + out.push({ name: ctorName.text, kind: 'property', line: nodeStartLine(ctorNode) }); + } +} + function handleTypeAlias(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name') || findChild(node, 'type_name'); if (!nameNode) return; @@ -148,7 +147,7 @@ function handleTypeAlias(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility: isPublic(node) ? 'public' : 'private', }); @@ -161,7 +160,7 @@ function handleConstant(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'variable', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility: isPublic(node) ? 'public' : 'private', }); @@ -169,7 +168,7 @@ function handleConstant(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleImport(node: TreeSitterNode, ctx: ExtractorOutput): void { const moduleNode = - node.childForFieldName('module') || findChild(node, 'module') || findChild(node, 'string'); + node.childForFieldName('module') || findFirstChildOfTypes(node, ['module', 'string']); if (!moduleNode) return; const source = stripQuotes(moduleNode.text); @@ -193,11 +192,9 @@ function handleImport(node: TreeSitterNode, ctx: ExtractorOutput): void { names.push(alias.text); } - ctx.imports.push({ - source, - names: names.length > 0 ? names : [source.split('/').pop() || source], - line: node.startPosition.row + 1, - }); + // `pushImport` falls back to the source basename when `names` is empty, + // preserving the previous `source.split('/').pop() || source` default. 
+ pushImport(ctx, node, source, names); } function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { @@ -205,16 +202,15 @@ function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { if (!funcNode) return; if (funcNode.type === 'identifier' || funcNode.type === 'variable') { - ctx.calls.push({ name: funcNode.text, line: node.startPosition.row + 1 }); + pushCall(ctx, node, funcNode.text); } else if (funcNode.type === 'field_access' || funcNode.type === 'module_select') { const field = funcNode.childForFieldName('field') || funcNode.childForFieldName('label'); // Prefer the `record` field; fall back to first named child to skip // anonymous punctuation tokens (the `.` between record and field). const record = funcNode.childForFieldName('record') || funcNode.namedChild(0); if (field) { - const call: Call = { name: field.text, line: node.startPosition.row + 1 }; - if (record && record !== field) call.receiver = record.text; - ctx.calls.push(call); + const receiver = record && record !== field ? record.text : undefined; + pushCall(ctx, node, field.text, receiver !== undefined ? 
{ receiver } : {}); } } } @@ -231,11 +227,11 @@ function extractParams(funcNode: TreeSitterNode): SubDeclaration[] { if (param.type === 'function_parameter' || param.type === 'parameter') { const nameNode = param.childForFieldName('name') || findChild(param, 'identifier'); if (nameNode) { - params.push({ name: nameNode.text, kind: 'parameter', line: param.startPosition.row + 1 }); + params.push({ name: nameNode.text, kind: 'parameter', line: nodeStartLine(param) }); } } if (param.type === 'identifier') { - params.push({ name: param.text, kind: 'parameter', line: param.startPosition.row + 1 }); + params.push({ name: param.text, kind: 'parameter', line: nodeStartLine(param) }); } } return params; diff --git a/src/extractors/java.ts b/src/extractors/java.ts index 64f03f900..d12163d20 100644 --- a/src/extractors/java.ts +++ b/src/extractors/java.ts @@ -1,5 +1,4 @@ import type { - Call, ExtractorOutput, SubDeclaration, TreeSitterNode, @@ -9,10 +8,14 @@ import type { import { extractBodyMembers, extractModifierVisibility, + extractSimpleParameters, findChild, findParentNode, lastPathSegment, nodeEndLine, + nodeStartLine, + pushCall, + pushImport, } from './helpers.js'; /** @@ -78,7 +81,7 @@ function handleJavaClassDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'class', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: classChildren.length > 0 ? 
classChildren : undefined, }); @@ -87,7 +90,7 @@ function handleJavaClassDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const interfaces = node.childForFieldName('interfaces'); if (interfaces) { - extractJavaInterfaces(interfaces, nameNode.text, node.startPosition.row + 1, ctx); + extractJavaInterfaces(interfaces, nameNode.text, nodeStartLine(node), ctx); } } @@ -101,7 +104,7 @@ function extractJavaSuperclass( if (!superclass) return; const superName = findJavaSuperTypeName(superclass); if (superName) { - ctx.classes.push({ name: className, extends: superName, line: node.startPosition.row + 1 }); + ctx.classes.push({ name: className, extends: superName, line: nodeStartLine(node) }); } } @@ -163,7 +166,7 @@ function handleJavaInterfaceDecl(node: TreeSitterNode, ctx: ExtractorOutput): vo ctx.definitions.push({ name: nameNode.text, kind: 'interface', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); const body = node.childForFieldName('body'); @@ -184,8 +187,8 @@ function extractJavaInterfaceMethods( ctx.definitions.push({ name: `${ifaceName}.${methName.text}`, kind: 'method', - line: child.startPosition.row + 1, - endLine: child.endPosition.row + 1, + line: nodeStartLine(child), + endLine: nodeEndLine(child), }); } } @@ -199,7 +202,7 @@ function handleJavaEnumDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'enum', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: enumChildren.length > 0 ? enumChildren : undefined, }); @@ -216,7 +219,7 @@ function handleJavaMethodDecl(node: TreeSitterNode, ctx: ExtractorOutput): void ctx.definitions.push({ name: fullName, kind: 'method', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? 
params : undefined, visibility: extractModifierVisibility(node), @@ -232,7 +235,7 @@ function handleJavaConstructorDecl(node: TreeSitterNode, ctx: ExtractorOutput): ctx.definitions.push({ name: fullName, kind: 'method', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? params : undefined, visibility: extractModifierVisibility(node), @@ -245,12 +248,7 @@ function handleJavaImportDecl(node: TreeSitterNode, ctx: ExtractorOutput): void if (child && (child.type === 'scoped_identifier' || child.type === 'identifier')) { const fullPath = child.text; const lastName = lastPathSegment(fullPath, '.'); - ctx.imports.push({ - source: fullPath, - names: [lastName], - line: node.startPosition.row + 1, - javaImport: true, - }); + pushImport(ctx, node, fullPath, [lastName], { javaImport: true }); } if (child && child.type === 'asterisk') { const lastImport = ctx.imports[ctx.imports.length - 1]; @@ -263,15 +261,13 @@ function handleJavaMethodInvocation(node: TreeSitterNode, ctx: ExtractorOutput): const nameNode = node.childForFieldName('name'); if (!nameNode) return; const obj = node.childForFieldName('object'); - const call: Call = { name: nameNode.text, line: node.startPosition.row + 1 }; - if (obj) call.receiver = obj.text; - ctx.calls.push(call); + pushCall(ctx, node, nameNode.text, obj ? { receiver: obj.text } : {}); } function handleJavaLocalVarDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const typeNode = node.childForFieldName('type'); if (!typeNode) return; - const typeName = typeNode.type === 'generic_type' ? 
typeNode.child(0)?.text : typeNode.text; + const typeName = resolveJavaTypeText(typeNode); if (!typeName) return; for (let i = 0; i < node.childCount; i++) { const child = node.child(i); @@ -285,8 +281,17 @@ function handleJavaLocalVarDecl(node: TreeSitterNode, ctx: ExtractorOutput): voi function handleJavaObjectCreation(node: TreeSitterNode, ctx: ExtractorOutput): void { const typeNode = node.childForFieldName('type'); if (!typeNode) return; - const typeName = typeNode.type === 'generic_type' ? typeNode.child(0)?.text : typeNode.text; - if (typeName) ctx.calls.push({ name: typeName, line: node.startPosition.row + 1 }); + const typeName = resolveJavaTypeText(typeNode); + if (typeName) pushCall(ctx, node, typeName); +} + +/** + * Resolve a Java type node's text, unwrapping `generic_type` to its base name. + * Used wherever we need the bare type identifier (local var decls, object + * creation, parameter types). + */ +function resolveJavaTypeText(typeNode: TreeSitterNode): string | undefined { + return typeNode.type === 'generic_type' ? 
typeNode.child(0)?.text : typeNode.text; } const JAVA_PARENT_TYPES = [ @@ -300,31 +305,17 @@ function findJavaParentClass(node: TreeSitterNode): string | null { // ── Child extraction helpers ──────────────────────────────────────────────── +const JAVA_PARAM_TYPES = ['formal_parameter', 'spread_parameter'] as const; + function extractJavaParameters( paramListNode: TreeSitterNode | null, typeMap?: Map, ): SubDeclaration[] { - const params: SubDeclaration[] = []; - if (!paramListNode) return params; - for (let i = 0; i < paramListNode.childCount; i++) { - const param = paramListNode.child(i); - if (!param) continue; - if (param.type === 'formal_parameter' || param.type === 'spread_parameter') { - const nameNode = param.childForFieldName('name'); - if (nameNode) { - params.push({ name: nameNode.text, kind: 'parameter', line: param.startPosition.row + 1 }); - if (typeMap) { - const typeNode = param.childForFieldName('type'); - if (typeNode) { - const typeName = - typeNode.type === 'generic_type' ? 
typeNode.child(0)?.text : typeNode.text; - if (typeName) typeMap.set(nameNode.text, { type: typeName, confidence: 0.9 }); - } - } - } - } - } - return params; + return extractSimpleParameters(paramListNode, { + paramTypes: JAVA_PARAM_TYPES, + typeMap, + resolveType: resolveJavaTypeText, + }); } function extractClassFields(classNode: TreeSitterNode): SubDeclaration[] { @@ -350,7 +341,7 @@ function extractFieldDeclarators(member: TreeSitterNode, fields: SubDeclaration[ fields.push({ name: nameNode.text, kind: 'property', - line: member.startPosition.row + 1, + line: nodeStartLine(member), visibility: vis, }); } diff --git a/src/extractors/javascript.ts b/src/extractors/javascript.ts index fef61a0c3..0fc9e46f1 100644 --- a/src/extractors/javascript.ts +++ b/src/extractors/javascript.ts @@ -17,6 +17,7 @@ import { findParentNode, MAX_WALK_DEPTH, nodeEndLine, + nodeStartLine, setTypeMapEntry, } from './helpers.js'; @@ -99,7 +100,7 @@ function handleFnCapture(c: Record, definitions: Definit definitions.push({ name: c.fn_name!.text, kind: 'function', - line: c.fn_node!.startPosition.row + 1, + line: nodeStartLine(c.fn_node!), endLine: nodeEndLine(c.fn_node!), children: fnChildren.length > 0 ? fnChildren : undefined, }); @@ -108,7 +109,7 @@ function handleFnCapture(c: Record, definitions: Definit /** Handle variable_declarator with arrow_function / function_expression capture. */ function handleVarFnCapture(c: Record, definitions: Definition[]): void { const declNode = c.varfn_name!.parent?.parent; - const line = declNode ? declNode.startPosition.row + 1 : c.varfn_name!.startPosition.row + 1; + const line = declNode ? 
nodeStartLine(declNode) : nodeStartLine(c.varfn_name!); const varFnChildren = extractParameters(c.varfn_value!); definitions.push({ name: c.varfn_name!.text, @@ -126,7 +127,7 @@ function handleClassCapture( classes: ClassRelation[], ): void { const className = c.cls_name!.text; - const startLine = c.cls_node!.startPosition.row + 1; + const startLine = nodeStartLine(c.cls_node!); const clsChildren = extractClassProperties(c.cls_node!); definitions.push({ name: className, @@ -157,7 +158,7 @@ function handleMethodCapture(c: Record, definitions: Def definitions.push({ name: fullName, kind: 'method', - line: c.meth_node!.startPosition.row + 1, + line: nodeStartLine(c.meth_node!), endLine: nodeEndLine(c.meth_node!), children: methChildren.length > 0 ? methChildren : undefined, visibility: methVis, @@ -170,7 +171,7 @@ function handleExportCapture( exps: Export[], imports: Import[], ): void { - const exportLine = c.exp_node!.startPosition.row + 1; + const exportLine = nodeStartLine(c.exp_node!); const decl = c.exp_node!.childForFieldName('declaration'); if (decl) { const declType = decl.type; @@ -211,7 +212,7 @@ function handleInterfaceCapture( definitions.push({ name: ifaceName, kind: 'interface', - line: ifaceNode.startPosition.row + 1, + line: nodeStartLine(ifaceNode), endLine: nodeEndLine(ifaceNode), }); const body = @@ -226,7 +227,7 @@ function handleTypeCapture(c: Record, definitions: Defin definitions.push({ name: c.type_name!.text, kind: 'type', - line: typeNode.startPosition.row + 1, + line: nodeStartLine(typeNode), endLine: nodeEndLine(typeNode), }); } @@ -239,7 +240,7 @@ function handleImportCapture(c: Record, imports: Import[ imports.push({ source: modPath, names, - line: impNode.startPosition.row + 1, + line: nodeStartLine(impNode), typeOnly: isTypeOnly, }); } @@ -272,7 +273,7 @@ function dispatchQueryMatch( } else if (c.callfn_node) { calls.push({ name: c.callfn_name!.text, - line: c.callfn_node.startPosition.row + 1, + line: nodeStartLine(c.callfn_node), }); 
calls.push(...extractCallbackReferenceCalls(c.callfn_node)); } else if (c.callmem_node) { @@ -288,7 +289,7 @@ function dispatchQueryMatch( } else if (c.newfn_node) { calls.push({ name: c.newfn_name!.text, - line: c.newfn_node.startPosition.row + 1, + line: nodeStartLine(c.newfn_node), }); } else if (c.newmem_node) { const callInfo = extractCallInfo(c.newmem_fn!, c.newmem_node); @@ -411,7 +412,7 @@ function extractDestructuredBindingsWalk(node: TreeSitterNode, definitions: Defi if (nameN && nameN.type === 'object_pattern') { extractDestructuredBindings( nameN, - declNode.startPosition.row + 1, + nodeStartLine(declNode), nodeEndLine(declNode), definitions, ); @@ -445,7 +446,7 @@ function extractConstDeclarators(declNode: TreeSitterNode, definitions: Definiti definitions.push({ name: nameN.text, kind: 'constant', - line: declNode.startPosition.row + 1, + line: nodeStartLine(declNode), endLine: nodeEndLine(declNode), }); } @@ -470,12 +471,12 @@ function extractDynamicImportsWalk(node: TreeSitterNode, imports: Import[]): voi imports.push({ source: modPath, names, - line: node.startPosition.row + 1, + line: nodeStartLine(node), dynamicImport: true, }); } else { debug( - `Skipping non-static dynamic import() at line ${node.startPosition.row + 1} (template literal or variable)`, + `Skipping non-static dynamic import() at line ${nodeStartLine(node)} (template literal or variable)`, ); } } @@ -497,7 +498,7 @@ function handleCommonJSAssignment( const leftText = left.text; if (!leftText.startsWith('module.exports') && leftText !== 'exports') return; - const assignLine = node.startPosition.row + 1; + const assignLine = nodeStartLine(node); // module.exports = require("…") — direct re-export if (right.type === 'call_expression') { @@ -618,7 +619,7 @@ function handleFunctionDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: 
nodeEndLine(node), children: fnChildren.length > 0 ? fnChildren : undefined, }); @@ -629,7 +630,7 @@ function handleClassDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name'); if (!nameNode) return; const className = nameNode.text; - const startLine = node.startPosition.row + 1; + const startLine = nodeStartLine(node); const clsChildren = extractClassProperties(node); ctx.definitions.push({ name: className, @@ -661,7 +662,7 @@ function handleMethodDef(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: fullName, kind: 'method', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: methChildren.length > 0 ? methChildren : undefined, visibility: methVis, @@ -675,7 +676,7 @@ function handleInterfaceDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'interface', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); const body = @@ -693,7 +694,7 @@ function handleTypeAliasDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -751,7 +752,7 @@ function handleVariableDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameN.text, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(valueN), children: varFnChildren.length > 0 ? 
varFnChildren : undefined, }); @@ -759,7 +760,7 @@ function handleVariableDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameN.text, kind: 'constant', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } else if (isConst && nameN.type === 'object_pattern' && !hasFunctionScopeAncestor(node)) { @@ -772,7 +773,7 @@ function handleVariableDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { // handle_var_decl (Rust path) — skips bindings inside function bodies. extractDestructuredBindings( nameN, - node.startPosition.row + 1, + nodeStartLine(node), nodeEndLine(node), ctx.definitions, ); @@ -797,7 +798,7 @@ function handleEnumDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { enumChildren.push({ name: mName.text, kind: 'constant', - line: member.startPosition.row + 1, + line: nodeStartLine(member), }); } } @@ -806,7 +807,7 @@ function handleEnumDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'enum', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: enumChildren.length > 0 ? 
enumChildren : undefined, }); @@ -832,7 +833,7 @@ function handleNewExpr(node: TreeSitterNode, ctx: ExtractorOutput): void { const ctor = node.childForFieldName('constructor') || node.child(1); if (!ctor) return; if (ctor.type === 'identifier') { - ctx.calls.push({ name: ctor.text, line: node.startPosition.row + 1 }); + ctx.calls.push({ name: ctor.text, line: nodeStartLine(node) }); } else if (ctor.type === 'member_expression') { const callInfo = extractCallInfo(ctor, node); if (callInfo) ctx.calls.push(callInfo); @@ -847,10 +848,10 @@ function handleDynamicImportCall(node: TreeSitterNode, imports: Import[]): void if (strArg) { const modPath = strArg.text.replace(/['"]/g, ''); const names = extractDynamicImportNames(node); - imports.push({ source: modPath, names, line: node.startPosition.row + 1, dynamicImport: true }); + imports.push({ source: modPath, names, line: nodeStartLine(node), dynamicImport: true }); } else { debug( - `Skipping non-static dynamic import() at line ${node.startPosition.row + 1} (template literal or variable)`, + `Skipping non-static dynamic import() at line ${nodeStartLine(node)} (template literal or variable)`, ); } } @@ -864,14 +865,14 @@ function handleImportStmt(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.imports.push({ source: modPath, names, - line: node.startPosition.row + 1, + line: nodeStartLine(node), typeOnly: isTypeOnly, }); } } function handleExportStmt(node: TreeSitterNode, ctx: ExtractorOutput): void { - const exportLine = node.startPosition.row + 1; + const exportLine = nodeStartLine(node); const decl = node.childForFieldName('declaration'); if (decl) { const declType = decl.type; @@ -923,7 +924,7 @@ function extractParameters(node: TreeSitterNode): SubDeclaration[] { if (!child) continue; const t = child.type; if (t === 'identifier') { - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 }); + params.push({ name: child.text, kind: 'parameter', line: nodeStartLine(child) }); } 
else if ( t === 'required_parameter' || t === 'optional_parameter' || @@ -936,12 +937,12 @@ function extractParameters(node: TreeSitterNode): SubDeclaration[] { (nameNode.type === 'identifier' || nameNode.type === 'shorthand_property_identifier_pattern') ) { - params.push({ name: nameNode.text, kind: 'parameter', line: child.startPosition.row + 1 }); + params.push({ name: nameNode.text, kind: 'parameter', line: nodeStartLine(child) }); } } else if (t === 'rest_pattern' || t === 'rest_element') { const nameNode = child.child(1) || child.childForFieldName('name'); if (nameNode && nameNode.type === 'identifier') { - params.push({ name: nameNode.text, kind: 'parameter', line: child.startPosition.row + 1 }); + params.push({ name: nameNode.text, kind: 'parameter', line: nodeStartLine(child) }); } } } @@ -975,7 +976,7 @@ function extractClassProperties(classNode: TreeSitterNode): SubDeclaration[] { props.push({ name: nameNode.text, kind: 'property', - line: child.startPosition.row + 1, + line: nodeStartLine(child), visibility: vis, }); } @@ -1044,8 +1045,8 @@ function extractInterfaceMethods( definitions.push({ name: `${interfaceName}.${nameNode.text}`, kind: 'method', - line: child.startPosition.row + 1, - endLine: child.endPosition.row + 1, + line: nodeStartLine(child), + endLine: nodeEndLine(child), }); } } @@ -1216,7 +1217,7 @@ function extractReceiverName(objNode: TreeSitterNode | null): string | undefined function extractCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): Call | null { const fnType = fn.type; if (fnType === 'identifier') { - return { name: fn.text, line: callNode.startPosition.row + 1 }; + return { name: fn.text, line: nodeStartLine(callNode) }; } if (fnType === 'member_expression') { return extractMemberExprCallInfo(fn, callNode); @@ -1233,7 +1234,7 @@ function extractMemberExprCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode) const prop = fn.childForFieldName('property'); if (!prop) return null; - const callLine = 
callNode.startPosition.row + 1; + const callLine = nodeStartLine(callNode); const propText = prop.text; // .call()/.apply()/.bind() — dynamic invocation @@ -1272,7 +1273,7 @@ function extractSubscriptCallInfo(fn: TreeSitterNode, callNode: TreeSitterNode): const receiver = extractReceiverName(obj); return { name: methodName, - line: callNode.startPosition.row + 1, + line: nodeStartLine(callNode), dynamic: true, receiver, }; @@ -1435,7 +1436,7 @@ function extractCallbackReferenceCalls(callNode: TreeSitterNode): Call[] { } const result: Call[] = []; - const callLine = callNode.startPosition.row + 1; + const callLine = nodeStartLine(callNode); for (let i = 0; i < args.childCount; i++) { const child = args.child(i); @@ -1540,7 +1541,7 @@ function extractCallbackDefinition( return { name: `command:${firstWord}`, kind: 'function', - line: cb.startPosition.row + 1, + line: nodeStartLine(cb), endLine: nodeEndLine(cb), }; } @@ -1554,7 +1555,7 @@ function extractCallbackDefinition( return { name: `route:${method.toUpperCase()} ${strArg}`, kind: 'function', - line: cb.startPosition.row + 1, + line: nodeStartLine(cb), endLine: nodeEndLine(cb), }; } @@ -1568,7 +1569,7 @@ function extractCallbackDefinition( return { name: `event:${eventName}`, kind: 'function', - line: cb.startPosition.row + 1, + line: nodeStartLine(cb), endLine: nodeEndLine(cb), }; } diff --git a/src/extractors/julia.ts b/src/extractors/julia.ts index 7667ec95d..d412fecda 100644 --- a/src/extractors/julia.ts +++ b/src/extractors/julia.ts @@ -1,5 +1,5 @@ import type { ExtractorOutput, SubDeclaration, TreeSitterNode, TreeSitterTree } from '../types.js'; -import { findChild, nodeEndLine } from './helpers.js'; +import { findChild, nodeEndLine, nodeStartLine, pushCall, pushImport } from './helpers.js'; /** * Extract symbols from Julia files. 
@@ -76,7 +76,7 @@ function handleModuleDef(node: TreeSitterNode, ctx: ExtractorOutput): string | n ctx.definitions.push({ name: nameNode.text, kind: 'module', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); @@ -130,7 +130,7 @@ function handleFunctionDef( ctx.definitions.push({ name, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? params : undefined, }); @@ -145,7 +145,7 @@ function handleFunctionDef( ctx.definitions.push({ name: qualifyName(nameNode.text, currentModule), kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -169,7 +169,7 @@ function handleAssignment( ctx.definitions.push({ name: qualifyName(funcNameNode.text, currentModule), kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? params : undefined, }); @@ -253,14 +253,14 @@ function handleStructDef(node: TreeSitterNode, ctx: ExtractorOutput): void { children.push({ name: fieldName.text, kind: 'property', - line: child.startPosition.row + 1, + line: nodeStartLine(child), }); } } else if (child.type === 'identifier') { // Plain identifier fields (no type annotation) appear as direct // identifier children of struct_definition. The type_head is a // separate node so there is nothing to filter out here. 
- children.push({ name: child.text, kind: 'property', line: child.startPosition.row + 1 }); + children.push({ name: child.text, kind: 'property', line: nodeStartLine(child) }); } } @@ -268,14 +268,14 @@ function handleStructDef(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.classes.push({ name: structName, extends: supertypeNode.text, - line: node.startPosition.row + 1, + line: nodeStartLine(node), }); } ctx.definitions.push({ name: structName, kind: 'struct', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: children.length > 0 ? children : undefined, }); @@ -295,7 +295,7 @@ function handleAbstractDef(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: nameNode.text, kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -319,7 +319,7 @@ function handleMacroDef( ctx.definitions.push({ name, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -363,11 +363,10 @@ function handleImport(node: TreeSitterNode, ctx: ExtractorOutput): void { } if (source) { - ctx.imports.push({ - source, - names: names.length > 0 ? names : [source], - line: node.startPosition.row + 1, - }); + // pushImport falls back to source basename for empty `names`. Julia module + // sources have no `/` separator, so the basename equals `source` — matching + // the previous explicit `[source]` fallback. 
+ pushImport(ctx, node, source, names); } } @@ -388,21 +387,26 @@ function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { if (!funcNode) return; if (funcNode.type === 'identifier') { - ctx.calls.push({ name: funcNode.text, line: node.startPosition.row + 1 }); + pushCall(ctx, node, funcNode.text); } else if (funcNode.type === 'field_expression' || funcNode.type === 'scoped_identifier') { const parts = funcNode.text.split('.'); if (parts.length >= 2) { - ctx.calls.push({ - name: parts[parts.length - 1]!, + pushCall(ctx, node, parts[parts.length - 1]!, { receiver: parts.slice(0, -1).join('.'), - line: node.startPosition.row + 1, }); } else { - ctx.calls.push({ name: funcNode.text, line: node.startPosition.row + 1 }); + pushCall(ctx, node, funcNode.text); } } } +const JULIA_PARAM_WRAPPER_TYPES = new Set([ + 'typed_parameter', + 'typed_expression', + 'optional_parameter', + 'default_parameter', +]); + function extractJuliaParams(callExpr: TreeSitterNode): SubDeclaration[] { const params: SubDeclaration[] = []; const argList = findChild(callExpr, 'argument_list') || findChild(callExpr, 'tuple_expression'); @@ -412,25 +416,14 @@ function extractJuliaParams(callExpr: TreeSitterNode): SubDeclaration[] { const child = argList.child(i); if (!child) continue; if (child.type === 'identifier') { - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 }); - } - if (child.type === 'typed_parameter' || child.type === 'typed_expression') { - const nameNode = findChild(child, 'identifier'); - if (nameNode) { - params.push({ - name: nameNode.text, - kind: 'parameter', - line: child.startPosition.row + 1, - }); - } - } - if (child.type === 'optional_parameter' || child.type === 'default_parameter') { + params.push({ name: child.text, kind: 'parameter', line: nodeStartLine(child) }); + } else if (JULIA_PARAM_WRAPPER_TYPES.has(child.type)) { const nameNode = findChild(child, 'identifier'); if (nameNode) { params.push({ name: nameNode.text, 
kind: 'parameter', - line: child.startPosition.row + 1, + line: nodeStartLine(child), }); } } diff --git a/src/extractors/r.ts b/src/extractors/r.ts index a6edac135..ef0a863e0 100644 --- a/src/extractors/r.ts +++ b/src/extractors/r.ts @@ -1,5 +1,13 @@ import type { ExtractorOutput, SubDeclaration, TreeSitterNode, TreeSitterTree } from '../types.js'; -import { findChild, nodeEndLine } from './helpers.js'; +import { + findChild, + findFirstChildOfTypes, + nodeEndLine, + nodeStartLine, + pushCall, + pushImport, + stripQuotes, +} from './helpers.js'; /** * Extract symbols from R files. @@ -58,7 +66,7 @@ function handleBinaryOp(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: lhs.text, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? params : undefined, }); @@ -68,7 +76,7 @@ function handleBinaryOp(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: lhs.text, kind: 'variable', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -87,14 +95,14 @@ function extractRParams(funcDef: TreeSitterNode): SubDeclaration[] { // parameter node has name and possibly default value const nameNode = child.childForFieldName('name') || findChild(child, 'identifier'); if (nameNode) { - params.push({ name: nameNode.text, kind: 'parameter', line: child.startPosition.row + 1 }); + params.push({ name: nameNode.text, kind: 'parameter', line: nodeStartLine(child) }); } else if (child.text && child.text !== ',' && child.text !== '(' && child.text !== ')') { // Some grammars have the param as plain text - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 }); + params.push({ name: child.text, kind: 'parameter', line: nodeStartLine(child) }); } } if (child.type === 'identifier') { - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 
}); + params.push({ name: child.text, kind: 'parameter', line: nodeStartLine(child) }); } } return params; @@ -137,15 +145,13 @@ function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { // Regular call if (funcNode.type === 'identifier') { - ctx.calls.push({ name: funcName, line: node.startPosition.row + 1 }); + pushCall(ctx, node, funcName); } else if (funcNode.type === 'namespace_operator') { // pkg::func const parts = funcName.split('::'); if (parts.length >= 2) { - ctx.calls.push({ - name: parts[parts.length - 1]!, + pushCall(ctx, node, parts[parts.length - 1]!, { receiver: parts.slice(0, -1).join('::'), - line: node.startPosition.row + 1, }); } } @@ -164,20 +170,12 @@ function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void { const arg = child.child(j); if (!arg) continue; if (arg.type === 'identifier') { - ctx.imports.push({ - source: arg.text, - names: [arg.text], - line: node.startPosition.row + 1, - }); + pushImport(ctx, node, arg.text, [arg.text]); return; } if (arg.type === 'string' || arg.type === 'string_content') { - const text = arg.text.replace(/^["']|["']$/g, ''); - ctx.imports.push({ - source: text, - names: [text], - line: node.startPosition.row + 1, - }); + const text = stripQuotes(arg.text); + pushImport(ctx, node, text, [text]); return; } // Argument might be wrapped @@ -202,12 +200,8 @@ function handleLibraryCall(node: TreeSitterNode, ctx: ExtractorOutput): void { } } if (pick) { - const text = pick.text.replace(/^["']|["']$/g, ''); - ctx.imports.push({ - source: text, - names: [text], - line: node.startPosition.row + 1, - }); + const text = stripQuotes(pick.text); + pushImport(ctx, node, text, [text]); return; } } @@ -220,11 +214,7 @@ function handleSourceCall(node: TreeSitterNode, ctx: ExtractorOutput): void { // source() only accepts string literals — `source(varname)` is not an import. 
const path = firstStringArgument(node); if (path === null) return; - ctx.imports.push({ - source: path, - names: ['source'], - line: node.startPosition.row + 1, - }); + pushImport(ctx, node, path, ['source']); } function handleSetClass(node: TreeSitterNode, ctx: ExtractorOutput): void { @@ -233,7 +223,7 @@ function handleSetClass(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name, kind: 'class', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -244,7 +234,7 @@ function handleSetGeneric(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name, kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), }); } @@ -258,7 +248,7 @@ function handleSetGeneric(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleSetMethod(node: TreeSitterNode, ctx: ExtractorOutput): void { const name = firstStringArgument(node); if (name === null) return; - ctx.calls.push({ name, line: node.startPosition.row + 1 }); + pushCall(ctx, node, name); } // tree-sitter-r wraps each positional argument in an `argument` node that @@ -266,28 +256,20 @@ function handleSetMethod(node: TreeSitterNode, ctx: ExtractorOutput): void { // must be unwrapped — checking `child.type === 'string'` directly misses it. // Mirrors `first_argument_value` in the Rust extractor for parity. 
function firstStringArgument(node: TreeSitterNode): string | null { - for (let i = 0; i < node.childCount; i++) { - const child = node.child(i); - if (!child || child.type !== 'arguments') continue; - for (let j = 0; j < child.childCount; j++) { - const arg = child.child(j); - if (!arg) continue; - if (arg.type === 'string') { - return stripQuotes(arg.text); - } - if (arg.type === 'argument') { - const valueNode = arg.childForFieldName('value'); - if (valueNode && valueNode.type === 'string') return stripQuotes(valueNode.text); - for (let k = 0; k < arg.childCount; k++) { - const inner = arg.child(k); - if (inner && inner.type === 'string') return stripQuotes(inner.text); - } - } + const args = findFirstChildOfTypes(node, ['arguments']); + if (!args) return null; + for (let j = 0; j < args.childCount; j++) { + const arg = args.child(j); + if (!arg) continue; + if (arg.type === 'string') { + return stripQuotes(arg.text); + } + if (arg.type === 'argument') { + const valueNode = arg.childForFieldName('value'); + if (valueNode && valueNode.type === 'string') return stripQuotes(valueNode.text); + const innerStr = findFirstChildOfTypes(arg, ['string']); + if (innerStr) return stripQuotes(innerStr.text); } } return null; } - -function stripQuotes(text: string): string { - return text.replace(/^["']|["']$/g, ''); -} diff --git a/src/extractors/solidity.ts b/src/extractors/solidity.ts index c68043966..8626b29e0 100644 --- a/src/extractors/solidity.ts +++ b/src/extractors/solidity.ts @@ -1,15 +1,14 @@ -import type { - Call, - ExtractorOutput, - SubDeclaration, - TreeSitterNode, - TreeSitterTree, -} from '../types.js'; +import type { ExtractorOutput, SubDeclaration, TreeSitterNode, TreeSitterTree } from '../types.js'; import { extractModifierVisibility, + extractSimpleParameters, findChild, + findFirstChildOfTypes, findParentNode, nodeEndLine, + nodeStartLine, + pushCall, + pushImport, stripQuotes, } from './helpers.js'; @@ -103,7 +102,7 @@ function handleContractDecl( 
ctx.definitions.push({ name, kind, - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: members.length > 0 ? members : undefined, }); @@ -125,7 +124,7 @@ function extractContractMembers(body: TreeSitterNode): SubDeclaration[] { /** Map a single contract body child to a SubDeclaration, or null if not a recognized member. */ function extractContractMember(child: TreeSitterNode): SubDeclaration | null { - const line = child.startPosition.row + 1; + const line = nodeStartLine(child); switch (child.type) { case 'function_definition': { const fnName = child.childForFieldName('name'); @@ -172,7 +171,7 @@ function extractInheritance(node: TreeSitterNode, name: string, ctx: ExtractorOu const child = inheritance.child(j); if (!child) continue; if (child.type === 'user_defined_type' || child.type === 'identifier') { - ctx.classes.push({ name, extends: child.text, line: node.startPosition.row + 1 }); + ctx.classes.push({ name, extends: child.text, line: nodeStartLine(node) }); } } } @@ -191,19 +190,16 @@ function handleStructDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { members.push({ name: memberName.text, kind: 'property', - line: child.startPosition.row + 1, + line: nodeStartLine(child), }); } } } - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? `${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'struct', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: members.length > 0 ? 
members : undefined, }); @@ -217,17 +213,14 @@ function handleEnumDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (child && child.type === 'enum_value') { - members.push({ name: child.text, kind: 'constant', line: child.startPosition.row + 1 }); + members.push({ name: child.text, kind: 'constant', line: nodeStartLine(child) }); } } - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? `${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'enum', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: members.length > 0 ? members : undefined, }); @@ -244,7 +237,7 @@ function handleFunctionDef(node: TreeSitterNode, ctx: ExtractorOutput): void { ctx.definitions.push({ name: fullName, kind, - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), children: params.length > 0 ? params : undefined, visibility: extractSolVisibility(node), @@ -254,13 +247,10 @@ function handleFunctionDef(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleModifierDef(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name'); if (!nameNode) return; - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? 
`${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'function', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), decorators: ['modifier'], }); @@ -269,13 +259,10 @@ function handleModifierDef(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleEventDef(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name'); if (!nameNode) return; - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? `${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), decorators: ['event'], }); @@ -284,13 +271,10 @@ function handleEventDef(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleErrorDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name'); if (!nameNode) return; - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? `${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'type', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), decorators: ['error'], }); @@ -299,18 +283,21 @@ function handleErrorDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleStateVarDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { const nameNode = node.childForFieldName('name'); if (!nameNode) return; - const parent = findParentNode(node, SOL_PARENT_TYPES); - const fullName = parent ? 
`${parent}.${nameNode.text}` : nameNode.text; - ctx.definitions.push({ - name: fullName, + name: qualifyWithParent(node, nameNode.text), kind: 'variable', - line: node.startPosition.row + 1, + line: nodeStartLine(node), endLine: nodeEndLine(node), visibility: extractSolVisibility(node), }); } +/** Qualify `name` with the nearest contract/interface/library, if any. */ +function qualifyWithParent(node: TreeSitterNode, name: string): string { + const parent = findParentNode(node, SOL_PARENT_TYPES); + return parent ? `${parent}.${name}` : name; +} + function handleImportDirective(node: TreeSitterNode, ctx: ExtractorOutput): void { // import "path"; or import { X } from "path"; or import "path" as Alias; for (let i = 0; i < node.childCount; i++) { @@ -328,22 +315,17 @@ function handleImportDirective(node: TreeSitterNode, ctx: ExtractorOutput): void if (id) names.push(id.text); } } - ctx.imports.push({ - source, - names: names.length > 0 ? names : ['*'], - line: node.startPosition.row + 1, - }); + // Preserve the explicit `['*']` fallback — pushImport's default uses the + // source basename, but Solidity's convention here is to mark unqualified + // imports as `*`. + pushImport(ctx, node, source, names.length > 0 ? 
names : ['*']); return; } // source_import: handles `import * as X from "path"` if (child.type === 'source_import' || child.type === 'import_clause') { - const strNode = findChild(child, 'string') || findChild(child, 'string_literal'); + const strNode = findFirstChildOfTypes(child, ['string', 'string_literal']); if (strNode) { - ctx.imports.push({ - source: stripQuotes(strNode.text), - names: ['*'], - line: node.startPosition.row + 1, - }); + pushImport(ctx, node, stripQuotes(strNode.text), ['*']); return; } } @@ -354,35 +336,25 @@ function handleCallExpression(node: TreeSitterNode, ctx: ExtractorOutput): void const funcNode = node.childForFieldName('function') || node.childForFieldName('callee'); if (!funcNode) return; - const call: Call = { name: '', line: node.startPosition.row + 1 }; + let name = ''; + let receiver: string | undefined; if (funcNode.type === 'member_expression' || funcNode.type === 'member_access') { const prop = funcNode.childForFieldName('property') || funcNode.childForFieldName('member'); const obj = funcNode.childForFieldName('object') || funcNode.childForFieldName('expression'); - if (prop) call.name = prop.text; - if (obj) call.receiver = obj.text; + if (prop) name = prop.text; + if (obj) receiver = obj.text; } else { - call.name = funcNode.text; + name = funcNode.text; } - if (call.name) ctx.calls.push(call); + if (name) pushCall(ctx, node, name, receiver !== undefined ? 
{ receiver } : {}); } // ── Helpers ──────────────────────────────────────────────────────────────── function extractSolParams(funcNode: TreeSitterNode): SubDeclaration[] { - const params: SubDeclaration[] = []; const paramList = funcNode.childForFieldName('parameters') || findChild(funcNode, 'parameter_list'); - if (!paramList) return params; - - for (let i = 0; i < paramList.childCount; i++) { - const param = paramList.child(i); - if (!param || param.type !== 'parameter') continue; - const nameNode = param.childForFieldName('name'); - if (nameNode) { - params.push({ name: nameNode.text, kind: 'parameter', line: param.startPosition.row + 1 }); - } - } - return params; + return extractSimpleParameters(paramList, { paramTypes: ['parameter'] }); } function extractSolVisibility( From 5abe6ad55f1d1e1c4da06bec15f0aba8637ff537 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 11:38:46 -0600 Subject: [PATCH 04/27] refactor(extractors): break elixir param/map binding cycle Convert collectElixirParamIdentifiers from mutual-recursion with collectElixirMapBindings into a single iterative worklist traversal. Map/list/tuple/binary-operator dispatch is now done via three leaf helpers that push child nodes onto the worklist instead of calling back into the main function. This removes the function-level cycle flagged by codegraph (9 -> 8 cycles) without changing extractor semantics. docs check acknowledged: internal refactor only. --- src/extractors/elixir.ts | 146 ++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 63 deletions(-) diff --git a/src/extractors/elixir.ts b/src/extractors/elixir.ts index cb1f8ff05..1b547645c 100644 --- a/src/extractors/elixir.ts +++ b/src/extractors/elixir.ts @@ -197,74 +197,94 @@ function extractElixirParams(defCallNode: TreeSitterNode): SubDeclaration[] { } /** - * Recursively walk a parameter pattern and emit each bound identifier as a - * `parameter` child. 
Handles bare identifiers, default-value `a \\ default`, - * list-cons `[head | tail]`, list `[a, b, c]`, tuple `{x, y}`, and - * map / struct destructuring (`%{k: v}`, `%Foo{k: v}`). + * Walk a parameter pattern and emit each bound identifier as a `parameter` + * child. Handles bare identifiers, default-value `a \\ default`, list-cons + * `[head | tail]`, list `[a, b, c]`, tuple `{x, y}`, and map / struct + * destructuring (`%{k: v}`, `%Foo{k: v}`). + * + * Implemented as an iterative worklist (rather than recursion + helpers) so + * the call graph has no function-level cycle: only one function performs the + * traversal and it invokes only leaf helpers (the `pushElixir*` functions). */ -function collectElixirParamIdentifiers(node: TreeSitterNode, out: SubDeclaration[]): void { - switch (node.type) { - case 'identifier': - out.push({ name: node.text, kind: 'parameter', line: node.startPosition.row + 1 }); - return; - case 'binary_operator': { - // `name \\ default` (default-value) binds the left operand only. - // `head | tail` (list-cons, appears inside a `list` pattern) binds both operands. 
- const op = node.child(1); - if (!op) return; - if (op.type === '\\\\') { - const left = node.child(0); - if (left) collectElixirParamIdentifiers(left, out); - return; - } - if (op.type === '|') { - const left = node.child(0); - const right = node.child(2); - if (left) collectElixirParamIdentifiers(left, out); - if (right) collectElixirParamIdentifiers(right, out); - return; - } - return; +function collectElixirParamIdentifiers(root: TreeSitterNode, out: SubDeclaration[]): void { + const stack: TreeSitterNode[] = [root]; + while (stack.length > 0) { + const node = stack.pop(); + if (!node) continue; + switch (node.type) { + case 'identifier': + out.push({ name: node.text, kind: 'parameter', line: node.startPosition.row + 1 }); + break; + case 'binary_operator': + pushElixirBinaryOperatorOperands(node, stack); + break; + case 'list': + case 'tuple': + pushElixirSequenceItems(node, stack); + break; + case 'map': + pushElixirMapValues(node, stack); + break; } - case 'list': - // `[a, b, c]` or `[head | tail]` — walk children, skipping punctuation. The - // `|` cons case is handled by the `binary_operator` arm when we recurse. - for (let i = 0; i < node.childCount; i++) { - const c = node.child(i); - if (!c || c.type === '[' || c.type === ']' || c.type === ',') continue; - collectElixirParamIdentifiers(c, out); - } - return; - case 'tuple': - for (let i = 0; i < node.childCount; i++) { - const c = node.child(i); - if (!c || c.type === '{' || c.type === '}' || c.type === ',') continue; - collectElixirParamIdentifiers(c, out); - } - return; - case 'map': - // `%{k: v}` or `%Foo{k: v}` — walk map_content > keywords > pair and emit each - // pair's value side (the bound name). The struct alias (`Foo`) is a type, not a - // bound identifier, so the leading `struct` child is intentionally skipped. 
- for (let i = 0; i < node.childCount; i++) { - const c = node.child(i); - if (c && c.type === 'map_content') collectElixirMapBindings(c, out); - } - return; } } -function collectElixirMapBindings(content: TreeSitterNode, out: SubDeclaration[]): void { - for (let i = 0; i < content.childCount; i++) { - const kws = content.child(i); - if (!kws || kws.type !== 'keywords') continue; - for (let j = 0; j < kws.childCount; j++) { - const pair = kws.child(j); - if (!pair || pair.type !== 'pair') continue; - for (let k = 0; k < pair.childCount; k++) { - const part = pair.child(k); - if (!part || part.type === 'keyword') continue; - collectElixirParamIdentifiers(part, out); +/** + * Push the binding-relevant operands of a `binary_operator` parameter onto the + * worklist: + * - `name \\ default` (default-value) binds the left operand only. + * - `head | tail` (list-cons, appears inside a `list` pattern) binds both. + */ +function pushElixirBinaryOperatorOperands(node: TreeSitterNode, stack: TreeSitterNode[]): void { + const op = node.child(1); + if (!op) return; + if (op.type === '\\\\') { + const left = node.child(0); + if (left) stack.push(left); + return; + } + if (op.type === '|') { + const right = node.child(2); + const left = node.child(0); + if (right) stack.push(right); + if (left) stack.push(left); + } +} + +/** + * Push the binding-relevant elements of a `list` or `tuple` parameter onto + * the worklist, skipping punctuation tokens. + */ +function pushElixirSequenceItems(node: TreeSitterNode, stack: TreeSitterNode[]): void { + for (let i = 0; i < node.childCount; i++) { + const c = node.child(i); + if (!c) continue; + const t = c.type; + if (t === '[' || t === ']' || t === '{' || t === '}' || t === ',') continue; + stack.push(c); + } +} + +/** + * Push the value side of every pair in a `map` or `%Foo{...}` parameter onto + * the worklist. The struct alias (`Foo`) is a type, not a bound identifier, so + * the leading `struct` child is intentionally skipped. 
+ */ +function pushElixirMapValues(node: TreeSitterNode, stack: TreeSitterNode[]): void { + for (let i = 0; i < node.childCount; i++) { + const content = node.child(i); + if (!content || content.type !== 'map_content') continue; + for (let j = 0; j < content.childCount; j++) { + const kws = content.child(j); + if (!kws || kws.type !== 'keywords') continue; + for (let k = 0; k < kws.childCount; k++) { + const pair = kws.child(k); + if (!pair || pair.type !== 'pair') continue; + for (let p = 0; p < pair.childCount; p++) { + const part = pair.child(p); + if (!part || part.type === 'keyword') continue; + stack.push(part); + } } } } From 0d687c4f0741a040412f3fd18d870bde60d90c0f Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 11:52:28 -0600 Subject: [PATCH 05/27] refactor(extractors-rs): extend shared helpers for identifier and symbol collection --- .../codegraph-core/src/extractors/helpers.rs | 289 +++++++++++++++++- 1 file changed, 288 insertions(+), 1 deletion(-) diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index 7ae7b4bf9..4ee3666c2 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -1,4 +1,4 @@ -use crate::types::{AstNode, Definition, FileSymbols}; +use crate::types::{AstNode, Call, Definition, FileSymbols, Import, TypeMapEntry}; use tree_sitter::Node; // Re-export so extractors that `use super::helpers::*` still see it. @@ -40,6 +40,51 @@ pub fn find_child<'a>(node: &Node<'a>, kind: &str) -> Option> { None } +/// Find the first child whose type is in `kinds`. Useful when several +/// grammar variants name the same conceptual node differently (e.g. +/// `string` vs `string_literal`). Returns the first match in document +/// order, or `None`. +/// +/// Mirrors `findFirstChildOfTypes` in `src/extractors/helpers.ts`. 
+pub fn find_first_child_of_types<'a>(node: &Node<'a>, kinds: &[&str]) -> Option> { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if kinds.contains(&child.kind()) { + return Some(child); + } + } + } + None +} + +/// Common punctuation tokens — handy as a `skip_kinds` set for +/// [`iter_children`]. Mirrors `PUNCTUATION_TOKENS` in +/// `src/extractors/helpers.ts`. +pub const PUNCTUATION_TOKENS: &[&str] = &[ + ",", ";", "(", ")", "[", "]", "{", "}", ":", ".", +]; + +/// Iterate the direct children of `node` in document order, skipping +/// nulls and tokens whose `kind()` is in `skip_kinds`. Mirrors the +/// common `for i in 0..node.child_count() { let c = node.child(i); ... }` +/// idiom while letting callers filter out grammar punctuation +/// (`,`, `(`, `{`, etc.). +/// +/// Mirrors `iterChildren` in `src/extractors/helpers.ts`. +pub fn iter_children<'a>( + node: &'a Node<'a>, + skip_kinds: &'a [&'a str], +) -> impl Iterator> + 'a { + (0..node.child_count()).filter_map(move |i| { + let child = node.child(i)?; + if skip_kinds.contains(&child.kind()) { + None + } else { + Some(child) + } + }) +} + /// Find a parent of a given type, walking up the tree. pub fn find_parent_of_type<'a>(node: &Node<'a>, kind: &str) -> Option> { let mut current = node.parent(); @@ -748,3 +793,245 @@ fn extract_child_expression_text(node: &Node, source: &[u8]) -> Option { } Some(truncate(node_text(node, source), AST_TEXT_MAX)) } + +// ── Output-push helpers ──────────────────────────────────────────────────── +// +// Most extractors finish with `symbols.calls.push(Call { name, line: start_line(node), ... })` +// or `symbols.imports.push(Import::new(source, names, start_line(node)))`. Centralising +// the construction keeps `line` derivation consistent and removes the many +// hand-rolled `start_position().row + 1` literals scattered across language extractors. 
+ +/// Append a [`Call`] to `symbols`, using `start_line(node)` for the line and +/// the given optional `receiver`/`dynamic` flags. Skips no-op pushes when +/// `name` is empty. +/// +/// Mirrors `pushCall` in `src/extractors/helpers.ts`. +pub fn push_call( + symbols: &mut FileSymbols, + node: &Node, + name: impl Into, + receiver: Option, + dynamic: Option, +) { + let name = name.into(); + if name.is_empty() { + return; + } + symbols.calls.push(Call { + name, + line: start_line(node), + dynamic, + receiver, + }); +} + +/// Append a simple [`Call`] (no receiver, no dynamic flag) to `symbols`. +/// Convenience wrapper around [`push_call`] for the common case shared by +/// most C-family and procedural-language extractors. +pub fn push_simple_call(symbols: &mut FileSymbols, node: &Node, name: impl Into) { + push_call(symbols, node, name, None, None); +} + +/// Append an [`Import`] to `symbols`, using `start_line(node)` for the +/// line. If `names` is empty, the last `/`-segment of `source` is used as +/// a single-name fallback — matching the convention used by gleam, julia, +/// and similar module-path imports. +/// +/// The `customize` closure receives a mutable reference to the freshly +/// constructed `Import` so callers can flip language-specific flags +/// (`c_include`, `python_import`, `bash_source`, etc.) before the entry +/// is pushed. Pass `|_| {}` when no flags are needed. +/// +/// Mirrors `pushImport` in `src/extractors/helpers.ts`. 
+pub fn push_import( + symbols: &mut FileSymbols, + node: &Node, + source: impl Into, + names: Vec, + customize: F, +) where + F: FnOnce(&mut Import), +{ + let source = source.into(); + if source.is_empty() { + return; + } + let resolved_names = if names.is_empty() { + let fallback = source.rsplit('/').next().unwrap_or(source.as_str()); + vec![fallback.to_string()] + } else { + names + }; + let mut imp = Import::new(source, resolved_names, start_line(node)); + customize(&mut imp); + symbols.imports.push(imp); +} + +// ── Parameter extraction ─────────────────────────────────────────────────── + +/// Configuration for [`extract_simple_parameters`]. +/// +/// Collapses the boilerplate in `extract_*_params` helpers across +/// java / julia / gleam / solidity / r / etc. — each one walks a +/// parameter list, matches a parameter-node kind, reads the `name` +/// field, and pushes a [`Definition`] with `kind: "parameter"`. +pub struct ExtractParametersOptions<'a> { + /// Tree-sitter node kinds that mark a single parameter node + /// (e.g. `formal_parameter`, `parameter`). + pub param_kinds: &'a [&'a str], + /// Field name on each parameter that holds the bound identifier. + /// Defaults to `Some("name")`. Pass `None` to use the parameter + /// node itself when its kind is in `param_kinds` and it has no + /// `name` field (e.g. R's bare `identifier`). + pub name_field: Option<&'a str>, + /// If true, when `name_field` lookup fails fall back to the first + /// `identifier` child of the parameter. Useful for gleam / + /// solidity-style grammars. + pub fallback_to_identifier: bool, +} + +impl<'a> Default for ExtractParametersOptions<'a> { + fn default() -> Self { + Self { + param_kinds: &[], + name_field: Some("name"), + fallback_to_identifier: false, + } + } +} + +/// Resolve the identifier node that names a parameter. Used by +/// [`extract_simple_parameters`]; exposed so language-specific +/// extractors can reuse the same lookup logic in custom loops. 
+/// +/// Mirrors `resolveParamName` in `src/extractors/helpers.ts`. +pub fn resolve_param_name<'a>( + param_node: &Node<'a>, + name_field: Option<&str>, + fallback_to_identifier: bool, +) -> Option> { + let Some(field) = name_field else { + return Some(*param_node); + }; + if let Some(named) = param_node.child_by_field_name(field) { + return Some(named); + } + if fallback_to_identifier { + return find_child(param_node, "identifier"); + } + None +} + +/// Extract parameters from a parameter-list node using a uniform +/// pattern. Returns an empty vec when `param_list` is `None`. +/// +/// Mirrors `extractSimpleParameters` in `src/extractors/helpers.ts`. +pub fn extract_simple_parameters( + param_list: Option, + source: &[u8], + options: &ExtractParametersOptions, +) -> Vec { + let mut params = Vec::new(); + let Some(param_list) = param_list else { + return params; + }; + for i in 0..param_list.child_count() { + let Some(child) = param_list.child(i) else { continue }; + if !options.param_kinds.contains(&child.kind()) { + continue; + } + let Some(name_node) = resolve_param_name( + &child, + options.name_field, + options.fallback_to_identifier, + ) else { + continue; + }; + params.push(child_def( + node_text(&name_node, source).to_string(), + "parameter", + start_line(&child), + )); + } + params +} + +// ── Type-map helpers ─────────────────────────────────────────────────────── + +/// Record a parameter name → type binding in the type-map sink, using +/// the default confidence of `0.9` shared by every Rust extractor. +pub fn push_type_map_entry( + symbols: &mut FileSymbols, + name: impl Into, + type_name: impl Into, +) { + let name = name.into(); + if name.is_empty() { + return; + } + symbols.type_map.push(TypeMapEntry { + name, + type_name: type_name.into(), + confidence: 0.9, + }); +} + +/// C-family `declaration` / `parameter_declaration` type-map matcher. 
+/// +/// The cpp / cuda / c extractors all emit verbatim copies of the same +/// `match_*_type_map` walker — they share node kinds (`declaration`, +/// `init_declarator`, `parameter_declaration`) and only differ in the +/// per-language declarator-unwrap helper. This helper centralises the +/// shared walker; callers supply the language's `unwrap_declarator` +/// closure (e.g. `unwrap_cpp_declarator`). +/// +/// Returns whether the node was a relevant C-family type-map node. The +/// generic [`walk_tree`] match-fn signature still wraps this helper so +/// the helper can be called from a tiny per-language adapter. +pub fn match_c_family_type_map( + node: &Node, + source: &[u8], + symbols: &mut FileSymbols, + mut unwrap_declarator: F, +) -> bool +where + F: FnMut(&Node, &[u8]) -> String, +{ + match node.kind() { + "declaration" => { + let Some(type_node) = node.child_by_field_name("type") else { + return false; + }; + let type_name = node_text(&type_node, source).to_string(); + for i in 0..node.child_count() { + let Some(child) = node.child(i) else { continue }; + let kind = child.kind(); + if kind != "init_declarator" && kind != "identifier" { + continue; + } + let name_node = if kind == "init_declarator" { + child.child_by_field_name("declarator") + } else { + Some(child) + }; + let Some(name_node) = name_node else { continue }; + let final_name = unwrap_declarator(&name_node, source); + push_type_map_entry(symbols, final_name, type_name.clone()); + } + true + } + "parameter_declaration" => { + let Some(type_node) = node.child_by_field_name("type") else { + return false; + }; + let Some(decl) = node.child_by_field_name("declarator") else { + return false; + }; + let name = unwrap_declarator(&decl, source); + let type_name = node_text(&type_node, source).to_string(); + push_type_map_entry(symbols, name, type_name); + true + } + _ => false, + } +} From f10fcab4859615d002240ca3580006734b49ff7e Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:03:31 
-0600 Subject: [PATCH 06/27] refactor(extractors-rs): adopt shared helpers across language extractors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5 of the Rust extractor refactor plan (sync.json cluster 2). Adopts the helpers extended in 0d687c4 (push_call, push_simple_call, push_import, push_type_map_entry, extract_simple_parameters, match_c_family_type_map) across eight language extractors: - cpp.rs: collapse match_cpp_type_map to a one-line delegate of match_c_family_type_map; use push_import/push_simple_call/push_call for include and call sites - cuda.rs: same delegation as cpp.rs; use push_import/push_simple_call/ push_call across include and call_expression handlers - java.rs: use push_type_map_entry for local-variable / formal-parameter bindings; use push_call/push_simple_call for method invocation and object creation; collapse extract_java_parameters to a one-shot extract_simple_parameters call; use push_import for import declaration - javascript.rs: use push_simple_call for new_expression identifier branch; use push_type_map_entry for the confidence-0.9 type entries - julia.rs: use push_simple_call/push_call across identifier and field_expression / scoped_identifier call branches - objc.rs: use push_import for at_import; use push_call for c-call and message-expression handlers (drops redundant is_empty guards) - r_lang.rs: use push_simple_call/push_call across identifier and namespace_operator call branches; use push_import for library/source - solidity.rs: use push_call (drops redundant guard) for call sites; collapse extract_sol_params to a one-shot extract_simple_parameters Net: -207 lines across 8 files, no behavior change. cargo check clean, 324 rust unit tests pass. Pre-existing test failure: tests/engines/parity.test.ts has two failing elixir cases unrelated to this commit (filed as #1227 — regression from commit 5abe6ad in Phase 3). 
--- crates/codegraph-core/src/extractors/cpp.rs | 73 +++--------------- crates/codegraph-core/src/extractors/cuda.rs | 76 +++---------------- crates/codegraph-core/src/extractors/java.rs | 73 ++++++++---------- .../src/extractors/javascript.rs | 23 ++---- crates/codegraph-core/src/extractors/julia.rs | 21 +---- crates/codegraph-core/src/extractors/objc.rs | 26 +------ .../codegraph-core/src/extractors/r_lang.rs | 33 ++------ .../codegraph-core/src/extractors/solidity.rs | 36 +++------ 8 files changed, 77 insertions(+), 284 deletions(-) diff --git a/crates/codegraph-core/src/extractors/cpp.rs b/crates/codegraph-core/src/extractors/cpp.rs index 676f1105f..0ed8f4cda 100644 --- a/crates/codegraph-core/src/extractors/cpp.rs +++ b/crates/codegraph-core/src/extractors/cpp.rs @@ -20,49 +20,9 @@ impl SymbolExtractor for CppExtractor { // ── Type inference ────────────────────────────────────────────────────────── fn match_cpp_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) { - match node.kind() { - "declaration" => { - if let Some(type_node) = node.child_by_field_name("type") { - let type_name = node_text(&type_node, source); - for i in 0..node.child_count() { - if let Some(child) = node.child(i) { - if child.kind() == "init_declarator" || child.kind() == "identifier" { - let name_node = if child.kind() == "init_declarator" { - child.child_by_field_name("declarator") - } else { - Some(child) - }; - if let Some(name_node) = name_node { - let final_name = unwrap_cpp_declarator(&name_node, source); - if !final_name.is_empty() { - symbols.type_map.push(TypeMapEntry { - name: final_name, - type_name: type_name.to_string(), - confidence: 0.9, - }); - } - } - } - } - } - } - } - "parameter_declaration" => { - if let Some(type_node) = node.child_by_field_name("type") { - if let Some(decl) = node.child_by_field_name("declarator") { - let name = unwrap_cpp_declarator(&decl, source); - if !name.is_empty() { - symbols.type_map.push(TypeMapEntry { - name, 
- type_name: node_text(&type_node, source).to_string(), - confidence: 0.9, - }); - } - } - } - } - _ => {} - } + // Delegate the shared C-family declaration / parameter_declaration walker + // to the helper; supply the C++ declarator unwrap closure. + match_c_family_type_map(node, source, symbols, unwrap_cpp_declarator); } fn unwrap_cpp_declarator(node: &Node, source: &[u8]) -> String { @@ -353,9 +313,9 @@ fn handle_cpp_preproc_include(node: &Node, source: &[u8], symbols: &mut FileSymb let name = last.strip_suffix(".h") .or_else(|| last.strip_suffix(".hpp")) .unwrap_or(last); - let mut imp = Import::new(path.to_string(), vec![name.to_string()], start_line(node)); - imp.c_include = Some(true); - symbols.imports.push(imp); + push_import(symbols, node, path.to_string(), vec![name.to_string()], |imp| { + imp.c_include = Some(true); + }); } } } @@ -364,12 +324,7 @@ fn handle_cpp_call_expression(node: &Node, source: &[u8], symbols: &mut FileSymb if let Some(fn_node) = node.child_by_field_name("function") { match fn_node.kind() { "identifier" | "qualified_identifier" | "scoped_identifier" => { - symbols.calls.push(Call { - name: node_text(&fn_node, source).to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, node_text(&fn_node, source).to_string()); } "field_expression" => { let name = named_child_text(&fn_node, "field", source) @@ -377,20 +332,10 @@ fn handle_cpp_call_expression(node: &Node, source: &[u8], symbols: &mut FileSymb .unwrap_or_else(|| node_text(&fn_node, source).to_string()); let receiver = named_child_text(&fn_node, "argument", source) .map(|s| s.to_string()); - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver, - }); + push_call(symbols, node, name, receiver, None); } _ => { - symbols.calls.push(Call { - name: node_text(&fn_node, source).to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, 
node_text(&fn_node, source).to_string()); } } } diff --git a/crates/codegraph-core/src/extractors/cuda.rs b/crates/codegraph-core/src/extractors/cuda.rs index b8c67127a..f322a44c5 100644 --- a/crates/codegraph-core/src/extractors/cuda.rs +++ b/crates/codegraph-core/src/extractors/cuda.rs @@ -44,49 +44,10 @@ impl SymbolExtractor for CudaExtractor { /// nodes. Mirrors `match_cpp_type_map` in `cpp.rs` — the CUDA grammar shares /// these C++ node types, so the same logic works unchanged. fn match_cuda_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) { - match node.kind() { - "declaration" => { - if let Some(type_node) = node.child_by_field_name("type") { - let type_name = node_text(&type_node, source); - for i in 0..node.child_count() { - if let Some(child) = node.child(i) { - if child.kind() == "init_declarator" || child.kind() == "identifier" { - let name_node = if child.kind() == "init_declarator" { - child.child_by_field_name("declarator") - } else { - Some(child) - }; - if let Some(name_node) = name_node { - let final_name = unwrap_cuda_declarator(&name_node, source); - if !final_name.is_empty() { - symbols.type_map.push(TypeMapEntry { - name: final_name, - type_name: type_name.to_string(), - confidence: 0.9, - }); - } - } - } - } - } - } - } - "parameter_declaration" => { - if let Some(type_node) = node.child_by_field_name("type") { - if let Some(decl) = node.child_by_field_name("declarator") { - let name = unwrap_cuda_declarator(&decl, source); - if !name.is_empty() { - symbols.type_map.push(TypeMapEntry { - name, - type_name: node_text(&type_node, source).to_string(), - confidence: 0.9, - }); - } - } - } - } - _ => {} - } + // Delegate to the shared C-family walker; pass the CUDA declarator unwrap + // closure so pointer / reference / function declarators yield the bare + // identifier name. 
+ match_c_family_type_map(node, source, symbols, unwrap_cuda_declarator); } // ── CUDA-specific qualifiers ──────────────────────────────────────────────── @@ -522,13 +483,9 @@ fn handle_cuda_preproc_include(node: &Node, source: &[u8], symbols: &mut FileSym .or_else(|| last.strip_suffix(".hpp")) .or_else(|| last.strip_suffix(".h")) .unwrap_or(last); - let mut imp = Import::new( - path.to_string(), - vec![name.to_string()], - start_line(node), - ); - imp.c_include = Some(true); - symbols.imports.push(imp); + push_import(symbols, node, path.to_string(), vec![name.to_string()], |imp| { + imp.c_include = Some(true); + }); } } } @@ -540,24 +497,9 @@ fn handle_cuda_call_expression(node: &Node, source: &[u8], symbols: &mut FileSym .map(|s| s.to_string()) .unwrap_or_default(); let receiver = named_child_text(&fn_node, "argument", source).map(|s| s.to_string()); - if !name.is_empty() { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver, - }); - } + push_call(symbols, node, name, receiver, None); } else { - let name = node_text(&fn_node, source).to_string(); - if !name.is_empty() { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver: None, - }); - } + push_simple_call(symbols, node, node_text(&fn_node, source).to_string()); } } } diff --git a/crates/codegraph-core/src/extractors/java.rs b/crates/codegraph-core/src/extractors/java.rs index a7c4bf6d1..94dd99e87 100644 --- a/crates/codegraph-core/src/extractors/java.rs +++ b/crates/codegraph-core/src/extractors/java.rs @@ -36,11 +36,11 @@ fn match_java_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _d if let Some(child) = node.child(i) { if child.kind() == "variable_declarator" { if let Some(name_node) = child.child_by_field_name("name") { - symbols.type_map.push(TypeMapEntry { - name: node_text(&name_node, source).to_string(), - type_name: type_name.to_string(), - confidence: 0.9, - }); + push_type_map_entry( + symbols, + 
node_text(&name_node, source).to_string(), + type_name.to_string(), + ); } } } @@ -52,11 +52,11 @@ fn match_java_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _d if let Some(type_node) = node.child_by_field_name("type") { if let Some(type_name) = extract_java_type_name(&type_node, source) { if let Some(name_node) = node.child_by_field_name("name") { - symbols.type_map.push(TypeMapEntry { - name: node_text(&name_node, source).to_string(), - type_name: type_name.to_string(), - confidence: 0.9, - }); + push_type_map_entry( + symbols, + node_text(&name_node, source).to_string(), + type_name.to_string(), + ); } } } @@ -266,9 +266,9 @@ fn handle_import_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { let last = import_path.split('.').last().unwrap_or("").to_string(); vec![last] }; - let mut imp = Import::new(import_path, names, start_line(node)); - imp.java_import = Some(true); - symbols.imports.push(imp); + push_import(symbols, node, import_path, names, |imp| { + imp.java_import = Some(true); + }); } } @@ -276,12 +276,13 @@ fn handle_method_invocation(node: &Node, source: &[u8], symbols: &mut FileSymbol if let Some(name_node) = node.child_by_field_name("name") { let receiver = named_child_text(node, "object", source) .map(|s| s.to_string()); - symbols.calls.push(Call { - name: node_text(&name_node, source).to_string(), - line: start_line(node), - dynamic: None, + push_call( + symbols, + node, + node_text(&name_node, source).to_string(), receiver, - }); + None, + ); } } @@ -293,37 +294,25 @@ fn handle_object_creation(node: &Node, source: &[u8], symbols: &mut FileSymbols) Some(node_text(&type_node, source).to_string()) }; if let Some(name) = type_name { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, name); } } // ── Extended kinds helpers ────────────────────────────────────────────────── fn extract_java_parameters(node: &Node, source: &[u8]) -> Vec { - 
let mut params = Vec::new(); - let params_node = node.child_by_field_name("parameters") + let params_node = node + .child_by_field_name("parameters") .or_else(|| find_child(node, "formal_parameters")); - if let Some(params_node) = params_node { - for i in 0..params_node.child_count() { - if let Some(child) = params_node.child(i) { - if child.kind() == "formal_parameter" || child.kind() == "spread_parameter" { - if let Some(name_node) = child.child_by_field_name("name") { - params.push(child_def( - node_text(&name_node, source).to_string(), - "parameter", - start_line(&child), - )); - } - } - } - } - } - params + extract_simple_parameters( + params_node, + source, + &ExtractParametersOptions { + param_kinds: &["formal_parameter", "spread_parameter"], + name_field: Some("name"), + fallback_to_identifier: false, + }, + ) } fn extract_java_class_fields(node: &Node, source: &[u8]) -> Vec { diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index 3a56d4d6d..d5403aa0f 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -61,11 +61,7 @@ fn match_js_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _dep // Type annotation: confidence 0.9 if let Some(type_anno) = find_child(node, "type_annotation") { if let Some(type_name) = extract_simple_type_name(&type_anno, source) { - symbols.type_map.push(TypeMapEntry { - name: var_name.to_string(), - type_name: type_name.to_string(), - confidence: 0.9, - }); + push_type_map_entry(symbols, var_name.to_string(), type_name.to_string()); } } // Constructor: confidence 1.0 (overrides annotation in edge builder) @@ -91,11 +87,11 @@ fn match_js_type_map(node: &Node, source: &[u8], symbols: &mut FileSymbols, _dep if name_node.kind() == "identifier" { if let Some(type_anno) = find_child(node, "type_annotation") { if let Some(type_name) = extract_simple_type_name(&type_anno, source) { - 
symbols.type_map.push(TypeMapEntry { - name: node_text(&name_node, source).to_string(), - type_name: type_name.to_string(), - confidence: 0.9, - }); + push_type_map_entry( + symbols, + node_text(&name_node, source).to_string(), + type_name.to_string(), + ); } } } @@ -333,12 +329,7 @@ fn handle_new_expr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { let Some(ctor) = ctor else { return }; match ctor.kind() { "identifier" => { - symbols.calls.push(Call { - name: node_text(&ctor, source).to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, node_text(&ctor, source).to_string()); } "member_expression" => { if let Some(call_info) = extract_call_info(&ctor, node, source) { diff --git a/crates/codegraph-core/src/extractors/julia.rs b/crates/codegraph-core/src/extractors/julia.rs index 61acb77a9..f8ceeb6c1 100644 --- a/crates/codegraph-core/src/extractors/julia.rs +++ b/crates/codegraph-core/src/extractors/julia.rs @@ -482,12 +482,7 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { match func_node.kind() { "identifier" => { - symbols.calls.push(Call { - name: node_text(&func_node, source).to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, node_text(&func_node, source).to_string()); } "field_expression" | "scoped_identifier" => { let raw = node_text(&func_node, source); @@ -495,19 +490,9 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { if parts.len() >= 2 { let last = parts.last().copied().unwrap_or(""); let receiver = parts[..parts.len() - 1].join("."); - symbols.calls.push(Call { - name: last.to_string(), - line: start_line(node), - dynamic: None, - receiver: Some(receiver), - }); + push_call(symbols, node, last.to_string(), Some(receiver), None); } else { - symbols.calls.push(Call { - name: raw.to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + 
push_simple_call(symbols, node, raw.to_string()); } } _ => {} diff --git a/crates/codegraph-core/src/extractors/objc.rs b/crates/codegraph-core/src/extractors/objc.rs index 50e68140b..edd029f88 100644 --- a/crates/codegraph-core/src/extractors/objc.rs +++ b/crates/codegraph-core/src/extractors/objc.rs @@ -226,11 +226,7 @@ fn handle_at_import(node: &Node, source: &[u8], symbols: &mut FileSymbols) { .or_else(|| find_child(node, "identifier")); if let Some(m) = module_node { let name = node_text(&m, source).to_string(); - symbols.imports.push(Import::new( - name.clone(), - vec![name], - start_line(node), - )); + push_import(symbols, node, name.clone(), vec![name], |_| {}); } } @@ -329,14 +325,7 @@ fn handle_c_call_expr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { (node_text(&fn_node, source).to_string(), None) }; - if !name.is_empty() { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver, - }); - } + push_call(symbols, node, name, receiver, None); } /// `[receiver selector:arg ...]` message send. 
The grammar gives every @@ -347,16 +336,7 @@ fn handle_message_expr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { .map(|n| node_text(&n, source).to_string()); let selector = build_message_selector(node, source); - if selector.is_empty() { - return; - } - - symbols.calls.push(Call { - name: selector, - line: start_line(node), - dynamic: None, - receiver, - }); + push_call(symbols, node, selector, receiver, None); } // ── Helpers ─────────────────────────────────────────────────────────────── diff --git a/crates/codegraph-core/src/extractors/r_lang.rs b/crates/codegraph-core/src/extractors/r_lang.rs index d5d89bdb8..3686c562e 100644 --- a/crates/codegraph-core/src/extractors/r_lang.rs +++ b/crates/codegraph-core/src/extractors/r_lang.rs @@ -177,12 +177,7 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { match func_node.kind() { "identifier" => { - symbols.calls.push(Call { - name: func_text.to_string(), - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, func_text.to_string()); } "namespace_operator" => { // `pkg::func` — receiver is the package; name is the function. 
@@ -190,12 +185,7 @@ fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { if parts.len() >= 2 { let name = parts[parts.len() - 1].to_string(); let receiver = parts[..parts.len() - 1].join("::"); - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver: Some(receiver), - }); + push_call(symbols, node, name, Some(receiver), None); } } _ => {} @@ -287,22 +277,14 @@ fn strip_string_quotes(node: &Node, source: &[u8]) -> String { fn handle_library_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { if let Some(pkg) = first_argument_value(node, source, true) { - symbols.imports.push(Import::new( - pkg.clone(), - vec![pkg], - start_line(node), - )); + push_import(symbols, node, pkg.clone(), vec![pkg], |_| {}); } } fn handle_source_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // source() only accepts string literals — `source(varname)` is not an import. if let Some(path) = first_argument_value(node, source, false) { - symbols.imports.push(Import::new( - path, - vec!["source".to_string()], - start_line(node), - )); + push_import(symbols, node, path, vec!["source".to_string()], |_| {}); } } @@ -344,12 +326,7 @@ fn handle_set_generic(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // recursive walk of the anonymous function argument. 
fn handle_set_method(node: &Node, source: &[u8], symbols: &mut FileSymbols) { if let Some(name) = first_argument_value(node, source, false) { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver: None, - }); + push_simple_call(symbols, node, name); } } diff --git a/crates/codegraph-core/src/extractors/solidity.rs b/crates/codegraph-core/src/extractors/solidity.rs index 0302250ee..313b259fa 100644 --- a/crates/codegraph-core/src/extractors/solidity.rs +++ b/crates/codegraph-core/src/extractors/solidity.rs @@ -459,40 +459,24 @@ fn handle_call_expression(node: &Node, source: &[u8], symbols: &mut FileSymbols) _ => (node_text(&func_node, source).to_string(), None), }; - if !name.is_empty() { - symbols.calls.push(Call { - name, - line: start_line(node), - dynamic: None, - receiver, - }); - } + push_call(symbols, node, name, receiver, None); } // ── Helpers ────────────────────────────────────────────────────────────────── fn extract_sol_params(func_node: &Node, source: &[u8]) -> Vec { - let mut params = Vec::new(); let param_list = func_node .child_by_field_name("parameters") .or_else(|| find_child(func_node, "parameter_list")); - let Some(param_list) = param_list else { - return params; - }; - for i in 0..param_list.child_count() { - let Some(param) = param_list.child(i) else { continue }; - if param.kind() != "parameter" { - continue; - } - if let Some(name_node) = param.child_by_field_name("name") { - params.push(child_def( - node_text(&name_node, source).to_string(), - "parameter", - start_line(¶m), - )); - } - } - params + extract_simple_parameters( + param_list, + source, + &ExtractParametersOptions { + param_kinds: &["parameter"], + name_field: Some("name"), + fallback_to_identifier: false, + }, + ) } /// Find the name of an enclosing contract/interface/library, if any. 
From d9bbc8f4d1298521a606612254ca3116d42d4cba Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:07:23 -0600 Subject: [PATCH 07/27] refactor(extractors-rs): break elixir param/map binding cycle Convert collect_elixir_param_identifiers from mutual-recursion with collect_elixir_map_bindings into a single iterative worklist traversal. Map/list/tuple/binary-operator dispatch is now done via three leaf helpers (push_elixir_sequence_items, push_elixir_map_values, push_elixir_binary_operator_operands) that push child nodes onto the worklist instead of calling back into the main function. This removes the function-level cycle flagged by codegraph (8 -> 7 cycles) and mirrors the TS refactor in 5abe6ad without changing extractor semantics. docs check acknowledged: internal refactor only. --- .../codegraph-core/src/extractors/elixir.rs | 143 ++++++++++-------- 1 file changed, 76 insertions(+), 67 deletions(-) diff --git a/crates/codegraph-core/src/extractors/elixir.rs b/crates/codegraph-core/src/extractors/elixir.rs index b9fa8686a..cddacf42c 100644 --- a/crates/codegraph-core/src/extractors/elixir.rs +++ b/crates/codegraph-core/src/extractors/elixir.rs @@ -157,84 +157,93 @@ fn extract_elixir_params(args: &Node, source: &[u8]) -> Vec { params } -/// Recursively walk a parameter pattern and emit each bound identifier as a -/// `parameter` child. Handles bare identifiers, default-value `a \\ default`, -/// list-cons `[head | tail]`, list `[a, b, c]`, tuple `{x, y}`, and -/// map / struct destructuring (`%{k: v}`, `%Foo{k: v}`). -fn collect_elixir_param_identifiers(node: &Node, source: &[u8], out: &mut Vec) { - match node.kind() { - "identifier" => { - out.push(child_def( - node_text(node, source).to_string(), - "parameter", - start_line(node), - )); - } - "binary_operator" => { - // `name \\ default` (default-value) binds the left operand only. - // `head | tail` (list-cons, appears inside a `list` pattern) binds both operands. 
- let Some(op) = node.child(1) else { return }; - match op.kind() { - "\\\\" => { - if let Some(left) = node.child(0) { - collect_elixir_param_identifiers(&left, source, out); - } - } - "|" => { - if let Some(left) = node.child(0) { - collect_elixir_param_identifiers(&left, source, out); - } - if let Some(right) = node.child(2) { - collect_elixir_param_identifiers(&right, source, out); - } - } - _ => {} +/// Walk a parameter pattern and emit each bound identifier as a `parameter` +/// child. Handles bare identifiers, default-value `a \\ default`, list-cons +/// `[head | tail]`, list `[a, b, c]`, tuple `{x, y}`, and map / struct +/// destructuring (`%{k: v}`, `%Foo{k: v}`). +/// +/// Implemented as an iterative worklist (rather than recursion + helpers) so +/// the call graph has no function-level cycle: only one function performs the +/// traversal and it invokes only leaf helpers (`push_elixir_sequence_items`, +/// `push_elixir_map_values`, `push_elixir_binary_operator_operands`). +fn collect_elixir_param_identifiers(root: &Node, source: &[u8], out: &mut Vec) { + let mut stack: Vec = vec![*root]; + while let Some(node) = stack.pop() { + match node.kind() { + "identifier" => { + out.push(child_def( + node_text(&node, source).to_string(), + "parameter", + start_line(&node), + )); } - } - "list" => { - // `[a, b, c]` or `[head | tail]` — walk children, skipping punctuation. - // The `|` cons case is handled by the `binary_operator` arm on recursion. 
- for i in 0..node.child_count() { - let Some(c) = node.child(i) else { continue }; - let k = c.kind(); - if k == "[" || k == "]" || k == "," { continue; } - collect_elixir_param_identifiers(&c, source, out); + "binary_operator" => { + push_elixir_binary_operator_operands(&node, &mut stack); + } + "list" | "tuple" => { + push_elixir_sequence_items(&node, &mut stack); + } + "map" => { + push_elixir_map_values(&node, &mut stack); } + _ => {} } - "tuple" => { - for i in 0..node.child_count() { - let Some(c) = node.child(i) else { continue }; - let k = c.kind(); - if k == "{" || k == "}" || k == "," { continue; } - collect_elixir_param_identifiers(&c, source, out); + } +} + +/// Push the binding-relevant operands of a `binary_operator` parameter onto the +/// worklist: +/// - `name \\ default` (default-value) binds the left operand only. +/// - `head | tail` (list-cons, appears inside a `list` pattern) binds both. +fn push_elixir_binary_operator_operands<'a>(node: &Node<'a>, stack: &mut Vec>) { + let Some(op) = node.child(1) else { return }; + match op.kind() { + "\\\\" => { + if let Some(left) = node.child(0) { + stack.push(left); } } - "map" => { - // `%{k: v}` or `%Foo{k: v}` — walk map_content > keywords > pair and emit - // each pair's value side (the bound name). The leading `struct` alias is a - // type, not a bound identifier, so it is intentionally skipped. 
- for i in 0..node.child_count() { - let Some(c) = node.child(i) else { continue }; - if c.kind() == "map_content" { - collect_elixir_map_bindings(&c, source, out); - } + "|" => { + if let Some(right) = node.child(2) { + stack.push(right); + } + if let Some(left) = node.child(0) { + stack.push(left); } } _ => {} } } -fn collect_elixir_map_bindings(content: &Node, source: &[u8], out: &mut Vec) { - for i in 0..content.child_count() { - let Some(kws) = content.child(i) else { continue }; - if kws.kind() != "keywords" { continue; } - for j in 0..kws.child_count() { - let Some(pair) = kws.child(j) else { continue }; - if pair.kind() != "pair" { continue; } - for k in 0..pair.child_count() { - let Some(part) = pair.child(k) else { continue }; - if part.kind() == "keyword" { continue; } - collect_elixir_param_identifiers(&part, source, out); +/// Push the binding-relevant elements of a `list` or `tuple` parameter onto +/// the worklist, skipping punctuation tokens. +fn push_elixir_sequence_items<'a>(node: &Node<'a>, stack: &mut Vec>) { + for i in 0..node.child_count() { + let Some(c) = node.child(i) else { continue }; + let k = c.kind(); + if k == "[" || k == "]" || k == "{" || k == "}" || k == "," { continue; } + stack.push(c); + } +} + +/// Push the value side of every pair in a `map` or `%Foo{...}` parameter onto +/// the worklist. The struct alias (`Foo`) is a type, not a bound identifier, so +/// the leading `struct` child is intentionally skipped. 
+fn push_elixir_map_values<'a>(node: &Node<'a>, stack: &mut Vec>) { + for i in 0..node.child_count() { + let Some(content) = node.child(i) else { continue }; + if content.kind() != "map_content" { continue; } + for j in 0..content.child_count() { + let Some(kws) = content.child(j) else { continue }; + if kws.kind() != "keywords" { continue; } + for k in 0..kws.child_count() { + let Some(pair) = kws.child(k) else { continue }; + if pair.kind() != "pair" { continue; } + for p in 0..pair.child_count() { + let Some(part) = pair.child(p) else { continue }; + if part.kind() == "keyword" { continue; } + stack.push(part); + } } } } From 24c8cf51ec2074dffcb65f0a337b2ed5d36dd989 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:13:37 -0600 Subject: [PATCH 08/27] refactor(ast-analysis): break visitor-utils destructuring cycle --- src/ast-analysis/visitor-utils.ts | 132 +++++++++++++++++++----------- 1 file changed, 86 insertions(+), 46 deletions(-) diff --git a/src/ast-analysis/visitor-utils.ts b/src/ast-analysis/visitor-utils.ts index 530787d2d..4b161c2e3 100644 --- a/src/ast-analysis/visitor-utils.ts +++ b/src/ast-analysis/visitor-utils.ts @@ -88,78 +88,118 @@ export function extractParams( return result; } -/** Extract names from a rest parameter (e.g. `...args`). */ -function extractRestParamNames(node: TreeSitterNode, rules: LanguageRules): string[] { - const nameNode = node.childForFieldName('name'); - if (nameNode) return [nameNode.text]; - for (const child of node.namedChildren) { - if (child.type === rules.paramIdentifier) return [child.text]; - } - return []; -} - -/** Extract names from an object destructuring pattern (e.g. `{ a, b: c }`). 
*/ -function extractObjectDestructNames(node: TreeSitterNode, rules: LanguageRules): string[] { - const names: string[] = []; - for (const child of node.namedChildren) { - if (rules.shorthandPropPattern && child.type === rules.shorthandPropPattern) { - names.push(child.text); - } else if (rules.pairPatternType && child.type === rules.pairPatternType) { - const value = child.childForFieldName('value'); - if (value) names.push(...extractParamNames(value, rules)); - } else if (rules.restParamType && child.type === rules.restParamType) { - names.push(...extractParamNames(child, rules)); - } - } - return names; -} - -/** Extract names from an array destructuring pattern (e.g. `[a, b]`). */ -function extractArrayDestructNames(node: TreeSitterNode, rules: LanguageRules): string[] { - const names: string[] = []; - for (const child of node.namedChildren) { - names.push(...extractParamNames(child, rules)); - } - return names; -} - /** - * Extract parameter names from a single parameter node. + * Resolve a single parameter node to either a direct list of names (base case) + * or a list of child nodes that still need processing. Returns `null` if the + * node yields nothing. + * + * This base case keeps destructuring helpers from recursing back into + * `extractParamNames`, breaking the 3-node mutual recursion cycle between + * `extractParamNames`, `extractObjectDestructNames`, and `extractArrayDestructNames`. 
*/ -export function extractParamNames(node: TreeSitterNode | null, rules: LanguageRules): string[] { - if (!node) return []; +function resolveParamNode( + node: TreeSitterNode, + rules: LanguageRules, +): { names?: string[]; next?: TreeSitterNode[] } | null { const t = node.type; if (rules.extractParamName) { const result = rules.extractParamName(node); - if (result) return result; + if (result) return { names: result }; } - if (t === rules.paramIdentifier) return [node.text]; + if (t === rules.paramIdentifier) return { names: [node.text] }; if (rules.paramWrapperTypes.has(t)) { const pattern = node.childForFieldName('pattern') || node.childForFieldName('name'); - return pattern ? extractParamNames(pattern, rules) : []; + return pattern ? { next: [pattern] } : null; } if (rules.defaultParamType && t === rules.defaultParamType) { const left = node.childForFieldName('left') || node.childForFieldName('name'); - return left ? extractParamNames(left, rules) : []; + return left ? { next: [left] } : null; } if (rules.restParamType && t === rules.restParamType) { - return extractRestParamNames(node, rules); + const nameNode = node.childForFieldName('name'); + if (nameNode) return { names: [nameNode.text] }; + for (const child of node.namedChildren) { + if (child.type === rules.paramIdentifier) return { names: [child.text] }; + } + return null; } if (rules.objectDestructType && t === rules.objectDestructType) { - return extractObjectDestructNames(node, rules); + return { next: collectObjectDestructChildren(node, rules) }; } if (rules.arrayDestructType && t === rules.arrayDestructType) { - return extractArrayDestructNames(node, rules); + return { next: [...node.namedChildren] }; + } + + return null; +} + +/** + * Collect child nodes from an object destructuring pattern that should be + * processed for further name extraction. Returns nodes (not names) so the + * caller drives traversal via a worklist instead of recursion. 
+ */ +function collectObjectDestructChildren( + node: TreeSitterNode, + rules: LanguageRules, +): TreeSitterNode[] { + const next: TreeSitterNode[] = []; + for (const child of node.namedChildren) { + if (rules.shorthandPropPattern && child.type === rules.shorthandPropPattern) { + // Shorthand prop is a direct identifier — handled in the worklist + // by `resolveParamNode` once requeued. + next.push(child); + } else if (rules.pairPatternType && child.type === rules.pairPatternType) { + const value = child.childForFieldName('value'); + if (value) next.push(value); + } else if (rules.restParamType && child.type === rules.restParamType) { + next.push(child); + } } + return next; +} - return []; +/** + * Extract parameter names from a single parameter node. + * + * Uses an iterative worklist to handle nested destructuring (objects, arrays, + * defaults, rest, wrappers) without mutual recursion through helper functions. + */ +export function extractParamNames(node: TreeSitterNode | null, rules: LanguageRules): string[] { + if (!node) return []; + + const names: string[] = []; + const stack: TreeSitterNode[] = [node]; + + while (stack.length > 0) { + const current = stack.pop(); + if (!current) continue; + + // Shorthand identifier inside an object destructuring is just the node's text. + if (rules.shorthandPropPattern && current.type === rules.shorthandPropPattern) { + names.push(current.text); + continue; + } + + const resolved = resolveParamNode(current, rules); + if (!resolved) continue; + if (resolved.names) names.push(...resolved.names); + if (resolved.next) { + // Push in reverse so traversal order matches the previous recursive order. 
+ for (let i = resolved.next.length - 1; i >= 0; i--) { + const child = resolved.next[i]; + if (child) stack.push(child); + } + } + } + + return names; } /** From 4f34034038357c26f3cfe5fc6b532caba84bf39c Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:21:47 -0600 Subject: [PATCH 09/27] refactor(ast-analysis): decompose engine and visitors --- src/ast-analysis/engine.ts | 206 ++++++++++++------ .../visitors/ast-store-visitor.ts | 173 +++++++++------ src/ast-analysis/visitors/dataflow-visitor.ts | 137 ++++++++---- 3 files changed, 339 insertions(+), 177 deletions(-) diff --git a/src/ast-analysis/engine.ts b/src/ast-analysis/engine.ts index c96989437..958485147 100644 --- a/src/ast-analysis/engine.ts +++ b/src/ast-analysis/engine.ts @@ -753,6 +753,146 @@ function allNativeDataComplete( // ─── Public API ────────────────────────────────────────────────────────── +/** Distribute the per-file walk time equally among the visitors that ran. */ +function accumulateWalkTime( + timing: AnalysisTiming, + walkMs: number, + astVisitor: Visitor | null, + complexityVisitor: Visitor | null, + cfgVisitor: Visitor | null, + dataflowVisitor: Visitor | null, +): void { + const activeCount = [astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor].filter( + Boolean, + ).length; + if (activeCount === 0) return; + + const share = walkMs / activeCount; + if (astVisitor) timing.astMs += share; + if (complexityVisitor) timing.complexityMs += share; + if (cfgVisitor) timing.cfgMs += share; + if (dataflowVisitor) timing.dataflowMs += share; +} + +/** Apply visitor walk results to the per-file symbols/definitions. 
*/ +function applyVisitorResults( + results: WalkResults, + symbols: ExtractorOutput, + langId: string, + astVisitor: Visitor | null, + complexityVisitor: Visitor | null, + cfgVisitor: Visitor | null, + dataflowVisitor: Visitor | null, +): void { + const defs = symbols.definitions || []; + + if (astVisitor) { + const astRows = (results['ast-store'] || []) as ASTNodeRow[]; + if (astRows.length > 0) symbols.astNodes = astRows; + } + + if (complexityVisitor) storeComplexityResults(results, defs, langId); + if (cfgVisitor) storeCfgResults(results, defs); + if (dataflowVisitor) symbols.dataflow = results.dataflow as DataflowResult; +} + +/** Process a single file: set up visitors, walk the tree, and apply results. */ +function processFileWalk( + db: BetterSqlite3Database, + relPath: string, + symbols: ExtractorOutput, + langId: string, + opts: AnalysisOpts, + timing: AnalysisTiming, +): void { + if (!symbols._tree) return; + + const { visitors, walkerOpts, astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor } = + setupVisitors(db, relPath, symbols, langId, opts); + + if (visitors.length === 0) return; + + const walkStart = performance.now(); + const results = walkWithVisitors(symbols._tree.rootNode, visitors, langId, walkerOpts); + const walkMs = performance.now() - walkStart; + + accumulateWalkTime(timing, walkMs, astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor); + applyVisitorResults( + results, + symbols, + langId, + astVisitor, + complexityVisitor, + cfgVisitor, + dataflowVisitor, + ); +} + +/** + * Unified pre-walk: run all applicable visitors in a single DFS per file. + * Returns the total wall-clock time for diagnostics. 
+ */ +function runUnifiedWalkPass( + db: BetterSqlite3Database, + fileSymbols: Map, + extToLang: Map, + opts: AnalysisOpts, + timing: AnalysisTiming, +): number { + const t0walk = performance.now(); + + for (const [relPath, symbols] of fileSymbols) { + if (!symbols._tree) continue; + + const ext = path.extname(relPath).toLowerCase(); + const langId = symbols._langId || extToLang.get(ext); + if (!langId) continue; + + processFileWalk(db, relPath, symbols, langId, opts, timing); + } + + return performance.now() - t0walk; +} + +/** Try native Rust standalone analysis to fill gaps before WASM fallback. */ +function tryNativeStandaloneAnalysis( + fileSymbols: Map, + rootDir: string, + opts: AnalysisOpts, + extToLang: Map, +): void { + const native = loadNative(); + if (!native?.analyzeComplexity && !native?.buildCfgAnalysis && !native?.extractDataflowAnalysis) { + return; + } + const t0native = performance.now(); + runNativeAnalysis(native, fileSymbols, rootDir, opts, extToLang); + debug(`native standalone analysis: ${(performance.now() - t0native).toFixed(1)}ms`); +} + +/** + * Fast path: when all files were parsed by the native engine with full analysis, + * skip WASM re-parse and JS visitor walks entirely and go straight to DB persistence. + * Returns true if the fast path handled the work. 
+ */ +async function runFastPathIfApplicable( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + opts: AnalysisOpts, + engineOpts: EngineOpts | undefined, + timing: AnalysisTiming, +): Promise { + if (!allNativeDataComplete(fileSymbols, opts)) return false; + + debug('native full-analysis fast path: all data present, skipping WASM/visitor passes'); + const doComplexity = opts.complexity !== false; + const doCfg = opts.cfg !== false; + if (doComplexity && doCfg) reconcileCfgCyclomatic(fileSymbols); + await delegateToBuildFunctions(db, fileSymbols, rootDir, opts, engineOpts, timing); + return true; +} + export async function runAnalyses( db: BetterSqlite3Database, fileSymbols: Map, @@ -771,80 +911,24 @@ export async function runAnalyses( const extToLang = buildExtToLangMap(); - // Fast path: when all files were parsed by the native engine with full analysis - // (parseFilesFull), all data is already present — skip WASM re-parse and JS - // visitor walks entirely, go straight to DB persistence. - if (allNativeDataComplete(fileSymbols, opts)) { - debug('native full-analysis fast path: all data present, skipping WASM/visitor passes'); - if (doComplexity && doCfg) reconcileCfgCyclomatic(fileSymbols); - await delegateToBuildFunctions(db, fileSymbols, rootDir, opts, engineOpts, timing); + if (await runFastPathIfApplicable(db, fileSymbols, rootDir, opts, engineOpts, timing)) { return timing; } // Native analysis pass: try Rust standalone functions before WASM fallback. // This fills in complexity/CFG/dataflow for files that the native parse pipeline // missed, avoiding the need to parse with WASM + run JS visitors. 
- const native = loadNative(); - if (native?.analyzeComplexity || native?.buildCfgAnalysis || native?.extractDataflowAnalysis) { - const t0native = performance.now(); - runNativeAnalysis(native, fileSymbols, rootDir, opts, extToLang); - debug(`native standalone analysis: ${(performance.now() - t0native).toFixed(1)}ms`); - } + tryNativeStandaloneAnalysis(fileSymbols, rootDir, opts, extToLang); // WASM pre-parse for files that still need it (AST store, or native gaps) await ensureWasmTreesIfNeeded(fileSymbols, opts, rootDir); - // Unified pre-walk: run all applicable visitors in a single DFS per file. // Time each file's walk and distribute equally among active visitors // so that phase timers (astMs, complexityMs, etc.) reflect real work — not // just the DB-write tail in delegateToBuildFunctions. - const t0walk = performance.now(); - - for (const [relPath, symbols] of fileSymbols) { - if (!symbols._tree) continue; - - const ext = path.extname(relPath).toLowerCase(); - const langId = symbols._langId || extToLang.get(ext); - if (!langId) continue; - - const { visitors, walkerOpts, astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor } = - setupVisitors(db, relPath, symbols, langId, opts); - - if (visitors.length === 0) continue; - - const walkStart = performance.now(); - const results = walkWithVisitors(symbols._tree.rootNode, visitors, langId, walkerOpts); - const walkMs = performance.now() - walkStart; - - // Distribute walk time equally among active visitors - const activeCount = [astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor].filter( - Boolean, - ).length; - if (activeCount > 0) { - const share = walkMs / activeCount; - if (astVisitor) timing.astMs += share; - if (complexityVisitor) timing.complexityMs += share; - if (cfgVisitor) timing.cfgMs += share; - if (dataflowVisitor) timing.dataflowMs += share; - } - - const defs = symbols.definitions || []; - - if (astVisitor) { - const astRows = (results['ast-store'] || []) as ASTNodeRow[]; - if 
(astRows.length > 0) symbols.astNodes = astRows; - } - - if (complexityVisitor) storeComplexityResults(results, defs, langId); - if (cfgVisitor) storeCfgResults(results, defs); - if (dataflowVisitor) symbols.dataflow = results.dataflow as DataflowResult; - } - - // Total wall-clock time for the unified walk loop, including per-file - // setupVisitors overhead. Walk time is already distributed into per-phase - // timers above, so this field overlaps with (astMs + complexityMs + ...). - // It is kept as a diagnostic cross-check, not an additive bucket. - timing._unifiedWalkMs = performance.now() - t0walk; + // _unifiedWalkMs is kept as a diagnostic cross-check (overlaps with the + // per-phase timers above, not additive). + timing._unifiedWalkMs = runUnifiedWalkPass(db, fileSymbols, extToLang, opts, timing); // Reconcile: apply CFG-derived cyclomatic override for any definitions that have // both precomputed complexity and CFG data but whose cyclomatic was never overridden. diff --git a/src/ast-analysis/visitors/ast-store-visitor.ts b/src/ast-analysis/visitors/ast-store-visitor.ts index 661ceae2e..dd63515be 100644 --- a/src/ast-analysis/visitors/ast-store-visitor.ts +++ b/src/ast-analysis/visitors/ast-store-visitor.ts @@ -181,46 +181,17 @@ function newTypesFor(astTypeMap: Record): Set { return s; } -export function createAstStoreVisitor( - astTypeMap: Record, - defs: Definition[], - relPath: string, - nodeIdMap: Map, - stringConfig: AstStringConfig = DEFAULT_STRING_CONFIG, - stopRecurseKinds: ReadonlySet = new Set(), -): Visitor { - const rows: AstStoreRow[] = []; - const matched = new Set(); - const newTypes = newTypesFor(astTypeMap); - // When nodeIdMap is empty, parentNodeId resolution is wasted work — the - // worker passes an empty map and the main thread re-resolves against its - // own DB-populated map in features/ast.ts::collectFileAstRows. Skip the - // findParentDef linear scan in that case. 
- const skipParentLookup = nodeIdMap.size === 0; - - function findParentDef(line: number): Definition | null { - let best: Definition | null = null; - for (const def of defs) { - if (def.line <= line && (def.endLine == null || def.endLine >= line)) { - if (!best || (def.endLine ?? 0) - def.line < (best.endLine ?? 0) - best.line) { - best = def; - } - } - } - return best; - } - - function resolveParentNodeId(line: number): number | null { - if (skipParentLookup) return null; - const parentDef = findParentDef(line); - if (!parentDef) return null; - return nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; - } +type NameTextResult = { name: string | null | undefined; text: string | null; skip?: boolean }; +type KindHandler = (node: TreeSitterNode) => NameTextResult; - type NameTextResult = { name: string | null | undefined; text: string | null; skip?: boolean }; - type KindHandler = (node: TreeSitterNode) => NameTextResult; +const DEFAULT_NAME_TEXT_RESULT: NameTextResult = { name: undefined, text: null }; - const kindHandlers: Record = { +/** Build the per-kind resolver map for name/text extraction. */ +function buildKindHandlers( + newTypes: Set, + stringConfig: AstStringConfig, +): Record { + return { new: (node) => ({ name: extractConstructorName(node), text: truncate(node.text) }), throw: (node) => ({ name: extractThrowName(node, newTypes), @@ -234,31 +205,102 @@ export function createAstStoreVisitor( }, regex: (node) => ({ name: node.text || '?', text: truncate(node.text) }), }; - const defaultResult: NameTextResult = { name: undefined, text: null }; +} - function resolveNameAndText(node: TreeSitterNode, kind: string): NameTextResult { - const handler = kindHandlers[kind]; - return handler ? handler(node) : defaultResult; +/** Find the innermost definition whose line range contains `line`. 
*/ +function findParentDef(line: number, defs: Definition[]): Definition | null { + let best: Definition | null = null; + for (const def of defs) { + if (def.line <= line && (def.endLine == null || def.endLine >= line)) { + if (!best || (def.endLine ?? 0) - def.line < (best.endLine ?? 0) - best.line) { + best = def; + } + } } + return best; +} + +/** Resolve the parent definition's node id for a given source line. */ +function resolveParentNodeId( + line: number, + defs: Definition[], + nodeIdMap: Map, + skipParentLookup: boolean, +): number | null { + if (skipParentLookup) return null; + const parentDef = findParentDef(line, defs); + if (!parentDef) return null; + return nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; +} - function collectNode(node: TreeSitterNode, kind: string): void { - if (matched.has(node.id)) return; +interface CollectCtx { + rows: AstStoreRow[]; + matched: Set; + relPath: string; + defs: Definition[]; + nodeIdMap: Map; + skipParentLookup: boolean; + kindHandlers: Record; +} - const resolved = resolveNameAndText(node, kind); - if (resolved.skip) return; +function collectNode(ctx: CollectCtx, node: TreeSitterNode, kind: string): void { + if (ctx.matched.has(node.id)) return; + + const handler = ctx.kindHandlers[kind]; + const resolved = handler ? handler(node) : DEFAULT_NAME_TEXT_RESULT; + if (resolved.skip) return; + + const line = node.startPosition.row + 1; + ctx.rows.push({ + file: ctx.relPath, + line, + kind, + name: resolved.name, + text: resolved.text, + receiver: null, + parentNodeId: resolveParentNodeId(line, ctx.defs, ctx.nodeIdMap, ctx.skipParentLookup), + }); + + ctx.matched.add(node.id); +} - rows.push({ - file: relPath, - line: node.startPosition.row + 1, - kind, - name: resolved.name, - text: resolved.text, - receiver: null, - parentNodeId: resolveParentNodeId(node.startPosition.row + 1), - }); +/** + * Resolve the kind for a tree-sitter node, or `null` if the node should be ignored. 
+ * + * Gate with `hasOwn` because plain-object lookup walks Object.prototype: + * tree-sitter node types like `constructor` (Haskell sum-types: Left, + * Right) would otherwise resolve to `Object.prototype.constructor` (the + * Object() function), which then crashes the worker boundary with + * "function Object() { [native code] } could not be cloned" when the + * resulting astNodes row is structured-cloned back to the main thread. + */ +function resolveAstKind(node: TreeSitterNode, astTypeMap: Record): string | null { + if (!Object.hasOwn(astTypeMap, node.type)) return null; + return astTypeMap[node.type] || null; +} - matched.add(node.id); - } +export function createAstStoreVisitor( + astTypeMap: Record, + defs: Definition[], + relPath: string, + nodeIdMap: Map, + stringConfig: AstStringConfig = DEFAULT_STRING_CONFIG, + stopRecurseKinds: ReadonlySet = new Set(), +): Visitor { + const newTypes = newTypesFor(astTypeMap); + // When nodeIdMap is empty, parentNodeId resolution is wasted work — the + // worker passes an empty map and the main thread re-resolves against its + // own DB-populated map in features/ast.ts::collectFileAstRows. Skip the + // findParentDef linear scan in that case. + const ctx: CollectCtx = { + rows: [], + matched: new Set(), + relPath, + defs, + nodeIdMap, + skipParentLookup: nodeIdMap.size === 0, + kindHandlers: buildKindHandlers(newTypes, stringConfig), + }; return { name: 'ast-store', @@ -267,19 +309,12 @@ export function createAstStoreVisitor( // Guard: skip re-collection but do NOT skipChildren — node.id (memory address) // can be reused by tree-sitter, so a collision would incorrectly suppress an // unrelated subtree. The parent call's skipChildren handles the intended case. 
- if (matched.has(node.id)) return; - - // Gate with `hasOwn` because plain-object lookup walks Object.prototype: - // tree-sitter node types like `constructor` (Haskell sum-types: Left, - // Right) would otherwise resolve to `Object.prototype.constructor` (the - // Object() function), which then crashes the worker boundary with - // "function Object() { [native code] } could not be cloned" when the - // resulting astNodes row is structured-cloned back to the main thread. - if (!Object.hasOwn(astTypeMap, node.type)) return; - const kind = astTypeMap[node.type]; + if (ctx.matched.has(node.id)) return; + + const kind = resolveAstKind(node, astTypeMap); if (!kind) return; - collectNode(node, kind); + collectNode(ctx, node, kind); // Mirror the native walker's recursion policy. In JS/TS, the native // javascript.rs walker returns after collecting `new` or `throw` to @@ -293,7 +328,7 @@ export function createAstStoreVisitor( }, finish(): AstStoreRow[] { - return rows; + return ctx.rows; }, }; } diff --git a/src/ast-analysis/visitors/dataflow-visitor.ts b/src/ast-analysis/visitors/dataflow-visitor.ts index c3e4b46be..b66215577 100644 --- a/src/ast-analysis/visitors/dataflow-visitor.ts +++ b/src/ast-analysis/visitors/dataflow-visitor.ts @@ -405,6 +405,83 @@ function handleReturn( } } +/** Collect parameter entries for a function and push a new scope onto the stack. */ +function enterFunctionScope( + funcNode: TreeSitterNode, + rules: AnyRules, + scopeStack: ScopeEntry[], + parameters: DataflowParam[], +): void { + const name = functionName(funcNode, rules); + const paramsNode = funcNode.childForFieldName(rules.paramListField); + const paramList = extractParams(paramsNode, rules); + const paramMap = new Map(); + for (const p of paramList) { + paramMap.set(p.name, p.index); + if (name) { + parameters.push({ + funcName: name, + paramName: p.name, + paramIndex: p.index, + line: (paramsNode?.startPosition?.row ?? 
funcNode.startPosition.row) + 1, + }); + } + } + scopeStack.push({ funcName: name, funcNode, params: paramMap, locals: new Map() }); +} + +interface DataflowDispatchCtx { + rules: AnyRules; + scopeStack: ScopeEntry[]; + returns: DataflowReturnEntry[]; + assignments: DataflowAssignment[]; + argFlows: DataflowArgFlow[]; + mutations: DataflowMutation[]; + isCallNode: (t: string) => boolean; +} + +/** + * Route a node to the appropriate dataflow handler based on its type, or return + * `false` if no handler matched. Function-definition nodes are signalled by + * a `true` return so the caller can short-circuit. + */ +function dispatchDataflowNode(ctx: DataflowDispatchCtx, node: TreeSitterNode): boolean { + const { rules } = ctx; + const t = node.type; + + if (rules.functionNodes.has(t)) return true; + + if (rules.returnNode && t === rules.returnNode) { + handleReturn(node, rules, ctx.scopeStack, ctx.returns); + return true; + } + + if ( + (rules.varDeclaratorNode && t === rules.varDeclaratorNode) || + rules.varDeclaratorNodes?.has(t) + ) { + handleVarDeclarator(node, rules, ctx.scopeStack, ctx.assignments, ctx.isCallNode); + return true; + } + + if (ctx.isCallNode(t)) { + handleCallExpr(node, rules, ctx.scopeStack, ctx.argFlows); + return true; + } + + if (rules.assignmentNode && t === rules.assignmentNode) { + handleAssignment(node, rules, ctx.scopeStack, ctx.assignments, ctx.mutations, ctx.isCallNode); + return true; + } + + if (rules.expressionStmtNode && t === rules.expressionStmtNode) { + handleExprStmtMutation(node, rules, ctx.scopeStack, ctx.mutations, ctx.isCallNode); + return true; + } + + return false; +} + export function createDataflowVisitor(rules: AnyRules): Visitor { const isCallNode: (t: string) => boolean = rules.callNodes ? 
(t: string) => rules.callNodes.has(t) @@ -417,6 +494,16 @@ export function createDataflowVisitor(rules: AnyRules): Visitor { const mutations: DataflowMutation[] = []; const scopeStack: ScopeEntry[] = []; + const dispatchCtx: DataflowDispatchCtx = { + rules, + scopeStack, + returns, + assignments, + argFlows, + mutations, + isCallNode, + }; + return { name: 'dataflow', functionNodeTypes: rules.functionNodes, @@ -426,22 +513,7 @@ export function createDataflowVisitor(rules: AnyRules): Visitor { _funcName: string | null, _context: VisitorContext, ): void { - const name = functionName(funcNode, rules); - const paramsNode = funcNode.childForFieldName(rules.paramListField); - const paramList = extractParams(paramsNode, rules); - const paramMap = new Map(); - for (const p of paramList) { - paramMap.set(p.name, p.index); - if (name) { - parameters.push({ - funcName: name, - paramName: p.name, - paramIndex: p.index, - line: (paramsNode?.startPosition?.row ?? funcNode.startPosition.row) + 1, - }); - } - } - scopeStack.push({ funcName: name, funcNode, params: paramMap, locals: new Map() }); + enterFunctionScope(funcNode, rules, scopeStack, parameters); }, exitFunction( @@ -453,37 +525,8 @@ export function createDataflowVisitor(rules: AnyRules): Visitor { }, enterNode(node: TreeSitterNode, _context: VisitorContext): EnterNodeResult | undefined { - const t = node.type; - - if (rules.functionNodes.has(t)) return; - - if (rules.returnNode && t === rules.returnNode) { - handleReturn(node, rules, scopeStack, returns); - return; - } - - if (rules.varDeclaratorNode && t === rules.varDeclaratorNode) { - handleVarDeclarator(node, rules, scopeStack, assignments, isCallNode); - return; - } - if (rules.varDeclaratorNodes?.has(t)) { - handleVarDeclarator(node, rules, scopeStack, assignments, isCallNode); - return; - } - - if (isCallNode(t)) { - handleCallExpr(node, rules, scopeStack, argFlows); - return; - } - - if (rules.assignmentNode && t === rules.assignmentNode) { - 
handleAssignment(node, rules, scopeStack, assignments, mutations, isCallNode); - return; - } - - if (rules.expressionStmtNode && t === rules.expressionStmtNode) { - handleExprStmtMutation(node, rules, scopeStack, mutations, isCallNode); - } + dispatchDataflowNode(dispatchCtx, node); + return undefined; }, finish(): DataflowResultInternal { From dab4dcf8630259bb632c0cc31f247bda3069d3ca Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:32:39 -0600 Subject: [PATCH 10/27] refactor(builder): break pipeline cycle by extracting orchestrator-selection strategy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the native-orchestrator path out of pipeline.ts into two new stage modules: - stages/native-orchestrator.ts — tryNativeOrchestrator + post-native structure/analysis fallback + dropped-language detection/backfill. - stages/native-db-lifecycle.ts — shared rusqlite connection helpers (closeNativeDb, reopenNativeDb, suspendNativeDb, refreshJsDb). This breaks the function-level cycle 'buildGraph <-> tryNativeOrchestrator' caused by codegraph's name-based resolver conflating the local buildGraph function with the ctx.nativeDb.buildGraph() method call. Once the orchestrator lives in its own file, there is no longer a local buildGraph in scope to collide with the method invocation. Function-level cycles: 9 -> 5. No file-level cycle introduced (still 1, unchanged — pre-existing MCP cycle). pipeline.ts shrinks from 1404 to 465 lines and now reads as a thin top-level controller: detect changes, try native, fall back to JS stages. computeWasmOnlyStaleFiles is re-exported from pipeline.ts so existing unit tests (tests/builder/wasm-only-stale-files.test.ts) keep working without changes. 
--- src/domain/graph/builder/pipeline.ts | 978 +----------------- .../builder/stages/native-db-lifecycle.ts | 74 ++ .../builder/stages/native-orchestrator.ts | 942 +++++++++++++++++ 3 files changed, 1035 insertions(+), 959 deletions(-) create mode 100644 src/domain/graph/builder/stages/native-db-lifecycle.ts create mode 100644 src/domain/graph/builder/stages/native-orchestrator.ts diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index 4dce8aa3d..ff4ee5e5d 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -8,52 +8,24 @@ import fs from 'node:fs'; import path from 'node:path'; import { performance } from 'node:perf_hooks'; import { - acquireAdvisoryLock, closeDb, closeDbPair, getBuildMeta, initSchema, MIGRATIONS, openDb, - purgeFilesData, - releaseAdvisoryLock, - setBuildMeta, } from '../../../db/index.js'; import { detectWorkspaces, loadConfig } from '../../../infrastructure/config.js'; import { debug, info, warn } from '../../../infrastructure/logger.js'; import { loadNative } from '../../../infrastructure/native.js'; -import { semverCompare } from '../../../infrastructure/update-check.js'; -import { normalizePath } from '../../../shared/constants.js'; import { toErrorMessage } from '../../../shared/errors.js'; import { CODEGRAPH_VERSION } from '../../../shared/version.js'; -import type { - BetterSqlite3Database, - BuildGraphOpts, - BuildResult, - Definition, - ExtractorOutput, - SqliteStatement, -} from '../../../types.js'; -import { - classifyNativeDrops, - formatDropExtensionSummary, - getActiveEngine, - getInstalledWasmExtensions, - NATIVE_SUPPORTED_EXTENSIONS, - parseFilesWasmForBackfill, -} from '../../parser.js'; +import type { BuildGraphOpts, BuildResult } from '../../../types.js'; +import { getActiveEngine } from '../../parser.js'; import { writeJournalHeader } from '../journal.js'; import { setWorkspaces } from '../resolve.js'; import { PipelineContext } from 
'./context.js'; -import { - batchInsertNodes, - collectFiles as collectFilesUtil, - fileHash, - fileStat, - loadPathAliases, - readFileSafe, -} from './helpers.js'; -import { NativeDbProxy } from './native-db-proxy.js'; +import { loadPathAliases } from './helpers.js'; import { buildEdges } from './stages/build-edges.js'; import { buildStructure } from './stages/build-structure.js'; // Pipeline stages @@ -61,10 +33,24 @@ import { collectFiles } from './stages/collect-files.js'; import { detectChanges, detectNoChanges } from './stages/detect-changes.js'; import { finalize } from './stages/finalize.js'; import { insertNodes } from './stages/insert-nodes.js'; +import { + closeNativeDb, + refreshJsDb, + reopenNativeDb, + suspendNativeDb, +} from './stages/native-db-lifecycle.js'; +import { tryNativeOrchestrator } from './stages/native-orchestrator.js'; import { parseFiles } from './stages/parse-files.js'; import { resolveImports } from './stages/resolve-imports.js'; import { runAnalyses } from './stages/run-analyses.js'; +// Re-export computeWasmOnlyStaleFiles for backward compatibility with tests +// that import from this module path (#1073 unit tests). +export { + computeWasmOnlyStaleFiles, + type WasmOnlyStaleFilesInput, +} from './stages/native-orchestrator.js'; + // ── Setup helpers ─────────────────────────────────────────────────────── function initializeEngine(ctx: PipelineContext): void { @@ -237,934 +223,8 @@ function formatTimingResult(ctx: PipelineContext): BuildResult { }; } -// ── NativeDb lifecycle helpers ────────────────────────────────────────── - -/** Checkpoint WAL through rusqlite and close the native connection. 
*/ -function closeNativeDb(ctx: PipelineContext, label: string): void { - if (!ctx.nativeDb) return; - try { - ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); - } catch (e) { - debug(`${label} WAL checkpoint failed: ${toErrorMessage(e)}`); - } - try { - ctx.nativeDb.close(); - } catch (e) { - debug(`${label} nativeDb close failed: ${toErrorMessage(e)}`); - } - ctx.nativeDb = undefined; -} - -/** Try to reopen the native connection for a given pipeline phase. */ -function reopenNativeDb(ctx: PipelineContext, label: string): void { - if ((ctx.opts.engine ?? 'auto') === 'wasm') return; - const native = loadNative(); - if (!native?.NativeDatabase) return; - try { - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - } catch (e) { - debug(`reopen nativeDb for ${label} failed: ${toErrorMessage(e)}`); - ctx.nativeDb = undefined; - } -} - -/** Close nativeDb and clear stale references in engineOpts. */ -function suspendNativeDb(ctx: PipelineContext, label: string): void { - closeNativeDb(ctx, label); - if (ctx.engineOpts?.nativeDb) { - ctx.engineOpts.nativeDb = undefined; - } -} - -/** - * After native writes, reopen the JS db connection to get a fresh page cache. - * Rusqlite WAL truncation invalidates better-sqlite3's internal WAL index, - * causing SQLITE_CORRUPT on the next read (#715, #736). - */ -function refreshJsDb(ctx: PipelineContext): void { - try { - ctx.db.close(); - } catch (e) { - debug(`refreshJsDb close failed: ${toErrorMessage(e)}`); - } - ctx.db = openDb(ctx.dbPath); -} - -// ── Native orchestrator types ────────────────────────────────────────── - -interface NativeOrchestratorResult { - phases: Record; - earlyExit?: boolean; - nodeCount?: number; - edgeCount?: number; - fileCount?: number; - changedFiles?: string[]; - changedCount?: number; - removedCount?: number; - isFullBuild?: boolean; - /** Whether the Rust pipeline handled the structure phase (small-incremental fast path). 
*/ - structureHandled?: boolean; - /** Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to DB. */ - analysisComplete?: boolean; -} - -// ── Native orchestrator helpers ─────────────────────────────────────── - -/** Determine whether the native orchestrator should be skipped. Returns a reason string, or null if it should run. */ -function shouldSkipNativeOrchestrator(ctx: PipelineContext): string | null { - if (ctx.forceFullRebuild) return 'forceFullRebuild'; - // v3.9.0 addon had buggy incremental purge (wrong SQL on analysis tables, - // scoped removal over-detection). Fixed in v3.9.1 by PR #865. Gate on - // < 3.9.1 so v3.9.1+ uses the fast Rust orchestrator path. - const orchestratorBuggy = !!ctx.engineVersion && semverCompare(ctx.engineVersion, '3.9.1') < 0; - if (orchestratorBuggy) return `buggy addon ${ctx.engineVersion}`; - if (ctx.engineName !== 'native') return `engine=${ctx.engineName}`; - return null; -} - -/** Checkpoint WAL through rusqlite, close nativeDb, and reopen better-sqlite3. - * Returns false if the DB reopen fails (caller should return partial result). */ -function handoffWalAfterNativeBuild(ctx: PipelineContext): boolean { - closeNativeDb(ctx, 'post-native-build'); - try { - ctx.db.close(); - } catch (e) { - debug(`handoffWal JS db close failed: ${toErrorMessage(e)}`); - } - try { - ctx.db = openDb(ctx.dbPath); - return true; - } catch (reopenErr) { - warn(`Failed to reopen DB after native build: ${(reopenErr as Error).message}`); - return false; - } -} - -/** - * Reconstruct fileSymbols from the DB after a native orchestrator build. - * When `scopeFiles` is provided, only loads those files (for analysis-only). - * When omitted, loads all files (needed for structure rebuilds). 
- */ -function reconstructFileSymbolsFromDb( - ctx: PipelineContext, - scopeFiles?: string[], -): Map { - let query = - 'SELECT file, name, kind, line, end_line as endLine FROM nodes WHERE file IS NOT NULL'; - const params: string[] = []; - if (scopeFiles && scopeFiles.length > 0) { - const placeholders = scopeFiles.map(() => '?').join(','); - query += ` AND file IN (${placeholders})`; - params.push(...scopeFiles); - } - query += ' ORDER BY file, line'; - - const rows = ctx.db.prepare(query).all(...params) as { - file: string; - name: string; - kind: string; - line: number; - endLine: number | null; - }[]; - - const fileSymbols = new Map(); - for (const row of rows) { - let entry = fileSymbols.get(row.file); - if (!entry) { - entry = { - definitions: [], - calls: [], - imports: [], - classes: [], - exports: [], - typeMap: new Map(), - }; - fileSymbols.set(row.file, entry); - } - entry.definitions.push({ - name: row.name, - kind: row.kind as Definition['kind'], - line: row.line, - endLine: row.endLine ?? undefined, - }); - } - - // Populate import/export counts from DB edges so buildStructure - // computes correct import_count/export_count in node_metrics. - // The extractor arrays aren't persisted to the DB, so we derive - // counts from edge data instead (#804). 
- const importCountRows = ctx.db - .prepare( - `SELECT n.file, COUNT(*) AS cnt - FROM edges e JOIN nodes n ON e.source_id = n.id - WHERE e.kind IN ('imports', 'imports-type', 'dynamic-imports') - AND n.file IS NOT NULL - GROUP BY n.file`, - ) - .all() as { file: string; cnt: number }[]; - for (const row of importCountRows) { - const entry = fileSymbols.get(row.file); - if (entry) entry.imports = new Array(row.cnt) as ExtractorOutput['imports']; - } - - const exportCountRows = ctx.db - .prepare( - `SELECT n_tgt.file, COUNT(DISTINCT n_tgt.id) AS cnt - FROM edges e - JOIN nodes n_tgt ON e.target_id = n_tgt.id - JOIN nodes n_src ON e.source_id = n_src.id - WHERE e.kind IN ('imports', 'imports-type', 'reexports') - AND n_tgt.file IS NOT NULL - AND n_src.file != n_tgt.file - GROUP BY n_tgt.file`, - ) - .all() as { file: string; cnt: number }[]; - for (const row of exportCountRows) { - const entry = fileSymbols.get(row.file); - if (entry) entry.exports = new Array(row.cnt) as ExtractorOutput['exports']; - } - - return fileSymbols; -} - -/** - * Run JS buildStructure() after native orchestrator to fill directory nodes + contains edges. - * For full builds, passes changedFiles=null (full rebuild). - * For incremental builds, passes the changed file list to scope the update. 
- */ -async function runPostNativeStructure( - ctx: PipelineContext, - allFileSymbols: Map, - isFullBuild: boolean, - changedFiles: string[] | undefined, -): Promise { - const structureStart = performance.now(); - try { - const directories = new Set(); - for (const relPath of allFileSymbols.keys()) { - const parts = relPath.split('/'); - for (let i = 1; i < parts.length; i++) { - directories.add(parts.slice(0, i).join('/')); - } - } - - const lineCountMap = new Map(); - const cachedLineCounts = ctx.db - .prepare( - `SELECT n.name AS file, m.line_count - FROM node_metrics m JOIN nodes n ON m.node_id = n.id - WHERE n.kind = 'file'`, - ) - .all() as Array<{ file: string; line_count: number }>; - for (const row of cachedLineCounts) { - lineCountMap.set(row.file, row.line_count); - } - - // Full builds need null (rebuild everything). Incremental builds pass the - // changed file list so buildStructure only updates those files' metrics - // and contains edges — matching the JS pipeline's medium-incremental path. - const changedFilePaths = isFullBuild || !changedFiles?.length ? null : changedFiles; - const { buildStructure: buildStructureFn } = (await import( - '../../../features/structure.js' - )) as { - buildStructure: ( - db: typeof ctx.db, - fileSymbols: Map, - rootDir: string, - lineCountMap: Map, - directories: Set, - changedFiles: string[] | null, - ) => void; - }; - buildStructureFn( - ctx.db, - allFileSymbols, - ctx.rootDir, - lineCountMap, - directories, - changedFilePaths, - ); - debug( - `Structure phase completed after native orchestrator${changedFilePaths ? ` (${changedFilePaths.length} files)` : ' (full)'}`, - ); - } catch (err) { - warn(`Structure phase failed after native build: ${toErrorMessage(err)}`); - } - return performance.now() - structureStart; -} - -/** - * JS fallback for AST/complexity/CFG/dataflow analysis after native orchestrator. 
- * Used when the Rust addon doesn't include analysis persistence (older addon - * version) or when analysis failed on the Rust side. - */ -async function runPostNativeAnalysis( - ctx: PipelineContext, - allFileSymbols: Map, - changedFiles: string[] | undefined, -): Promise<{ astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }> { - const timing = { astMs: 0, complexityMs: 0, cfgMs: 0, dataflowMs: 0 }; - - // Scope analysis fileSymbols to changed files only - let analysisFileSymbols: Map; - if (changedFiles && changedFiles.length > 0) { - analysisFileSymbols = new Map(); - for (const f of changedFiles) { - const entry = allFileSymbols.get(f); - if (entry) analysisFileSymbols.set(f, entry); - } - } else { - analysisFileSymbols = allFileSymbols; - } - - // Reopen nativeDb for analysis features (suspend/resume WAL pattern). - const native = loadNative(); - if (native?.NativeDatabase) { - try { - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - if (ctx.engineOpts) ctx.engineOpts.nativeDb = ctx.nativeDb; - } catch { - ctx.nativeDb = undefined; - if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; - } - } - - // Flush JS WAL pages once so Rust can see them, then no-op callbacks. - // Previously each feature called wal_checkpoint(TRUNCATE) individually - // (~68ms each × 3-4 features). One FULL checkpoint suffices. 
- if (ctx.nativeDb && ctx.engineOpts) { - ctx.db.pragma('wal_checkpoint(FULL)'); - ctx.engineOpts.suspendJsDb = () => {}; - ctx.engineOpts.resumeJsDb = () => {}; - } - - try { - const { runAnalyses: runAnalysesFn } = (await import('../../../ast-analysis/engine.js')) as { - runAnalyses: ( - db: BetterSqlite3Database, - fileSymbols: Map, - rootDir: string, - opts: Record, - engineOpts?: Record, - ) => Promise<{ astMs?: number; complexityMs?: number; cfgMs?: number; dataflowMs?: number }>; - }; - const result = await runAnalysesFn( - ctx.db, - analysisFileSymbols, - ctx.rootDir, - ctx.opts as Record, - ctx.engineOpts as unknown as Record | undefined, - ); - timing.astMs = result.astMs ?? 0; - timing.complexityMs = result.complexityMs ?? 0; - timing.cfgMs = result.cfgMs ?? 0; - timing.dataflowMs = result.dataflowMs ?? 0; - } catch (err) { - warn(`Analysis phases failed after native build: ${toErrorMessage(err)}`); - } - - // Close nativeDb after analyses — TRUNCATE checkpoint flushes all Rust - // WAL writes so JS and external readers can see them. Runs once after - // all analysis features complete (not per-feature). - if (ctx.nativeDb) { - try { - ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); - } catch { - /* ignore checkpoint errors */ - } - try { - ctx.nativeDb.close(); - } catch { - /* ignore close errors */ - } - ctx.nativeDb = undefined; - if (ctx.engineOpts) { - ctx.engineOpts.nativeDb = undefined; - ctx.engineOpts.suspendJsDb = undefined; - ctx.engineOpts.resumeJsDb = undefined; - } - } - - return timing; -} - -/** Format timing result from native orchestrator phases + JS post-processing. */ -function formatNativeTimingResult( - p: Record, - structurePatchMs: number, - analysisTiming: { astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }, -): BuildResult { - return { - phases: { - setupMs: +(p.setupMs ?? 0).toFixed(1), - collectMs: +(p.collectMs ?? 0).toFixed(1), - detectMs: +(p.detectMs ?? 0).toFixed(1), - parseMs: +(p.parseMs ?? 
0).toFixed(1), - insertMs: +(p.insertMs ?? 0).toFixed(1), - resolveMs: +(p.resolveMs ?? 0).toFixed(1), - edgesMs: +(p.edgesMs ?? 0).toFixed(1), - structureMs: +((p.structureMs ?? 0) + structurePatchMs).toFixed(1), - rolesMs: +(p.rolesMs ?? 0).toFixed(1), - astMs: +(analysisTiming.astMs ?? 0).toFixed(1), - complexityMs: +(analysisTiming.complexityMs ?? 0).toFixed(1), - cfgMs: +(analysisTiming.cfgMs ?? 0).toFixed(1), - dataflowMs: +(analysisTiming.dataflowMs ?? 0).toFixed(1), - finalizeMs: +(p.finalizeMs ?? 0).toFixed(1), - }, - }; -} - -/** Try the native build orchestrator. Returns a BuildResult on success, undefined to fall through to JS pipeline. */ -async function tryNativeOrchestrator( - ctx: PipelineContext, -): Promise { - const skipReason = shouldSkipNativeOrchestrator(ctx); - if (skipReason) { - debug(`Skipping native orchestrator: ${skipReason}`); - return undefined; - } - - // Open NativeDatabase on demand — deferred from setupPipeline to skip the - // ~60ms cost on no-op/early-exit builds. Close the better-sqlite3 connection - // first to avoid dual-connection WAL corruption. - if (!ctx.nativeDb && ctx.nativeAvailable) { - const native = loadNative(); - if (native?.NativeDatabase) { - try { - // Close better-sqlite3 before opening rusqlite to avoid WAL conflicts. - // Uses raw close() instead of closeDb() intentionally — the advisory lock - // is kept and transferred to the NativeDbProxy below, not released here. - ctx.db.close(); - acquireAdvisoryLock(ctx.dbPath); - ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); - ctx.nativeDb.initSchema(); - // Replace ctx.db with a NativeDbProxy so post-native JS fallback - // (structure, analysis) can use it without reopening better-sqlite3. 
- const proxy = new NativeDbProxy(ctx.nativeDb); - proxy.__lockPath = `${ctx.dbPath}.lock`; - ctx.db = proxy as unknown as typeof ctx.db; - ctx.nativeFirstProxy = true; - } catch (err) { - warn(`NativeDatabase setup failed, falling back to JS: ${toErrorMessage(err)}`); - try { - ctx.nativeDb?.close(); - } catch (e) { - debug(`tryNativeOrchestrator: close failed during fallback: ${toErrorMessage(e)}`); - } - ctx.nativeDb = undefined; - ctx.nativeFirstProxy = false; // defensive: reset in case future refactors move the assignment above throwing lines - releaseAdvisoryLock(`${ctx.dbPath}.lock`); - // Reopen better-sqlite3 for JS pipeline fallback - ctx.db = openDb(ctx.dbPath); - } - } - } - - if (!ctx.nativeDb?.buildGraph) return undefined; - - const resultJson = ctx.nativeDb.buildGraph( - ctx.rootDir, - JSON.stringify(ctx.config), - JSON.stringify(ctx.aliases), - JSON.stringify(ctx.opts), - ); - const result = JSON.parse(resultJson) as NativeOrchestratorResult; - - if (result.earlyExit) { - info('No changes detected'); - // Even on no-op rebuilds, dropped-language files added since the last - // full build are still missing from `nodes`/`file_hashes` (#1083), and - // WASM-only files deleted from disk leave stale rows behind (#1073). - // The orchestrator's file_collector skipped them, so its earlyExit - // doesn't imply DB consistency. Run the gap repair before returning. - const gap = detectDroppedLanguageGap(ctx); - if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) { - await backfillNativeDroppedFiles(ctx, gap); - } - closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); - return 'early-exit'; - } - - // Log incremental status to match JS pipeline output - const changed = result.changedCount ?? 0; - const removed = result.removedCount ?? 
0; - if (!result.isFullBuild && (changed > 0 || removed > 0)) { - info(`Incremental: ${changed} changed, ${removed} removed`); - } - - const p = result.phases; - - // Sync build_meta so JS-side version/engine checks work on next build. - // Use the binary's CARGO_PKG_VERSION (ctx.nativeBinaryVersion), not the - // platform package.json version (ctx.engineVersion). The Rust side's - // check_version_mismatch compares against CARGO_PKG_VERSION; writing - // the package.json value would create a permanent mismatch whenever - // the binary and platform package.json diverge — e.g., CI hot-swap - // via ci-install-native.mjs (#1066) — forcing every subsequent build - // to be a full rebuild. - // - // When the native addon doesn't expose engineVersion() (older addon), - // fall back to CODEGRAPH_VERSION — same fallback used by both - // checkEngineSchemaMismatch (read path) and persistBuildMetadata - // (the JS-pipeline write path in finalize.ts). Using ctx.engineVersion - // here would re-introduce the asymmetry this PR fixes for that case. - const nativeVersionForMeta = ctx.nativeBinaryVersion || CODEGRAPH_VERSION; - setBuildMeta(ctx.db, { - engine: ctx.engineName, - engine_version: nativeVersionForMeta, - codegraph_version: nativeVersionForMeta, - schema_version: String(ctx.schemaVersion), - built_at: new Date().toISOString(), - }); - - info( - `Native build orchestrator completed: ${result.nodeCount ?? 0} nodes, ${result.edgeCount ?? 0} edges, ${result.fileCount ?? 0} files`, - ); - - // ── Post-native structure + analysis ────────────────────────────── - let analysisTiming = { - astMs: +(p.astMs ?? 0), - complexityMs: +(p.complexityMs ?? 0), - cfgMs: +(p.cfgMs ?? 0), - dataflowMs: +(p.dataflowMs ?? 0), - }; - let structurePatchMs = 0; - // Skip JS structure when the Rust pipeline's small-incremental fast path - // already handled it. For full builds and large incrementals where Rust - // skipped structure, we must run the JS fallback. 
- const needsStructure = !result.structureHandled; - // When the Rust addon doesn't include analysis persistence (older addon - // version or analysis failed), fall back to JS-side analysis. - const needsAnalysisFallback = - !result.analysisComplete && - (ctx.opts.ast !== false || - ctx.opts.complexity !== false || - ctx.opts.cfg !== false || - ctx.opts.dataflow !== false); - - if (needsStructure || needsAnalysisFallback) { - // When analysis fallback is needed, handoff to better-sqlite3 — the - // analysis engine uses the suspend/resume WAL pattern that requires a - // real better-sqlite3 connection, not the NativeDbProxy. - if (needsAnalysisFallback && ctx.nativeFirstProxy) { - closeNativeDb(ctx, 'pre-analysis-fallback'); - ctx.db = openDb(ctx.dbPath); - ctx.nativeFirstProxy = false; - } else if (!ctx.nativeFirstProxy && !handoffWalAfterNativeBuild(ctx)) { - // DB reopen failed — return partial result - return formatNativeTimingResult(p, 0, analysisTiming); - } - - const fileSymbols = reconstructFileSymbolsFromDb(ctx); - - if (needsStructure) { - structurePatchMs = await runPostNativeStructure( - ctx, - fileSymbols, - !!result.isFullBuild, - result.changedFiles, - ); - } - - if (needsAnalysisFallback) { - analysisTiming = await runPostNativeAnalysis(ctx, fileSymbols, result.changedFiles); - } - } - - // Engine parity: the native orchestrator silently drops files whose - // Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on - // stale native binaries). WASM handles those — backfill via WASM so both - // engines process the same file set (#967). - // - // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for - // both gating and the backfill itself. On dirty incrementals/full builds - // the orchestrator signals trigger backfill, so the walk happens once - // (instead of redundantly inside backfill). 
On quiet incrementals we - // still pay the walk so we can detect brand-new files in dropped-language - // extensions — a gap that the orchestrator's `detect_removed_files` - // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap - // because the expensive part (WASM re-parse of the missing set) is - // gated below. - const removedCount = result.removedCount ?? 0; - const changedCount = result.changedCount ?? 0; - const gap = detectDroppedLanguageGap(ctx); - if ( - result.isFullBuild || - removedCount > 0 || - changedCount > 0 || - gap.missingAbs.length > 0 || - gap.staleRel.length > 0 - ) { - await backfillNativeDroppedFiles(ctx, gap); - } - - closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); - return formatNativeTimingResult(p, structurePatchMs, analysisTiming); -} - -/** Files the native orchestrator silently dropped — the working set for backfill. */ -interface DroppedLanguageGap { - /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ - missingRel: string[]; - /** Absolute paths, aligned by index with `missingRel`. */ - missingAbs: string[]; - /** - * Relative paths of WASM-only files present in DB but absent from disk (#1073). - * Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side - * backfill must purge them. Always disjoint from `missingRel`. - */ - staleRel: string[]; -} - -/** - * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper - * is pure and unit-testable independently of `getInstalledWasmExtensions` and - * the `NATIVE_SUPPORTED_EXTENSIONS` global state. - */ -export interface WasmOnlyStaleFilesInput { - /** Distinct `file` values from the `nodes` table. */ - existingNodes: ReadonlySet; - /** Distinct `file` values from the `file_hashes` table. */ - existingHashes: ReadonlySet; - /** Relative paths currently on disk (from `collectFilesUtil`). */ - expected: ReadonlySet; - /** Lowercased extensions whose WASM grammar is installed. 
*/ - installedExts: ReadonlySet; - /** Extensions covered by the Rust addon — Rust owns deletion for these. */ - nativeSupported: ReadonlySet; -} - -/** - * Compute the WASM-only files present in the DB but missing from disk (#1073). - * - * Returns relative paths that: - * - appear in `existingNodes` or `existingHashes` (in DB), - * - are absent from `expected` (not on disk), - * - have an extension installed for WASM, AND - * - have an extension NOT covered by `nativeSupported` — Rust's - * `purge_changed_files` handles deletion for natively-supported extensions - * via its own `detect_removed_files`, so the caller must not double-purge. - * - * Extensions are lowercased before lookup to match the registry and Rust's - * `LanguageKind::from_extension` (which normalises case for the languages - * where both cases are conventional, e.g. R's `.r` / `.R`). - * - * DB paths are forced to forward slashes before comparison with `expected` - * (which is always normalised). The on-disk invariant is that DB rows are - * written with forward slashes, but a stale row written by older code on - * Windows could carry back-slashes — normalising here makes the comparison - * platform-safe and prevents false-positive purges of live rows. We replace - * `\\` explicitly (rather than calling `normalizePath`, which only touches - * `path.sep`) so the defence works when running on POSIX against a DB that - * was migrated from Windows. - * - * Exported for unit testing. 
- */ -export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { - const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; - const stale: string[] = []; - const seen = new Set(); - const consider = (rawRel: string): void => { - const rel = rawRel.replace(/\\/g, '/'); - if (expected.has(rel) || seen.has(rel)) return; - const ext = path.extname(rel).toLowerCase(); - if (nativeSupported.has(ext)) return; - if (!installedExts.has(ext)) return; - seen.add(rel); - // Push the ORIGINAL raw path (not the normalised form) so the eventual - // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` - // matches the actual stored row. The dedup `seen` set keeps the - // normalised form so a file written once with `\` and once with `/` - // is still treated as one entry — but the value the SQL sees has to - // be byte-identical to what's on disk in the DB. - stale.push(rawRel); - }; - for (const rel of existingNodes) consider(rel); - for (const rel of existingHashes) consider(rel); - return stale; -} - -/** - * Group relative paths by their lowercased extension. Shape matches the bucket - * type that `formatDropExtensionSummary` consumes, so callers can render a - * log-friendly per-extension summary without going through `classifyNativeDrops` - * when the reason is already known (e.g. the stale-purge path where every path - * is guaranteed `unsupported-by-native`). - */ -function groupByExtension(relPaths: Iterable): Map { - const buckets = new Map(); - for (const rel of relPaths) { - const ext = path.extname(rel).toLowerCase(); - let list = buckets.get(ext); - if (!list) { - list = []; - buckets.set(ext, list); - } - list.push(rel); - } - return buckets; -} - -/** - * Detect files the native orchestrator silently dropped. - * - * Walks the filesystem and compares against `nodes` + `file_hashes`. 
A file - * is "missing" if it's absent from EITHER table — both must be present for - * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. - * legacy DBs where `nodes` was populated but `file_hashes` was not). - * - * Restricted to files with an installed WASM grammar; extensions in - * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal - * installs) can't be parsed by either engine, so they're not a native - * regression — excluding them keeps the warn count in - * `backfillNativeDroppedFiles` meaningful. - * - * Also detects WASM-only files deleted from disk (#1073). Rust's - * `detect_removed_files` filter (#1070) skips files outside its supported - * extensions, so deletions of WASM-only languages don't reach the native - * purge path; the rest of the backfill only inserts rows, so without this - * step stale `nodes`/`file_hashes` rows would linger across incremental - * rebuilds until the next full rebuild. - * - * Cheap (no DB handoff, no parsing): used both to gate the backfill call - * and as its working set. NativeDbProxy supports `.prepare().all()`, so - * this works whether `ctx.db` is a proxy or a real better-sqlite3 - * connection — letting us skip the close-native / reopen-better-sqlite3 - * cost when there's nothing to backfill. 
- */ -function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { - const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); - const expected = new Set( - collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), - ); - - const existingNodeRows = ctx.db - .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") - .all() as Array<{ file: string }>; - const existingNodes = new Set(existingNodeRows.map((r) => r.file)); - - let existingHashes = new Set(); - try { - const existingHashRows = ctx.db - .prepare('SELECT DISTINCT file FROM file_hashes') - .all() as Array<{ file: string }>; - existingHashes = new Set(existingHashRows.map((r) => r.file)); - } catch (e) { - // file_hashes table may not exist on legacy DBs; treat as fully missing - // so the backfill writes rows on the upsert path below. - debug( - `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, - ); - } - - const installedExts = getInstalledWasmExtensions(); - const missingRel: string[] = []; - const missingAbs: string[] = []; - for (const rel of expected) { - if (existingNodes.has(rel) && existingHashes.has(rel)) continue; - const ext = path.extname(rel).toLowerCase(); - if (!installedExts.has(ext)) continue; - missingRel.push(rel); - missingAbs.push(path.join(ctx.rootDir, rel)); - } - - const staleRel = computeWasmOnlyStaleFiles({ - existingNodes, - existingHashes, - expected, - installedExts, - nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, - }); - - return { missingRel, missingAbs, staleRel }; -} - -/** - * Backfill files that the native orchestrator silently dropped during parse. - * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). - * - * Also purges stale rows for WASM-only files deleted from disk (#1073), which - * Rust's `detect_removed_files` filter (#1070) skips. 
- * - * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller - * can use the same scan for both gating and the actual backfill — avoiding - * a redundant fs walk when the orchestrator's signals already triggered. - */ -async function backfillNativeDroppedFiles( - ctx: PipelineContext, - gap: DroppedLanguageGap, -): Promise { - const { missingRel, missingAbs, staleRel } = gap; - if (missingAbs.length === 0 && staleRel.length === 0) return; - - // Now that we know there's work to do, hand off to better-sqlite3 (needed - // for the INSERT path below). - if (ctx.nativeFirstProxy) { - closeNativeDb(ctx, 'pre-parity-backfill'); - ctx.db = openDb(ctx.dbPath); - ctx.nativeFirstProxy = false; - } - - const dbConn = ctx.db as unknown as BetterSqlite3Database; - - // Purge WASM-only files that were deleted from disk (#1073). Rust's - // detect_removed_files skips them and the insert path below never visits - // them, so without this their rows would persist across rebuilds until the - // next full rebuild reset the DB. - if (staleRel.length > 0) { - // `computeWasmOnlyStaleFiles` guarantees every path here has an extension - // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would - // always bucket 100% into `unsupported-by-native`. Build the extension - // summary directly to avoid a redundant classification pass. - const staleByExt = groupByExtension(staleRel); - info( - `Detected ${staleRel.length} deleted WASM-only file(s) across ${staleByExt.size} extension(s) the native orchestrator skipped; purging stale rows:${formatDropExtensionSummary(staleByExt)}`, - ); - purgeFilesData(dbConn, staleRel); - } - - if (missingAbs.length === 0) return; - - // Classify drops so users see per-extension reasons instead of just a count - // (#1011). 
`unsupported-by-native` is a legitimate parser limit (no Rust - // extractor); `native-extractor-failure` indicates a real native bug since - // the language IS supported by the addon yet the file was dropped anyway. - const { byReason, totals } = classifyNativeDrops(missingRel); - if (totals['unsupported-by-native'] > 0) { - const buckets = byReason['unsupported-by-native']; - info( - `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) across ${buckets.size} extension(s) in languages without a Rust extractor; backfilling via WASM:${formatDropExtensionSummary(buckets)}`, - ); - } - if (totals['native-extractor-failure'] > 0) { - const buckets = byReason['native-extractor-failure']; - warn( - `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) across ${buckets.size} extension(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM:${formatDropExtensionSummary(buckets)}`, - ); - } - const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir); - - const rows: unknown[][] = []; - const exportKeys: unknown[][] = []; - for (const [relPath, symbols] of wasmResults) { - // File row — mirrors insertDefinitionsAndExports: qualified_name is null. - rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]); - for (const def of symbols.definitions ?? []) { - // Populate qualified_name/scope the same way the JS fallback does so - // downstream queries (cross-file references, "go to definition") find - // these symbols. - const dotIdx = def.name.lastIndexOf('.'); - const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null; - rows.push([ - def.name, - def.kind, - relPath, - def.line, - def.endLine ?? null, - null, - def.name, - scope, - def.visibility ?? 
null, - ]); - } - // Exports: insert the row (INSERT OR IGNORE — a matching definition row - // is a no-op) and queue a key for the second-pass exported=1 update, so - // queries filtering on exported=1 find backfilled symbols (#970). - for (const exp of symbols.exports ?? []) { - rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]); - exportKeys.push([exp.name, exp.kind, relPath, exp.line]); - } - } - const db = dbConn; - batchInsertNodes(db, rows); - - // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. - if (exportKeys.length > 0) { - const EXPORT_CHUNK = 500; - const exportStmtCache = new Map(); - for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) { - const end = Math.min(i + EXPORT_CHUNK, exportKeys.length); - const chunkSize = end - i; - let updateStmt = exportStmtCache.get(chunkSize); - if (!updateStmt) { - const conditions = Array.from( - { length: chunkSize }, - () => '(name = ? AND kind = ? AND file = ? AND line = ?)', - ).join(' OR '); - updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`); - exportStmtCache.set(chunkSize, updateStmt); - } - const vals: unknown[] = []; - for (let j = i; j < end; j++) { - const k = exportKeys[j] as unknown[]; - vals.push(k[0], k[1], k[2], k[3]); - } - updateStmt.run(...vals); - } - } - - // Persist file_hashes rows for every backfilled file. The Rust orchestrator - // only hashes files it parsed itself, so without this step files in - // optional-language extensions (e.g. .clj when no Rust extractor exists) - // would be missing from `file_hashes` — permanently breaking the JS-side - // fast-skip pre-flight (#1054), which rejects on `collected file missing - // from file_hashes` and forces every no-op rebuild back through the full - // ~2s native pipeline (#1068). - // - // Iterates `missingRel` (every collected file the Rust orchestrator - // dropped), not `wasmResults`, so files that produced zero symbols still - // get a row. 
- try { - const upsertHash = db.prepare( - 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', - ); - const writeHashes = db.transaction(() => { - for (let i = 0; i < missingRel.length; i++) { - const relPath = missingRel[i]; - const absPath = missingAbs[i]; - if (!relPath || !absPath) continue; - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`); - continue; - } - if (code === null) continue; - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, fileHash(code), mtime, size); - } - }); - writeHashes(); - } catch (e) { - debug( - `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`, - ); - } - - // Free WASM parse trees from the inline backfill path (#1058). - // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree - // backed by WASM linear memory) on every result, but these symbols are - // consumed locally for DB row construction and never added to - // `ctx.allSymbols`, so the finalize-stage `releaseWasmTrees` sweep never - // sees them. Without this, trees leak WASM memory until process exit — - // bounded per run but cumulative across in-process integration tests. - // Mirrors the cleanup discipline established for #931. 
- for (const [, symbols] of wasmResults) { - const tree = (symbols as { _tree?: { delete?: () => void } })._tree; - if (tree && typeof tree.delete === 'function') { - try { - tree.delete(); - } catch { - /* ignore cleanup errors */ - } - } - (symbols as { _tree?: unknown; _langId?: unknown })._tree = undefined; - (symbols as { _tree?: unknown; _langId?: unknown })._langId = undefined; - } -} +// Native db lifecycle and orchestrator helpers live in dedicated stage +// modules — see `./stages/native-db-lifecycle.ts` and `./stages/native-orchestrator.ts`. // ── Pipeline stages execution ─────────────────────────────────────────── diff --git a/src/domain/graph/builder/stages/native-db-lifecycle.ts b/src/domain/graph/builder/stages/native-db-lifecycle.ts new file mode 100644 index 000000000..ac9e2568f --- /dev/null +++ b/src/domain/graph/builder/stages/native-db-lifecycle.ts @@ -0,0 +1,74 @@ +/** + * NativeDatabase connection lifecycle helpers. + * + * The Rust orchestrator and the JS pipeline stages both juggle the same + * `nativeDb` handle (rusqlite) alongside `ctx.db` (better-sqlite3). These + * helpers centralise the open/close/reopen sequence so both call sites + * preserve the same WAL-safety invariants: + * + * - Always checkpoint WAL before closing rusqlite — otherwise better-sqlite3's + * internal WAL index can drift and surface as SQLITE_CORRUPT on the next + * read (#715, #736). + * - Always reopen better-sqlite3 after rusqlite writes to drop the stale + * page cache. + * + * Lives in its own module so `tryNativeOrchestrator` (in `native-orchestrator.ts`) + * and the JS pipeline stages driver (in `pipeline.ts`) can share the helpers + * without either file importing the other. 
+ */ +import { openDb } from '../../../../db/index.js'; +import { debug } from '../../../../infrastructure/logger.js'; +import { loadNative } from '../../../../infrastructure/native.js'; +import { toErrorMessage } from '../../../../shared/errors.js'; +import type { PipelineContext } from '../context.js'; + +/** Checkpoint WAL through rusqlite and close the native connection. */ +export function closeNativeDb(ctx: PipelineContext, label: string): void { + if (!ctx.nativeDb) return; + try { + ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch (e) { + debug(`${label} WAL checkpoint failed: ${toErrorMessage(e)}`); + } + try { + ctx.nativeDb.close(); + } catch (e) { + debug(`${label} nativeDb close failed: ${toErrorMessage(e)}`); + } + ctx.nativeDb = undefined; +} + +/** Try to reopen the native connection for a given pipeline phase. */ +export function reopenNativeDb(ctx: PipelineContext, label: string): void { + if ((ctx.opts.engine ?? 'auto') === 'wasm') return; + const native = loadNative(); + if (!native?.NativeDatabase) return; + try { + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + } catch (e) { + debug(`reopen nativeDb for ${label} failed: ${toErrorMessage(e)}`); + ctx.nativeDb = undefined; + } +} + +/** Close nativeDb and clear stale references in engineOpts. */ +export function suspendNativeDb(ctx: PipelineContext, label: string): void { + closeNativeDb(ctx, label); + if (ctx.engineOpts?.nativeDb) { + ctx.engineOpts.nativeDb = undefined; + } +} + +/** + * After native writes, reopen the JS db connection to get a fresh page cache. + * Rusqlite WAL truncation invalidates better-sqlite3's internal WAL index, + * causing SQLITE_CORRUPT on the next read (#715, #736). 
+ */ +export function refreshJsDb(ctx: PipelineContext): void { + try { + ctx.db.close(); + } catch (e) { + debug(`refreshJsDb close failed: ${toErrorMessage(e)}`); + } + ctx.db = openDb(ctx.dbPath); +} diff --git a/src/domain/graph/builder/stages/native-orchestrator.ts b/src/domain/graph/builder/stages/native-orchestrator.ts new file mode 100644 index 000000000..934dd8d05 --- /dev/null +++ b/src/domain/graph/builder/stages/native-orchestrator.ts @@ -0,0 +1,942 @@ +/** + * Native build orchestrator stage — runs the full Rust pipeline when available, + * with WASM fallback for files the native engine drops. + * + * Extracted from `pipeline.ts` to break the name-collision cycle between + * `buildGraph()` (this module's caller) and `ctx.nativeDb.buildGraph()` (the + * Rust orchestrator entry point invoked here). Codegraph's name-based call + * resolver previously conflated the two and reported a false-positive + * function-level cycle (`buildGraph ↔ tryNativeOrchestrator`). + * + * The orchestrator-selection strategy lives here so `pipeline.ts` stays a thin + * top-level controller: detect changes, try native, fall back to JS stages. 
+ */ +import path from 'node:path'; +import { performance } from 'node:perf_hooks'; +import { + acquireAdvisoryLock, + closeDbPair, + openDb, + purgeFilesData, + releaseAdvisoryLock, + setBuildMeta, +} from '../../../../db/index.js'; +import { debug, info, warn } from '../../../../infrastructure/logger.js'; +import { loadNative } from '../../../../infrastructure/native.js'; +import { semverCompare } from '../../../../infrastructure/update-check.js'; +import { normalizePath } from '../../../../shared/constants.js'; +import { toErrorMessage } from '../../../../shared/errors.js'; +import { CODEGRAPH_VERSION } from '../../../../shared/version.js'; +import type { + BetterSqlite3Database, + BuildResult, + Definition, + ExtractorOutput, + SqliteStatement, +} from '../../../../types.js'; +import { + classifyNativeDrops, + formatDropExtensionSummary, + getInstalledWasmExtensions, + NATIVE_SUPPORTED_EXTENSIONS, + parseFilesWasmForBackfill, +} from '../../../parser.js'; +import type { PipelineContext } from '../context.js'; +import { + batchInsertNodes, + collectFiles as collectFilesUtil, + fileHash, + fileStat, + readFileSafe, +} from '../helpers.js'; +import { NativeDbProxy } from '../native-db-proxy.js'; +import { closeNativeDb } from './native-db-lifecycle.js'; + +// ── Native orchestrator types ────────────────────────────────────────── + +interface NativeOrchestratorResult { + phases: Record; + earlyExit?: boolean; + nodeCount?: number; + edgeCount?: number; + fileCount?: number; + changedFiles?: string[]; + changedCount?: number; + removedCount?: number; + isFullBuild?: boolean; + /** Whether the Rust pipeline handled the structure phase (small-incremental fast path). */ + structureHandled?: boolean; + /** Whether the Rust pipeline wrote AST/complexity/CFG/dataflow to DB. */ + analysisComplete?: boolean; +} + +/** Files the native orchestrator silently dropped — the working set for backfill. 
*/ +interface DroppedLanguageGap { + /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ + missingRel: string[]; + /** Absolute paths, aligned by index with `missingRel`. */ + missingAbs: string[]; + /** + * Relative paths of WASM-only files present in DB but absent from disk (#1073). + * Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side + * backfill must purge them. Always disjoint from `missingRel`. + */ + staleRel: string[]; +} + +/** + * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper + * is pure and unit-testable independently of `getInstalledWasmExtensions` and + * the `NATIVE_SUPPORTED_EXTENSIONS` global state. + */ +export interface WasmOnlyStaleFilesInput { + /** Distinct `file` values from the `nodes` table. */ + existingNodes: ReadonlySet; + /** Distinct `file` values from the `file_hashes` table. */ + existingHashes: ReadonlySet; + /** Relative paths currently on disk (from `collectFilesUtil`). */ + expected: ReadonlySet; + /** Lowercased extensions whose WASM grammar is installed. */ + installedExts: ReadonlySet; + /** Extensions covered by the Rust addon — Rust owns deletion for these. */ + nativeSupported: ReadonlySet; +} + +// ── Native orchestrator helpers ─────────────────────────────────────── + +/** Determine whether the native orchestrator should be skipped. Returns a reason string, or null if it should run. */ +function shouldSkipNativeOrchestrator(ctx: PipelineContext): string | null { + if (ctx.forceFullRebuild) return 'forceFullRebuild'; + // v3.9.0 addon had buggy incremental purge (wrong SQL on analysis tables, + // scoped removal over-detection). Fixed in v3.9.1 by PR #865. Gate on + // < 3.9.1 so v3.9.1+ uses the fast Rust orchestrator path. 
+ const orchestratorBuggy = !!ctx.engineVersion && semverCompare(ctx.engineVersion, '3.9.1') < 0; + if (orchestratorBuggy) return `buggy addon ${ctx.engineVersion}`; + if (ctx.engineName !== 'native') return `engine=${ctx.engineName}`; + return null; +} + +/** Checkpoint WAL through rusqlite, close nativeDb, and reopen better-sqlite3. + * Returns false if the DB reopen fails (caller should return partial result). */ +function handoffWalAfterNativeBuild(ctx: PipelineContext): boolean { + closeNativeDb(ctx, 'post-native-build'); + try { + ctx.db.close(); + } catch (e) { + debug(`handoffWal JS db close failed: ${toErrorMessage(e)}`); + } + try { + ctx.db = openDb(ctx.dbPath); + return true; + } catch (reopenErr) { + warn(`Failed to reopen DB after native build: ${(reopenErr as Error).message}`); + return false; + } +} + +/** + * Reconstruct fileSymbols from the DB after a native orchestrator build. + * When `scopeFiles` is provided, only loads those files (for analysis-only). + * When omitted, loads all files (needed for structure rebuilds). 
+ */ +function reconstructFileSymbolsFromDb( + ctx: PipelineContext, + scopeFiles?: string[], +): Map { + let query = + 'SELECT file, name, kind, line, end_line as endLine FROM nodes WHERE file IS NOT NULL'; + const params: string[] = []; + if (scopeFiles && scopeFiles.length > 0) { + const placeholders = scopeFiles.map(() => '?').join(','); + query += ` AND file IN (${placeholders})`; + params.push(...scopeFiles); + } + query += ' ORDER BY file, line'; + + const rows = ctx.db.prepare(query).all(...params) as { + file: string; + name: string; + kind: string; + line: number; + endLine: number | null; + }[]; + + const fileSymbols = new Map(); + for (const row of rows) { + let entry = fileSymbols.get(row.file); + if (!entry) { + entry = { + definitions: [], + calls: [], + imports: [], + classes: [], + exports: [], + typeMap: new Map(), + }; + fileSymbols.set(row.file, entry); + } + entry.definitions.push({ + name: row.name, + kind: row.kind as Definition['kind'], + line: row.line, + endLine: row.endLine ?? undefined, + }); + } + + // Populate import/export counts from DB edges so buildStructure + // computes correct import_count/export_count in node_metrics. + // The extractor arrays aren't persisted to the DB, so we derive + // counts from edge data instead (#804). 
+ const importCountRows = ctx.db + .prepare( + `SELECT n.file, COUNT(*) AS cnt + FROM edges e JOIN nodes n ON e.source_id = n.id + WHERE e.kind IN ('imports', 'imports-type', 'dynamic-imports') + AND n.file IS NOT NULL + GROUP BY n.file`, + ) + .all() as { file: string; cnt: number }[]; + for (const row of importCountRows) { + const entry = fileSymbols.get(row.file); + if (entry) entry.imports = new Array(row.cnt) as ExtractorOutput['imports']; + } + + const exportCountRows = ctx.db + .prepare( + `SELECT n_tgt.file, COUNT(DISTINCT n_tgt.id) AS cnt + FROM edges e + JOIN nodes n_tgt ON e.target_id = n_tgt.id + JOIN nodes n_src ON e.source_id = n_src.id + WHERE e.kind IN ('imports', 'imports-type', 'reexports') + AND n_tgt.file IS NOT NULL + AND n_src.file != n_tgt.file + GROUP BY n_tgt.file`, + ) + .all() as { file: string; cnt: number }[]; + for (const row of exportCountRows) { + const entry = fileSymbols.get(row.file); + if (entry) entry.exports = new Array(row.cnt) as ExtractorOutput['exports']; + } + + return fileSymbols; +} + +/** + * Run JS buildStructure() after native orchestrator to fill directory nodes + contains edges. + * For full builds, passes changedFiles=null (full rebuild). + * For incremental builds, passes the changed file list to scope the update. 
+ */ +async function runPostNativeStructure( + ctx: PipelineContext, + allFileSymbols: Map, + isFullBuild: boolean, + changedFiles: string[] | undefined, +): Promise { + const structureStart = performance.now(); + try { + const directories = new Set(); + for (const relPath of allFileSymbols.keys()) { + const parts = relPath.split('/'); + for (let i = 1; i < parts.length; i++) { + directories.add(parts.slice(0, i).join('/')); + } + } + + const lineCountMap = new Map(); + const cachedLineCounts = ctx.db + .prepare( + `SELECT n.name AS file, m.line_count + FROM node_metrics m JOIN nodes n ON m.node_id = n.id + WHERE n.kind = 'file'`, + ) + .all() as Array<{ file: string; line_count: number }>; + for (const row of cachedLineCounts) { + lineCountMap.set(row.file, row.line_count); + } + + // Full builds need null (rebuild everything). Incremental builds pass the + // changed file list so buildStructure only updates those files' metrics + // and contains edges — matching the JS pipeline's medium-incremental path. + const changedFilePaths = isFullBuild || !changedFiles?.length ? null : changedFiles; + const { buildStructure: buildStructureFn } = (await import( + '../../../../features/structure.js' + )) as { + buildStructure: ( + db: typeof ctx.db, + fileSymbols: Map, + rootDir: string, + lineCountMap: Map, + directories: Set, + changedFiles: string[] | null, + ) => void; + }; + buildStructureFn( + ctx.db, + allFileSymbols, + ctx.rootDir, + lineCountMap, + directories, + changedFilePaths, + ); + debug( + `Structure phase completed after native orchestrator${changedFilePaths ? ` (${changedFilePaths.length} files)` : ' (full)'}`, + ); + } catch (err) { + warn(`Structure phase failed after native build: ${toErrorMessage(err)}`); + } + return performance.now() - structureStart; +} + +/** + * JS fallback for AST/complexity/CFG/dataflow analysis after native orchestrator. 
+ * Used when the Rust addon doesn't include analysis persistence (older addon + * version) or when analysis failed on the Rust side. + */ +async function runPostNativeAnalysis( + ctx: PipelineContext, + allFileSymbols: Map, + changedFiles: string[] | undefined, +): Promise<{ astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }> { + const timing = { astMs: 0, complexityMs: 0, cfgMs: 0, dataflowMs: 0 }; + + // Scope analysis fileSymbols to changed files only + let analysisFileSymbols: Map; + if (changedFiles && changedFiles.length > 0) { + analysisFileSymbols = new Map(); + for (const f of changedFiles) { + const entry = allFileSymbols.get(f); + if (entry) analysisFileSymbols.set(f, entry); + } + } else { + analysisFileSymbols = allFileSymbols; + } + + // Reopen nativeDb for analysis features (suspend/resume WAL pattern). + const native = loadNative(); + if (native?.NativeDatabase) { + try { + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + if (ctx.engineOpts) ctx.engineOpts.nativeDb = ctx.nativeDb; + } catch { + ctx.nativeDb = undefined; + if (ctx.engineOpts) ctx.engineOpts.nativeDb = undefined; + } + } + + // Flush JS WAL pages once so Rust can see them, then no-op callbacks. + // Previously each feature called wal_checkpoint(TRUNCATE) individually + // (~68ms each × 3-4 features). One FULL checkpoint suffices. 
+ if (ctx.nativeDb && ctx.engineOpts) { + ctx.db.pragma('wal_checkpoint(FULL)'); + ctx.engineOpts.suspendJsDb = () => {}; + ctx.engineOpts.resumeJsDb = () => {}; + } + + try { + const { runAnalyses: runAnalysesFn } = (await import('../../../../ast-analysis/engine.js')) as { + runAnalyses: ( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + opts: Record, + engineOpts?: Record, + ) => Promise<{ astMs?: number; complexityMs?: number; cfgMs?: number; dataflowMs?: number }>; + }; + const result = await runAnalysesFn( + ctx.db, + analysisFileSymbols, + ctx.rootDir, + ctx.opts as Record, + ctx.engineOpts as unknown as Record | undefined, + ); + timing.astMs = result.astMs ?? 0; + timing.complexityMs = result.complexityMs ?? 0; + timing.cfgMs = result.cfgMs ?? 0; + timing.dataflowMs = result.dataflowMs ?? 0; + } catch (err) { + warn(`Analysis phases failed after native build: ${toErrorMessage(err)}`); + } + + // Close nativeDb after analyses — TRUNCATE checkpoint flushes all Rust + // WAL writes so JS and external readers can see them. Runs once after + // all analysis features complete (not per-feature). + if (ctx.nativeDb) { + try { + ctx.nativeDb.exec('PRAGMA wal_checkpoint(TRUNCATE)'); + } catch { + /* ignore checkpoint errors */ + } + try { + ctx.nativeDb.close(); + } catch { + /* ignore close errors */ + } + ctx.nativeDb = undefined; + if (ctx.engineOpts) { + ctx.engineOpts.nativeDb = undefined; + ctx.engineOpts.suspendJsDb = undefined; + ctx.engineOpts.resumeJsDb = undefined; + } + } + + return timing; +} + +/** Format timing result from native orchestrator phases + JS post-processing. */ +function formatNativeTimingResult( + p: Record, + structurePatchMs: number, + analysisTiming: { astMs: number; complexityMs: number; cfgMs: number; dataflowMs: number }, +): BuildResult { + return { + phases: { + setupMs: +(p.setupMs ?? 0).toFixed(1), + collectMs: +(p.collectMs ?? 0).toFixed(1), + detectMs: +(p.detectMs ?? 
0).toFixed(1), + parseMs: +(p.parseMs ?? 0).toFixed(1), + insertMs: +(p.insertMs ?? 0).toFixed(1), + resolveMs: +(p.resolveMs ?? 0).toFixed(1), + edgesMs: +(p.edgesMs ?? 0).toFixed(1), + structureMs: +((p.structureMs ?? 0) + structurePatchMs).toFixed(1), + rolesMs: +(p.rolesMs ?? 0).toFixed(1), + astMs: +(analysisTiming.astMs ?? 0).toFixed(1), + complexityMs: +(analysisTiming.complexityMs ?? 0).toFixed(1), + cfgMs: +(analysisTiming.cfgMs ?? 0).toFixed(1), + dataflowMs: +(analysisTiming.dataflowMs ?? 0).toFixed(1), + finalizeMs: +(p.finalizeMs ?? 0).toFixed(1), + }, + }; +} + +/** + * Compute the WASM-only files present in the DB but missing from disk (#1073). + * + * Returns relative paths that: + * - appear in `existingNodes` or `existingHashes` (in DB), + * - are absent from `expected` (not on disk), + * - have an extension installed for WASM, AND + * - have an extension NOT covered by `nativeSupported` — Rust's + * `purge_changed_files` handles deletion for natively-supported extensions + * via its own `detect_removed_files`, so the caller must not double-purge. + * + * Extensions are lowercased before lookup to match the registry and Rust's + * `LanguageKind::from_extension` (which normalises case for the languages + * where both cases are conventional, e.g. R's `.r` / `.R`). + * + * DB paths are forced to forward slashes before comparison with `expected` + * (which is always normalised). The on-disk invariant is that DB rows are + * written with forward slashes, but a stale row written by older code on + * Windows could carry back-slashes — normalising here makes the comparison + * platform-safe and prevents false-positive purges of live rows. We replace + * `\\` explicitly (rather than calling `normalizePath`, which only touches + * `path.sep`) so the defence works when running on POSIX against a DB that + * was migrated from Windows. + * + * Exported for unit testing. 
+ */ +export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { + const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; + const stale: string[] = []; + const seen = new Set(); + const consider = (rawRel: string): void => { + const rel = rawRel.replace(/\\/g, '/'); + if (expected.has(rel) || seen.has(rel)) return; + const ext = path.extname(rel).toLowerCase(); + if (nativeSupported.has(ext)) return; + if (!installedExts.has(ext)) return; + seen.add(rel); + // Push the ORIGINAL raw path (not the normalised form) so the eventual + // `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData` + // matches the actual stored row. The dedup `seen` set keeps the + // normalised form so a file written once with `\` and once with `/` + // is still treated as one entry — but the value the SQL sees has to + // be byte-identical to what's on disk in the DB. + stale.push(rawRel); + }; + for (const rel of existingNodes) consider(rel); + for (const rel of existingHashes) consider(rel); + return stale; +} + +/** + * Group relative paths by their lowercased extension. Shape matches the bucket + * type that `formatDropExtensionSummary` consumes, so callers can render a + * log-friendly per-extension summary without going through `classifyNativeDrops` + * when the reason is already known (e.g. the stale-purge path where every path + * is guaranteed `unsupported-by-native`). + */ +function groupByExtension(relPaths: Iterable): Map { + const buckets = new Map(); + for (const rel of relPaths) { + const ext = path.extname(rel).toLowerCase(); + let list = buckets.get(ext); + if (!list) { + list = []; + buckets.set(ext, list); + } + list.push(rel); + } + return buckets; +} + +/** + * Detect files the native orchestrator silently dropped. + * + * Walks the filesystem and compares against `nodes` + `file_hashes`. 
A file + * is "missing" if it's absent from EITHER table — both must be present for + * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. + * legacy DBs where `nodes` was populated but `file_hashes` was not). + * + * Restricted to files with an installed WASM grammar; extensions in + * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal + * installs) can't be parsed by either engine, so they're not a native + * regression — excluding them keeps the warn count in + * `backfillNativeDroppedFiles` meaningful. + * + * Also detects WASM-only files deleted from disk (#1073). Rust's + * `detect_removed_files` filter (#1070) skips files outside its supported + * extensions, so deletions of WASM-only languages don't reach the native + * purge path; the rest of the backfill only inserts rows, so without this + * step stale `nodes`/`file_hashes` rows would linger across incremental + * rebuilds until the next full rebuild. + * + * Cheap (no DB handoff, no parsing): used both to gate the backfill call + * and as its working set. NativeDbProxy supports `.prepare().all()`, so + * this works whether `ctx.db` is a proxy or a real better-sqlite3 + * connection — letting us skip the close-native / reopen-better-sqlite3 + * cost when there's nothing to backfill. 
+ */ +function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { + const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); + const expected = new Set( + collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), + ); + + const existingNodeRows = ctx.db + .prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'") + .all() as Array<{ file: string }>; + const existingNodes = new Set(existingNodeRows.map((r) => r.file)); + + let existingHashes = new Set(); + try { + const existingHashRows = ctx.db + .prepare('SELECT DISTINCT file FROM file_hashes') + .all() as Array<{ file: string }>; + existingHashes = new Set(existingHashRows.map((r) => r.file)); + } catch (e) { + // file_hashes table may not exist on legacy DBs; treat as fully missing + // so the backfill writes rows on the upsert path below. + debug( + `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, + ); + } + + const installedExts = getInstalledWasmExtensions(); + const missingRel: string[] = []; + const missingAbs: string[] = []; + for (const rel of expected) { + if (existingNodes.has(rel) && existingHashes.has(rel)) continue; + const ext = path.extname(rel).toLowerCase(); + if (!installedExts.has(ext)) continue; + missingRel.push(rel); + missingAbs.push(path.join(ctx.rootDir, rel)); + } + + const staleRel = computeWasmOnlyStaleFiles({ + existingNodes, + existingHashes, + expected, + installedExts, + nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, + }); + + return { missingRel, missingAbs, staleRel }; +} + +/** + * Backfill files that the native orchestrator silently dropped during parse. + * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * + * Also purges stale rows for WASM-only files deleted from disk (#1073), which + * Rust's `detect_removed_files` filter (#1070) skips. 
+ * + * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller + * can use the same scan for both gating and the actual backfill — avoiding + * a redundant fs walk when the orchestrator's signals already triggered. + */ +async function backfillNativeDroppedFiles( + ctx: PipelineContext, + gap: DroppedLanguageGap, +): Promise { + const { missingRel, missingAbs, staleRel } = gap; + if (missingAbs.length === 0 && staleRel.length === 0) return; + + // Now that we know there's work to do, hand off to better-sqlite3 (needed + // for the INSERT path below). + if (ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-parity-backfill'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } + + const dbConn = ctx.db as unknown as BetterSqlite3Database; + + // Purge WASM-only files that were deleted from disk (#1073). Rust's + // detect_removed_files skips them and the insert path below never visits + // them, so without this their rows would persist across rebuilds until the + // next full rebuild reset the DB. + if (staleRel.length > 0) { + // `computeWasmOnlyStaleFiles` guarantees every path here has an extension + // outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would + // always bucket 100% into `unsupported-by-native`. Build the extension + // summary directly to avoid a redundant classification pass. + const staleByExt = groupByExtension(staleRel); + info( + `Detected ${staleRel.length} deleted WASM-only file(s) across ${staleByExt.size} extension(s) the native orchestrator skipped; purging stale rows:${formatDropExtensionSummary(staleByExt)}`, + ); + purgeFilesData(dbConn, staleRel); + } + + if (missingAbs.length === 0) return; + + // Classify drops so users see per-extension reasons instead of just a count + // (#1011). 
`unsupported-by-native` is a legitimate parser limit (no Rust + // extractor); `native-extractor-failure` indicates a real native bug since + // the language IS supported by the addon yet the file was dropped anyway. + const { byReason, totals } = classifyNativeDrops(missingRel); + if (totals['unsupported-by-native'] > 0) { + const buckets = byReason['unsupported-by-native']; + info( + `Native orchestrator skipped ${totals['unsupported-by-native']} file(s) across ${buckets.size} extension(s) in languages without a Rust extractor; backfilling via WASM:${formatDropExtensionSummary(buckets)}`, + ); + } + if (totals['native-extractor-failure'] > 0) { + const buckets = byReason['native-extractor-failure']; + warn( + `Native orchestrator dropped ${totals['native-extractor-failure']} file(s) across ${buckets.size} extension(s) in natively-supported languages — likely a Rust extractor bug. Backfilling via WASM:${formatDropExtensionSummary(buckets)}`, + ); + } + const wasmResults = await parseFilesWasmForBackfill(missingAbs, ctx.rootDir); + + const rows: unknown[][] = []; + const exportKeys: unknown[][] = []; + for (const [relPath, symbols] of wasmResults) { + // File row — mirrors insertDefinitionsAndExports: qualified_name is null. + rows.push([relPath, 'file', relPath, 0, null, null, null, null, null]); + for (const def of symbols.definitions ?? []) { + // Populate qualified_name/scope the same way the JS fallback does so + // downstream queries (cross-file references, "go to definition") find + // these symbols. + const dotIdx = def.name.lastIndexOf('.'); + const scope = dotIdx !== -1 ? def.name.slice(0, dotIdx) : null; + rows.push([ + def.name, + def.kind, + relPath, + def.line, + def.endLine ?? null, + null, + def.name, + scope, + def.visibility ?? 
null, + ]); + } + // Exports: insert the row (INSERT OR IGNORE — a matching definition row + // is a no-op) and queue a key for the second-pass exported=1 update, so + // queries filtering on exported=1 find backfilled symbols (#970). + for (const exp of symbols.exports ?? []) { + rows.push([exp.name, exp.kind, relPath, exp.line, null, null, exp.name, null, null]); + exportKeys.push([exp.name, exp.kind, relPath, exp.line]); + } + } + const db = dbConn; + batchInsertNodes(db, rows); + + // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. + if (exportKeys.length > 0) { + const EXPORT_CHUNK = 500; + const exportStmtCache = new Map(); + for (let i = 0; i < exportKeys.length; i += EXPORT_CHUNK) { + const end = Math.min(i + EXPORT_CHUNK, exportKeys.length); + const chunkSize = end - i; + let updateStmt = exportStmtCache.get(chunkSize); + if (!updateStmt) { + const conditions = Array.from( + { length: chunkSize }, + () => '(name = ? AND kind = ? AND file = ? AND line = ?)', + ).join(' OR '); + updateStmt = db.prepare(`UPDATE nodes SET exported = 1 WHERE ${conditions}`); + exportStmtCache.set(chunkSize, updateStmt); + } + const vals: unknown[] = []; + for (let j = i; j < end; j++) { + const k = exportKeys[j] as unknown[]; + vals.push(k[0], k[1], k[2], k[3]); + } + updateStmt.run(...vals); + } + } + + // Persist file_hashes rows for every backfilled file. The Rust orchestrator + // only hashes files it parsed itself, so without this step files in + // optional-language extensions (e.g. .clj when no Rust extractor exists) + // would be missing from `file_hashes` — permanently breaking the JS-side + // fast-skip pre-flight (#1054), which rejects on `collected file missing + // from file_hashes` and forces every no-op rebuild back through the full + // ~2s native pipeline (#1068). + // + // Iterates `missingRel` (every collected file the Rust orchestrator + // dropped), not `wasmResults`, so files that produced zero symbols still + // get a row. 
+ try { + const upsertHash = db.prepare( + 'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)', + ); + const writeHashes = db.transaction(() => { + for (let i = 0; i < missingRel.length; i++) { + const relPath = missingRel[i]; + const absPath = missingAbs[i]; + if (!relPath || !absPath) continue; + let code: string | null; + try { + code = readFileSafe(absPath); + } catch (e) { + debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`); + continue; + } + if (code === null) continue; + const stat = fileStat(absPath); + const mtime = stat ? stat.mtime : 0; + const size = stat ? stat.size : 0; + upsertHash.run(relPath, fileHash(code), mtime, size); + } + }); + writeHashes(); + } catch (e) { + debug( + `backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`, + ); + } + + // Free WASM parse trees from the inline backfill path (#1058). + // `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree + // backed by WASM linear memory) on every result, but these symbols are + // consumed locally for DB row construction and never added to + // `ctx.allSymbols`, so the finalize-stage `releaseWasmTrees` sweep never + // sees them. Without this, trees leak WASM memory until process exit — + // bounded per run but cumulative across in-process integration tests. + // Mirrors the cleanup discipline established for #931. + for (const [, symbols] of wasmResults) { + const tree = (symbols as { _tree?: { delete?: () => void } })._tree; + if (tree && typeof tree.delete === 'function') { + try { + tree.delete(); + } catch { + /* ignore cleanup errors */ + } + } + (symbols as { _tree?: unknown; _langId?: unknown })._tree = undefined; + (symbols as { _tree?: unknown; _langId?: unknown })._langId = undefined; + } +} + +/** + * Try the native build orchestrator. + * + * Returns: + * - `BuildResult` on success (caller should return it directly). 
+ * - `'early-exit'` when the orchestrator detected no changes (caller should return undefined). + * - `undefined` when native is unavailable or skipped (caller should fall through to the JS pipeline). + * + * Encapsulates the orchestrator-selection strategy: open `NativeDatabase`, + * invoke `nativeDb.buildGraph()` (the Rust pipeline), and run post-native + * structure + analysis fallbacks. Lives in its own file to keep the Rust + * orchestrator entry point separated from the JS-side `buildGraph()` driver + * in `pipeline.ts`. + */ +export async function tryNativeOrchestrator( + ctx: PipelineContext, +): Promise { + const skipReason = shouldSkipNativeOrchestrator(ctx); + if (skipReason) { + debug(`Skipping native orchestrator: ${skipReason}`); + return undefined; + } + + // Open NativeDatabase on demand — deferred from setupPipeline to skip the + // ~60ms cost on no-op/early-exit builds. Close the better-sqlite3 connection + // first to avoid dual-connection WAL corruption. + if (!ctx.nativeDb && ctx.nativeAvailable) { + const native = loadNative(); + if (native?.NativeDatabase) { + try { + // Close better-sqlite3 before opening rusqlite to avoid WAL conflicts. + // Uses raw close() instead of closeDb() intentionally — the advisory lock + // is kept and transferred to the NativeDbProxy below, not released here. + ctx.db.close(); + acquireAdvisoryLock(ctx.dbPath); + ctx.nativeDb = native.NativeDatabase.openReadWrite(ctx.dbPath); + ctx.nativeDb.initSchema(); + // Replace ctx.db with a NativeDbProxy so post-native JS fallback + // (structure, analysis) can use it without reopening better-sqlite3. 
+ const proxy = new NativeDbProxy(ctx.nativeDb); + proxy.__lockPath = `${ctx.dbPath}.lock`; + ctx.db = proxy as unknown as typeof ctx.db; + ctx.nativeFirstProxy = true; + } catch (err) { + warn(`NativeDatabase setup failed, falling back to JS: ${toErrorMessage(err)}`); + try { + ctx.nativeDb?.close(); + } catch (e) { + debug(`tryNativeOrchestrator: close failed during fallback: ${toErrorMessage(e)}`); + } + ctx.nativeDb = undefined; + ctx.nativeFirstProxy = false; // defensive: reset in case future refactors move the assignment above throwing lines + releaseAdvisoryLock(`${ctx.dbPath}.lock`); + // Reopen better-sqlite3 for JS pipeline fallback + ctx.db = openDb(ctx.dbPath); + } + } + } + + if (!ctx.nativeDb?.buildGraph) return undefined; + + const resultJson = ctx.nativeDb.buildGraph( + ctx.rootDir, + JSON.stringify(ctx.config), + JSON.stringify(ctx.aliases), + JSON.stringify(ctx.opts), + ); + const result = JSON.parse(resultJson) as NativeOrchestratorResult; + + if (result.earlyExit) { + info('No changes detected'); + // Even on no-op rebuilds, dropped-language files added since the last + // full build are still missing from `nodes`/`file_hashes` (#1083), and + // WASM-only files deleted from disk leave stale rows behind (#1073). + // The orchestrator's file_collector skipped them, so its earlyExit + // doesn't imply DB consistency. Run the gap repair before returning. + const gap = detectDroppedLanguageGap(ctx); + if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); + } + closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); + return 'early-exit'; + } + + // Log incremental status to match JS pipeline output + const changed = result.changedCount ?? 0; + const removed = result.removedCount ?? 
0; + if (!result.isFullBuild && (changed > 0 || removed > 0)) { + info(`Incremental: ${changed} changed, ${removed} removed`); + } + + const p = result.phases; + + // Sync build_meta so JS-side version/engine checks work on next build. + // Use the binary's CARGO_PKG_VERSION (ctx.nativeBinaryVersion), not the + // platform package.json version (ctx.engineVersion). The Rust side's + // check_version_mismatch compares against CARGO_PKG_VERSION; writing + // the package.json value would create a permanent mismatch whenever + // the binary and platform package.json diverge — e.g., CI hot-swap + // via ci-install-native.mjs (#1066) — forcing every subsequent build + // to be a full rebuild. + // + // When the native addon doesn't expose engineVersion() (older addon), + // fall back to CODEGRAPH_VERSION — same fallback used by both + // checkEngineSchemaMismatch (read path) and persistBuildMetadata + // (the JS-pipeline write path in finalize.ts). Using ctx.engineVersion + // here would re-introduce the asymmetry this PR fixes for that case. + const nativeVersionForMeta = ctx.nativeBinaryVersion || CODEGRAPH_VERSION; + setBuildMeta(ctx.db, { + engine: ctx.engineName, + engine_version: nativeVersionForMeta, + codegraph_version: nativeVersionForMeta, + schema_version: String(ctx.schemaVersion), + built_at: new Date().toISOString(), + }); + + info( + `Native build orchestrator completed: ${result.nodeCount ?? 0} nodes, ${result.edgeCount ?? 0} edges, ${result.fileCount ?? 0} files`, + ); + + // ── Post-native structure + analysis ────────────────────────────── + let analysisTiming = { + astMs: +(p.astMs ?? 0), + complexityMs: +(p.complexityMs ?? 0), + cfgMs: +(p.cfgMs ?? 0), + dataflowMs: +(p.dataflowMs ?? 0), + }; + let structurePatchMs = 0; + // Skip JS structure when the Rust pipeline's small-incremental fast path + // already handled it. For full builds and large incrementals where Rust + // skipped structure, we must run the JS fallback. 
+ const needsStructure = !result.structureHandled; + // When the Rust addon doesn't include analysis persistence (older addon + // version or analysis failed), fall back to JS-side analysis. + const needsAnalysisFallback = + !result.analysisComplete && + (ctx.opts.ast !== false || + ctx.opts.complexity !== false || + ctx.opts.cfg !== false || + ctx.opts.dataflow !== false); + + if (needsStructure || needsAnalysisFallback) { + // When analysis fallback is needed, handoff to better-sqlite3 — the + // analysis engine uses the suspend/resume WAL pattern that requires a + // real better-sqlite3 connection, not the NativeDbProxy. + if (needsAnalysisFallback && ctx.nativeFirstProxy) { + closeNativeDb(ctx, 'pre-analysis-fallback'); + ctx.db = openDb(ctx.dbPath); + ctx.nativeFirstProxy = false; + } else if (!ctx.nativeFirstProxy && !handoffWalAfterNativeBuild(ctx)) { + // DB reopen failed — return partial result + return formatNativeTimingResult(p, 0, analysisTiming); + } + + const fileSymbols = reconstructFileSymbolsFromDb(ctx); + + if (needsStructure) { + structurePatchMs = await runPostNativeStructure( + ctx, + fileSymbols, + !!result.isFullBuild, + result.changedFiles, + ); + } + + if (needsAnalysisFallback) { + analysisTiming = await runPostNativeAnalysis(ctx, fileSymbols, result.changedFiles); + } + } + + // Engine parity: the native orchestrator silently drops files whose + // Rust extractor/grammar is missing or fails (e.g. HCL, Scala, Swift on + // stale native binaries). WASM handles those — backfill via WASM so both + // engines process the same file set (#967). + // + // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for + // both gating and the backfill itself. On dirty incrementals/full builds + // the orchestrator signals trigger backfill, so the walk happens once + // (instead of redundantly inside backfill). 
On quiet incrementals we + // still pay the walk so we can detect brand-new files in dropped-language + // extensions — a gap that the orchestrator's `detect_removed_files` + // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap + // because the expensive part (WASM re-parse of the missing set) is + // gated below. + const removedCount = result.removedCount ?? 0; + const changedCount = result.changedCount ?? 0; + const gap = detectDroppedLanguageGap(ctx); + if ( + result.isFullBuild || + removedCount > 0 || + changedCount > 0 || + gap.missingAbs.length > 0 || + gap.staleRel.length > 0 + ) { + await backfillNativeDroppedFiles(ctx, gap); + } + + closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); + return formatNativeTimingResult(p, structurePatchMs, analysisTiming); +} From 6637066bc6525af444f9e64c75380fab058ea437 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:44:34 -0600 Subject: [PATCH 11/27] refactor(builder): decompose builder stages and adopt shared helpers --- src/domain/graph/builder/helpers.ts | 161 +++++---- src/domain/graph/builder/incremental.ts | 266 +++++++++----- .../graph/builder/stages/build-edges.ts | 338 ++++++++++-------- .../graph/builder/stages/build-structure.ts | 197 +++++----- .../graph/builder/stages/detect-changes.ts | 171 +++++---- src/domain/graph/builder/stages/finalize.ts | 142 ++++---- .../graph/builder/stages/insert-nodes.ts | 274 +++++++------- 7 files changed, 908 insertions(+), 641 deletions(-) diff --git a/src/domain/graph/builder/helpers.ts b/src/domain/graph/builder/helpers.ts index c6cbd4845..4b3665a5d 100644 --- a/src/domain/graph/builder/helpers.ts +++ b/src/domain/graph/builder/helpers.ts @@ -76,108 +76,117 @@ export function passesIncludeExclude( return true; } +/** Per-walk state computed once at the top-level invocation. 
*/ +interface CollectContext { + readonly rootDir: string; + readonly includeRegexes: readonly RegExp[]; + readonly excludeRegexes: readonly RegExp[]; + readonly hasGlobFilters: boolean; + readonly extraIgnore: Set | null; + readonly visited: Set; +} + +/** Detect a symlink loop for `dir`. Returns true if `dir` was already visited. */ +function isSymlinkLoop(dir: string, visited: Set): boolean { + let realDir: string; + try { + realDir = fs.realpathSync(dir); + } catch { + return true; + } + if (visited.has(realDir)) { + warn(`Symlink loop detected, skipping: ${dir}`); + return true; + } + visited.add(realDir); + return false; +} + +/** Read directory entries, returning null on error (already logged). */ +function readDirSafe(dir: string): fs.Dirent[] | null { + try { + return fs.readdirSync(dir, { withFileTypes: true }); + } catch (err: unknown) { + warn(`Cannot read directory ${dir}: ${(err as Error).message}`); + return null; + } +} + +/** True if `entry` is a source file we should collect under `ctx`. 
*/ +function isCollectableSourceFile(full: string, entry: fs.Dirent, ctx: CollectContext): boolean { + if (!EXTENSIONS.has(path.extname(entry.name))) return false; + if (!ctx.hasGlobFilters) return true; + const rel = normalizePath(path.relative(ctx.rootDir, full)); + return passesIncludeExclude(rel, ctx.includeRegexes, ctx.excludeRegexes); +} + +function walkCollect( + dir: string, + files: string[], + directories: Set | null, + ctx: CollectContext, +): void { + if (isSymlinkLoop(dir, ctx.visited)) return; + + const entries = readDirSafe(dir); + if (!entries) return; + + let hasFiles = false; + for (const entry of entries) { + if (shouldSkipEntry(entry, ctx.extraIgnore)) continue; + + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + walkCollect(full, files, directories, ctx); + } else if (isCollectableSourceFile(full, entry, ctx)) { + files.push(full); + hasFiles = true; + } + } + if (directories && hasFiles) { + directories.add(dir); + } +} + /** * Recursively collect all source files under `dir`. * When `directories` is a Set, also tracks which directories contain files. * - * The first invocation establishes `dir` as the project root against which - * `config.include` / `config.exclude` globs are matched. + * `dir` establishes the project root against which `config.include` / + * `config.exclude` globs are matched. 
*/ export function collectFiles( dir: string, files: string[], config: Partial, directories: Set, - _visited?: Set, - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): { files: string[]; directories: Set }; export function collectFiles( dir: string, files?: string[], config?: Partial, directories?: null, - _visited?: Set, - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): string[]; export function collectFiles( dir: string, files: string[] = [], config: Partial = {}, directories: Set | null = null, - _visited: Set = new Set(), - _rootDir?: string, - _includeRegexes?: readonly RegExp[], - _excludeRegexes?: readonly RegExp[], ): string[] | { files: string[]; directories: Set } { const trackDirs = directories instanceof Set; - let hasFiles = false; - - // First call: compute root and compile include/exclude patterns once, - // then pass them down recursive calls so we don't recompile per directory. - const rootDir = _rootDir ?? dir; - const includeRegexes = _includeRegexes ?? compileGlobs(config.include); - const excludeRegexes = _excludeRegexes ?? compileGlobs(config.exclude); - const hasGlobFilters = includeRegexes.length > 0 || excludeRegexes.length > 0; - - // Merge config ignoreDirs with defaults - const extraIgnore = config.ignoreDirs ? new Set(config.ignoreDirs) : null; - - // Detect symlink loops (before I/O to avoid wasted readdirSync) - let realDir: string; - try { - realDir = fs.realpathSync(dir); - } catch { - return trackDirs ? { files, directories: directories as Set } : files; - } - if (_visited.has(realDir)) { - warn(`Symlink loop detected, skipping: ${dir}`); - return trackDirs ? 
{ files, directories: directories as Set } : files; - } - _visited.add(realDir); - - let entries: fs.Dirent[]; - try { - entries = fs.readdirSync(dir, { withFileTypes: true }); - } catch (err: unknown) { - warn(`Cannot read directory ${dir}: ${(err as Error).message}`); - return trackDirs ? { files, directories: directories as Set } : files; - } + const includeRegexes = compileGlobs(config.include); + const excludeRegexes = compileGlobs(config.exclude); + const ctx: CollectContext = { + rootDir: dir, + includeRegexes, + excludeRegexes, + hasGlobFilters: includeRegexes.length > 0 || excludeRegexes.length > 0, + extraIgnore: config.ignoreDirs ? new Set(config.ignoreDirs) : null, + visited: new Set(), + }; - for (const entry of entries) { - if (shouldSkipEntry(entry, extraIgnore)) continue; + walkCollect(dir, files, trackDirs ? (directories as Set) : null, ctx); - const full = path.join(dir, entry.name); - if (entry.isDirectory()) { - if (trackDirs) { - collectFiles( - full, - files, - config, - directories as Set, - _visited, - rootDir, - includeRegexes, - excludeRegexes, - ); - } else { - collectFiles(full, files, config, null, _visited, rootDir, includeRegexes, excludeRegexes); - } - } else if (EXTENSIONS.has(path.extname(entry.name))) { - if (hasGlobFilters) { - const rel = normalizePath(path.relative(rootDir, full)); - if (!passesIncludeExclude(rel, includeRegexes, excludeRegexes)) continue; - } - files.push(full); - hasFiles = true; - } - } - if (trackDirs && hasFiles) { - (directories as Set).add(dir); - } return trackDirs ? { files, directories: directories as Set } : files; } diff --git a/src/domain/graph/builder/incremental.ts b/src/domain/graph/builder/incremental.ts index 66853983e..d7aa488ed 100644 --- a/src/domain/graph/builder/incremental.ts +++ b/src/domain/graph/builder/incremental.ts @@ -307,6 +307,63 @@ function resolveBarrelImportEdges( return edgesAdded; } +/** Emit symbol-level `imports-type` edges for a single `import type` statement. 
*/ +function emitTypeOnlySymbolEdges( + db: BetterSqlite3Database | null, + stmts: IncrementalStmts, + imp: ExtractorOutput['imports'][number], + resolvedPath: string, + fileNodeId: number, +): number { + let edgesAdded = 0; + for (const name of imp.names) { + const cleanName = name.replace(/^\*\s+as\s+/, ''); + let targetFile = resolvedPath; + if (db && isBarrelFile(db, resolvedPath)) { + const actual = resolveBarrelTarget(db, resolvedPath, cleanName); + if (actual) targetFile = actual; + } + const candidates = stmts.findNodeInFile.all(cleanName, targetFile) as Array<{ + id: number; + file: string; + }>; + if (candidates.length === 0) continue; + stmts.insertEdge.run(fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0); + edgesAdded++; + } + return edgesAdded; +} + +/** + * Process a single import statement: emit the file→file edge, any + * symbol-level type-only edges, and barrel re-export edges. + */ +function emitEdgesForImport( + stmts: IncrementalStmts, + imp: ExtractorOutput['imports'][number], + fileNodeId: number, + relPath: string, + rootDir: string, + aliases: PathAliases, + db: BetterSqlite3Database | null, +): number { + const resolvedPath = resolveImportPath(path.join(rootDir, relPath), imp.source, rootDir, aliases); + const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0); + if (!targetRow) return 0; + + const edgeKind = imp.reexport ? 'reexports' : imp.typeOnly ? 
'imports-type' : 'imports'; + stmts.insertEdge.run(fileNodeId, targetRow.id, edgeKind, 1.0, 0); + let edgesAdded = 1; + + if (imp.typeOnly) { + edgesAdded += emitTypeOnlySymbolEdges(db, stmts, imp, resolvedPath, fileNodeId); + } + if (!imp.reexport && db) { + edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeId, resolvedPath, imp); + } + return edgesAdded; +} + function buildImportEdges( stmts: IncrementalStmts, relPath: string, @@ -318,44 +375,7 @@ function buildImportEdges( ): number { let edgesAdded = 0; for (const imp of symbols.imports) { - const resolvedPath = resolveImportPath( - path.join(rootDir, relPath), - imp.source, - rootDir, - aliases, - ); - const targetRow = stmts.getNodeId.get(resolvedPath, 'file', resolvedPath, 0); - if (targetRow) { - const edgeKind = imp.reexport ? 'reexports' : imp.typeOnly ? 'imports-type' : 'imports'; - stmts.insertEdge.run(fileNodeId, targetRow.id, edgeKind, 1.0, 0); - edgesAdded++; - - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. 
- if (imp.typeOnly) { - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - let targetFile = resolvedPath; - if (db && isBarrelFile(db, resolvedPath)) { - const actual = resolveBarrelTarget(db, resolvedPath, cleanName); - if (actual) targetFile = actual; - } - const candidates = stmts.findNodeInFile.all(cleanName, targetFile) as Array<{ - id: number; - file: string; - }>; - if (candidates.length > 0) { - stmts.insertEdge.run(fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0); - edgesAdded++; - } - } - } - - // Barrel resolution: create edges through re-export chains - if (!imp.reexport && db) { - edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeId, resolvedPath, imp); - } - } + edgesAdded += emitEdgesForImport(stmts, imp, fileNodeId, relPath, rootDir, aliases, db); } return edgesAdded; } @@ -491,6 +511,122 @@ function buildCallEdges( // ── Main entry point ──────────────────────────────────────────────────── +/** Build the "this file was deleted" result returned by `rebuildFile`. */ +function buildDeletionResult( + relPath: string, + oldNodes: number, + oldSymbols: unknown[], + diffSymbols: ((old: unknown[], new_: unknown[]) => unknown) | undefined, +): RebuildResult { + const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, []) : null; + return { + file: relPath, + nodesAdded: 0, + nodesRemoved: oldNodes, + edgesAdded: 0, + deleted: true, + event: 'deleted', + symbolDiff, + nodesBefore: oldNodes, + nodesAfter: 0, + }; +} + +/** Rebuild all edges originating in the single (just-parsed) target file. 
*/ +function rebuildEdgesForTargetFile( + db: BetterSqlite3Database, + stmts: IncrementalStmts, + relPath: string, + symbols: ExtractorOutput, + fileNodeRow: { id: number }, + rootDir: string, +): number { + const aliases: PathAliases = { baseUrl: null, paths: {} }; + let edgesAdded = buildContainmentEdges(db, stmts, relPath, symbols); + edgesAdded += rebuildDirContainment(db, stmts, relPath); + edgesAdded += buildImportEdges(stmts, relPath, symbols, rootDir, fileNodeRow.id, aliases, db); + const importedNames = buildImportedNamesMap(symbols, rootDir, relPath, aliases); + edgesAdded += buildCallEdges(stmts, relPath, symbols, fileNodeRow, importedNames); + return edgesAdded; +} + +/** + * Re-parse the reverse-deps and delete their outgoing edges so the cascade + * can rebuild them. + */ +async function parseReverseDeps( + db: BetterSqlite3Database, + rootDir: string, + reverseDeps: string[], + engineOpts: EngineOpts, + cache: unknown, +): Promise> { + const depSymbols = new Map(); + for (const depRelPath of reverseDeps) { + const symbols_ = await parseReverseDep(rootDir, depRelPath, engineOpts, cache); + if (symbols_) { + deleteOutgoingEdges(db, depRelPath); + depSymbols.set(depRelPath, symbols_); + } + } + return depSymbols; +} + +/** + * Pass 2 of the reverse-dep cascade: now that the changed file's `reexports` + * edges exist, resolve barrel imports for every reverse-dep so transitive + * call edges through the barrel still find their targets. 
+ */ +function emitBarrelImportEdgesForReverseDeps( + db: BetterSqlite3Database, + stmts: IncrementalStmts, + depSymbols: Map, + rootDir: string, +): number { + let edgesAdded = 0; + for (const [depRelPath, symbols_] of depSymbols) { + const fileNodeRow_ = stmts.getNodeId.get(depRelPath, 'file', depRelPath, 0); + if (!fileNodeRow_) continue; + const aliases_: PathAliases = { baseUrl: null, paths: {} }; + for (const imp of symbols_.imports) { + if (imp.reexport) continue; + const resolvedPath = resolveImportPath( + path.join(rootDir, depRelPath), + imp.source, + rootDir, + aliases_, + ); + edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeRow_.id, resolvedPath, imp); + } + } + return edgesAdded; +} + +/** + * Two-pass reverse-dep cascade: + * 1. Rebuild direct edges (creating `reexports` edges for barrels). + * 2. Add barrel import edges (which need `reexports` edges to exist). + */ +async function runReverseDepCascade( + db: BetterSqlite3Database, + rootDir: string, + reverseDeps: string[], + stmts: IncrementalStmts, + engineOpts: EngineOpts, + cache: unknown, +): Promise { + const depSymbols = await parseReverseDeps(db, rootDir, reverseDeps, engineOpts, cache); + + let edgesAdded = 0; + // Pass 1: direct edges only (no barrel resolution) — creates reexports edges + for (const [depRelPath, symbols_] of depSymbols) { + edgesAdded += rebuildReverseDepEdges(db, rootDir, depRelPath, symbols_, stmts, true); + } + // Pass 2: add barrel import edges (reexports edges now exist) + edgesAdded += emitBarrelImportEdgesForReverseDeps(db, stmts, depSymbols, rootDir); + return edgesAdded; +} + /** * Parse a single file and update the database incrementally. */ @@ -519,18 +655,7 @@ export async function rebuildFile( if (!fs.existsSync(filePath)) { if (cache) (cache as { remove(p: string): void }).remove(filePath); - const symbolDiff = diffSymbols ? 
diffSymbols(oldSymbols, []) : null; - return { - file: relPath, - nodesAdded: 0, - nodesRemoved: oldNodes, - edgesAdded: 0, - deleted: true, - event: 'deleted', - symbolDiff, - nodesBefore: oldNodes, - nodesAfter: 0, - }; + return buildDeletionResult(relPath, oldNodes, oldSymbols, diffSymbols); } let code: string; @@ -553,45 +678,8 @@ export async function rebuildFile( if (!fileNodeRow) return { file: relPath, nodesAdded: newNodes, nodesRemoved: oldNodes, edgesAdded: 0 }; - const aliases: PathAliases = { baseUrl: null, paths: {} }; - - let edgesAdded = buildContainmentEdges(db, stmts, relPath, symbols); - edgesAdded += rebuildDirContainment(db, stmts, relPath); - edgesAdded += buildImportEdges(stmts, relPath, symbols, rootDir, fileNodeRow.id, aliases, db); - const importedNames = buildImportedNamesMap(symbols, rootDir, relPath, aliases); - edgesAdded += buildCallEdges(stmts, relPath, symbols, fileNodeRow, importedNames); - - // Cascade: rebuild outgoing edges for reverse-dep files. - // Two-pass approach: first rebuild direct edges (creating reexports edges for barrels), - // then add barrel import edges (which need reexports edges to exist for resolution). 
- const depSymbols = new Map(); - for (const depRelPath of reverseDeps) { - const symbols_ = await parseReverseDep(rootDir, depRelPath, engineOpts, cache); - if (symbols_) { - deleteOutgoingEdges(db, depRelPath); - depSymbols.set(depRelPath, symbols_); - } - } - // Pass 1: direct edges only (no barrel resolution) — creates reexports edges - for (const [depRelPath, symbols_] of depSymbols) { - edgesAdded += rebuildReverseDepEdges(db, rootDir, depRelPath, symbols_, stmts, true); - } - // Pass 2: add barrel import edges (reexports edges now exist) - for (const [depRelPath, symbols_] of depSymbols) { - const fileNodeRow_ = stmts.getNodeId.get(depRelPath, 'file', depRelPath, 0); - if (!fileNodeRow_) continue; - const aliases_: PathAliases = { baseUrl: null, paths: {} }; - for (const imp of symbols_.imports) { - if (imp.reexport) continue; - const resolvedPath = resolveImportPath( - path.join(rootDir, depRelPath), - imp.source, - rootDir, - aliases_, - ); - edgesAdded += resolveBarrelImportEdges(db, stmts, fileNodeRow_.id, resolvedPath, imp); - } - } + let edgesAdded = rebuildEdgesForTargetFile(db, stmts, relPath, symbols, fileNodeRow, rootDir); + edgesAdded += await runReverseDepCascade(db, rootDir, reverseDeps, stmts, engineOpts, cache); const symbolDiff = diffSymbols ? diffSymbols(oldSymbols, newSymbols) : null; const event = oldNodes === 0 ? 'added' : 'modified'; diff --git a/src/domain/graph/builder/stages/build-edges.ts b/src/domain/graph/builder/stages/build-edges.ts index fc08160b3..9a531ed5c 100644 --- a/src/domain/graph/builder/stages/build-edges.ts +++ b/src/domain/graph/builder/stages/build-edges.ts @@ -89,12 +89,74 @@ function setupNodeLookups(ctx: PipelineContext, allNodes: QueryNodeRow[]): void // ── Import edges ──────────────────────────────────────────────────────── +/** Pick the edge kind for an import statement based on its modifiers. 
*/ +function importEdgeKind(imp: Import): string { + if (imp.reexport) return 'reexports'; + if (imp.typeOnly) return 'imports-type'; + if (imp.dynamicImport) return 'dynamic-imports'; + return 'imports'; +} + +/** + * For a `import type` statement, emit symbol-level `imports-type` edges so + * the target symbols get fan-in credit and aren't classified as dead code. + */ +function emitTypeOnlySymbolEdges( + ctx: PipelineContext, + imp: Import, + resolvedPath: string, + fileNodeId: number, + allEdgeRows: EdgeRowTuple[], +): void { + if (!ctx.nodesByNameAndFile) return; + for (const name of imp.names) { + const cleanName = name.replace(/^\*\s+as\s+/, ''); + let targetFile = resolvedPath; + if (isBarrelFile(ctx, resolvedPath)) { + const actual = resolveBarrelExport(ctx, resolvedPath, cleanName); + if (actual) targetFile = actual; + } + const candidates = ctx.nodesByNameAndFile.get(`${cleanName}|${targetFile}`); + if (candidates && candidates.length > 0) { + allEdgeRows.push([fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0]); + } + } +} + +/** + * Process a single import statement and emit all resulting edges (file→file, + * type-only symbol-level, and barrel re-export targets). 
+ */ +function emitEdgesForImport( + ctx: PipelineContext, + imp: Import, + fileNodeId: number, + relPath: string, + getNodeIdStmt: NodeIdStmt, + allEdgeRows: EdgeRowTuple[], +): void { + const resolvedPath = getResolved(ctx, path.join(ctx.rootDir, relPath), imp.source); + const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0); + if (!targetRow) return; + + const edgeKind = importEdgeKind(imp); + allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]); + + if (imp.typeOnly) { + emitTypeOnlySymbolEdges(ctx, imp, resolvedPath, fileNodeId, allEdgeRows); + } + + if (!imp.reexport && isBarrelFile(ctx, resolvedPath)) { + buildBarrelEdges(ctx, imp, resolvedPath, fileNodeId, edgeKind, getNodeIdStmt, allEdgeRows); + } +} + function buildImportEdges( ctx: PipelineContext, getNodeIdStmt: NodeIdStmt, allEdgeRows: EdgeRowTuple[], ): void { - const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; + const { fileSymbols, barrelOnlyFiles } = ctx; for (const [relPath, symbols] of fileSymbols) { const isBarrelOnly = barrelOnlyFiles.has(relPath); @@ -105,40 +167,7 @@ function buildImportEdges( for (const imp of symbols.imports) { // Barrel-only files: only emit reexport edges, skip regular imports if (isBarrelOnly && !imp.reexport) continue; - - const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source); - const targetRow = getNodeIdStmt.get(resolvedPath, 'file', resolvedPath, 0); - if (!targetRow) continue; - - const edgeKind = imp.reexport - ? 'reexports' - : imp.typeOnly - ? 'imports-type' - : imp.dynamicImport - ? 'dynamic-imports' - : 'imports'; - allEdgeRows.push([fileNodeId, targetRow.id, edgeKind, 1.0, 0]); - - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. 
- if (imp.typeOnly && ctx.nodesByNameAndFile) { - for (const name of imp.names) { - const cleanName = name.replace(/^\*\s+as\s+/, ''); - let targetFile = resolvedPath; - if (isBarrelFile(ctx, resolvedPath)) { - const actual = resolveBarrelExport(ctx, resolvedPath, cleanName); - if (actual) targetFile = actual; - } - const candidates = ctx.nodesByNameAndFile.get(`${cleanName}|${targetFile}`); - if (candidates && candidates.length > 0) { - allEdgeRows.push([fileNodeId, candidates[0]!.id, 'imports-type', 1.0, 0]); - } - } - } - - if (!imp.reexport && isBarrelFile(ctx, resolvedPath)) { - buildBarrelEdges(ctx, imp, resolvedPath, fileNodeId, edgeKind, getNodeIdStmt, allEdgeRows); - } + emitEdgesForImport(ctx, imp, fileNodeId, relPath, getNodeIdStmt, allEdgeRows); } } } @@ -174,83 +203,98 @@ function buildBarrelEdges( // ── Import edges (native engine) ──────────────────────────────────────── -function buildImportEdgesNative( - ctx: PipelineContext, - getNodeIdStmt: NodeIdStmt, - allEdgeRows: EdgeRowTuple[], - native: NativeAddon, -): void { - const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; +/** Native FFI input shape for a single import statement. */ +interface NativeImportInfo { + source: string; + names: string[]; + reexport: boolean; + typeOnly: boolean; + dynamicImport: boolean; + wildcardReexport: boolean; +} - // 1. 
Build per-file input data - const files: Array<{ - file: string; - fileNodeId: number; - isBarrelOnly: boolean; - imports: Array<{ - source: string; - names: string[]; - reexport: boolean; - typeOnly: boolean; - dynamicImport: boolean; - wildcardReexport: boolean; - }>; - definitionNames: string[]; - }> = []; - - // Collect all file node IDs we'll need (sources + targets) - const fileNodeIds: Array<{ file: string; nodeId: number }> = []; - const seenNodeFiles = new Set(); - - const addFileNodeId = (relPath: string): { id: number } | undefined => { - if (seenNodeFiles.has(relPath)) return fileNodeRowCache.get(relPath); - const row = getNodeIdStmt.get(relPath, 'file', relPath, 0); - if (row) { - seenNodeFiles.add(relPath); - fileNodeIds.push({ file: relPath, nodeId: row.id }); - fileNodeRowCache.set(relPath, row); - } - return row; +/** Native FFI input shape for a single file. */ +interface NativeFileInput { + file: string; + fileNodeId: number; + isBarrelOnly: boolean; + imports: NativeImportInfo[]; + definitionNames: string[]; +} + +/** Native FFI input shape for re-exports of a single file. */ +interface NativeReexportInput { + file: string; + reexports: Array<{ source: string; names: string[]; wildcardReexport: boolean }>; +} + +/** Lazily-resolving cache of file-node rows for the native input arrays. 
*/ +interface FileNodeIdRegistry { + ids: Array<{ file: string; nodeId: number }>; + add(relPath: string): { id: number } | undefined; +} + +function createFileNodeIdRegistry(getNodeIdStmt: NodeIdStmt): FileNodeIdRegistry { + const ids: Array<{ file: string; nodeId: number }> = []; + const seen = new Set(); + const cache = new Map(); + return { + ids, + add(relPath: string) { + if (seen.has(relPath)) return cache.get(relPath); + const row = getNodeIdStmt.get(relPath, 'file', relPath, 0); + if (row) { + seen.add(relPath); + ids.push({ file: relPath, nodeId: row.id }); + cache.set(relPath, row); + } + return row; + }, }; - const fileNodeRowCache = new Map(); +} - // 2. Pre-resolve all imports and build resolved imports array. - // Keys use forward-slash-normalized rootDir + "/" + relPath to match the Rust - // lookup format (format!("{}/{}", root_dir.replace('\\', "/"), file)). - // On Windows, rootDir has backslashes but Rust normalizes them — the JS side - // must do the same or every resolve key lookup misses (#750). - const resolvedImports: Array<{ key: string; resolvedPath: string }> = []; +function toNativeImportInfo(imp: Import): NativeImportInfo { + return { + source: imp.source, + names: imp.names, + reexport: !!imp.reexport, + typeOnly: !!imp.typeOnly, + dynamicImport: !!imp.dynamicImport, + wildcardReexport: !!imp.wildcardReexport, + }; +} + +/** + * Pre-resolve every import for the given files, registering each resolved + * target with the registry so the native side has full node-id coverage. + * + * Resolved-import keys use forward-slash-normalized rootDir + "/" + relPath to + * match the Rust lookup format. On Windows, rootDir has backslashes but Rust + * normalizes them — the JS side must do the same or every key lookup misses + * (#750). 
+ */ +function buildNativeFileInputs( + ctx: PipelineContext, + registry: FileNodeIdRegistry, +): { + files: NativeFileInput[]; + resolvedImports: Array<{ key: string; resolvedPath: string }>; +} { + const { fileSymbols, barrelOnlyFiles, rootDir } = ctx; const fwdRootDir = rootDir.replace(/\\/g, '/'); + const files: NativeFileInput[] = []; + const resolvedImports: Array<{ key: string; resolvedPath: string }> = []; for (const [relPath, symbols] of fileSymbols) { - const fileNodeRow = addFileNodeId(relPath); + const fileNodeRow = registry.add(relPath); if (!fileNodeRow) continue; - const importInfos: Array<{ - source: string; - names: string[]; - reexport: boolean; - typeOnly: boolean; - dynamicImport: boolean; - wildcardReexport: boolean; - }> = []; - + const importInfos: NativeImportInfo[] = []; for (const imp of symbols.imports) { - // Pre-resolve and register target file node const resolvedPath = getResolved(ctx, path.join(rootDir, relPath), imp.source); - addFileNodeId(resolvedPath); - - // Key matches Rust's format!("{}/{}", root_dir.replace('\\', "/"), file_input.file) + registry.add(resolvedPath); resolvedImports.push({ key: `${fwdRootDir}/${relPath}|${imp.source}`, resolvedPath }); - - importInfos.push({ - source: imp.source, - names: imp.names, - reexport: !!imp.reexport, - typeOnly: !!imp.typeOnly, - dynamicImport: !!imp.dynamicImport, - wildcardReexport: !!imp.wildcardReexport, - }); + importInfos.push(toNativeImportInfo(imp)); } files.push({ @@ -261,61 +305,75 @@ function buildImportEdgesNative( definitionNames: symbols.definitions.map((d) => d.name), }); } + return { files, resolvedImports }; +} - // 4. 
Flatten reexportMap - const fileReexports: Array<{ - file: string; - reexports: Array<{ - source: string; - names: string[]; - wildcardReexport: boolean; - }>; - }> = []; - if (ctx.reexportMap) { - for (const [file, entries] of ctx.reexportMap) { - const reexports = ( - entries as Array<{ source: string; names: string[]; wildcardReexport: boolean }> - ).map((re) => ({ - source: re.source, - names: re.names, - wildcardReexport: !!re.wildcardReexport, - })); - fileReexports.push({ file, reexports }); +/** Flatten `ctx.reexportMap` into the array shape the native side expects. */ +function buildNativeReexports( + ctx: PipelineContext, + registry: FileNodeIdRegistry, +): NativeReexportInput[] { + const fileReexports: NativeReexportInput[] = []; + if (!ctx.reexportMap) return fileReexports; + + for (const [file, entries] of ctx.reexportMap) { + const reexports = ( + entries as Array<{ source: string; names: string[]; wildcardReexport: boolean }> + ).map((re) => ({ + source: re.source, + names: re.names, + wildcardReexport: !!re.wildcardReexport, + })); + fileReexports.push({ file, reexports }); - // Register reexport target files for node ID lookup - for (const re of reexports) { - addFileNodeId(re.source); - } + for (const re of reexports) { + registry.add(re.source); } } + return fileReexports; +} - // 5. Compute barrel file list +function collectBarrelFiles(ctx: PipelineContext): string[] { const barrelFiles: string[] = []; - for (const [relPath] of fileSymbols) { - if (isBarrelFile(ctx, relPath)) { - barrelFiles.push(relPath); - } + for (const [relPath] of ctx.fileSymbols) { + if (isBarrelFile(ctx, relPath)) barrelFiles.push(relPath); } + return barrelFiles; +} - // 6. 
Build symbol node entries for type-only import resolution +function collectSymbolNodes( + ctx: PipelineContext, +): Array<{ name: string; file: string; nodeId: number }> { const symbolNodes: Array<{ name: string; file: string; nodeId: number }> = []; - if (ctx.nodesByNameAndFile) { - for (const [key, nodes] of ctx.nodesByNameAndFile) { - if (nodes.length > 0) { - const [name, file] = key.split('|'); - symbolNodes.push({ name: name!, file: file!, nodeId: nodes[0]!.id }); - } - } + if (!ctx.nodesByNameAndFile) return symbolNodes; + for (const [key, nodes] of ctx.nodesByNameAndFile) { + if (nodes.length === 0) continue; + const [name, file] = key.split('|'); + symbolNodes.push({ name: name!, file: file!, nodeId: nodes[0]!.id }); } + return symbolNodes; +} + +function buildImportEdgesNative( + ctx: PipelineContext, + getNodeIdStmt: NodeIdStmt, + allEdgeRows: EdgeRowTuple[], + native: NativeAddon, +): void { + const registry = createFileNodeIdRegistry(getNodeIdStmt); + + const { files, resolvedImports } = buildNativeFileInputs(ctx, registry); + const fileReexports = buildNativeReexports(ctx, registry); + const barrelFiles = collectBarrelFiles(ctx); + const symbolNodes = collectSymbolNodes(ctx); - // 7. 
Call native const nativeEdges = native.buildImportEdges!( files, resolvedImports, fileReexports, - fileNodeIds, + registry.ids, barrelFiles, - rootDir, + ctx.rootDir, symbolNodes, ) as NativeEdge[]; diff --git a/src/domain/graph/builder/stages/build-structure.ts b/src/domain/graph/builder/stages/build-structure.ts index 1a59353be..144537dfe 100644 --- a/src/domain/graph/builder/stages/build-structure.ts +++ b/src/domain/graph/builder/stages/build-structure.ts @@ -11,87 +11,104 @@ import type { ExtractorOutput } from '../../../../types.js'; import type { PipelineContext } from '../context.js'; import { readFileSafe } from '../helpers.js'; -export async function buildStructure(ctx: PipelineContext): Promise { - const { db, fileSymbols, rootDir, discoveredDirs, allSymbols, isFullBuild } = ctx; - - // Build line count map (prefer cached _lineCount from parser) +/** Populate `ctx.lineCountMap` from cached parser results, falling back to disk. */ +function populateLineCountMap(ctx: PipelineContext): void { + const { fileSymbols, rootDir } = ctx; ctx.lineCountMap = new Map(); for (const [relPath, symbols] of fileSymbols) { const lineCount = (symbols as ExtractorOutput & { lineCount?: number }).lineCount ?? symbols._lineCount; if (lineCount) { ctx.lineCountMap.set(relPath, lineCount); - } else { - const absPath = path.join(rootDir, relPath); - try { - const content = readFileSafe(absPath); - ctx.lineCountMap.set(relPath, content.split('\n').length); - } catch { - ctx.lineCountMap.set(relPath, 0); - } + continue; + } + const absPath = path.join(rootDir, relPath); + try { + const content = readFileSafe(absPath); + ctx.lineCountMap.set(relPath, content.split('\n').length); + } catch { + ctx.lineCountMap.set(relPath, 0); } } +} - const changedFileList = isFullBuild ? 
null : [...allSymbols.keys()]; - - // For small incremental builds on large codebases, use a fast path that - // updates only the changed files' metrics via targeted SQL instead of - // loading ALL definitions from DB (~8ms) and recomputing ALL metrics (~15ms). - // Gate: ≤smallFilesThreshold changed files AND significantly more existing files (>20) to - // avoid triggering on small test fixtures where directory metrics matter. +/** Count file-kind nodes already in the DB, preferring the native connection. */ +function countExistingFiles(ctx: PipelineContext): number { const useNativeReads = ctx.engineName === 'native' && !!ctx.nativeDb; - const existingFileCount = !isFullBuild - ? ( - (useNativeReads - ? ctx.nativeDb!.queryGet("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'", []) - : db.prepare("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'").get()) as { - c: number; - } - ).c - : 0; - const useSmallIncrementalFastPath = - !isFullBuild && - changedFileList != null && - changedFileList.length <= ctx.config.build.smallFilesThreshold && - existingFileCount > 20; - - if (!isFullBuild && !useSmallIncrementalFastPath) { - // Medium/large incremental: load unchanged files from DB for complete structure - loadUnchangedFilesFromDb(ctx); - } + const row = ( + useNativeReads + ? ctx.nativeDb!.queryGet("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'", []) + : ctx.db.prepare("SELECT COUNT(*) as c FROM nodes WHERE kind = 'file'").get() + ) as { c: number }; + return row.c; +} - // Build directory structure - const t0 = performance.now(); +/** + * Build directory structure + metrics. Chooses between the fast incremental + * path (a handful of files changed on a large codebase) and the full path + * (delegated to `features/structure`). 
+ */ +async function buildDirectoryStructure( + ctx: PipelineContext, + changedFileList: string[] | null, + useSmallIncrementalFastPath: boolean, +): Promise { if (useSmallIncrementalFastPath) { updateChangedFileMetrics(ctx, changedFileList!); - } else { - const relDirs = new Set(); - for (const absDir of discoveredDirs) { - relDirs.add(normalizePath(path.relative(rootDir, absDir))); - } - try { - const { buildStructure: buildStructureFn } = (await import( - '../../../../features/structure.js' - )) as { - buildStructure: ( - db: PipelineContext['db'], - fileSymbols: Map, - rootDir: string, - lineCountMap: Map, - directories: Set, - changedFiles: string[] | null, - ) => void; - }; - const changedFilePaths = isFullBuild ? null : [...allSymbols.keys()]; - buildStructureFn(db, fileSymbols, rootDir, ctx.lineCountMap, relDirs, changedFilePaths); - } catch (err) { - debug(`Structure analysis failed: ${(err as Error).message}`); - } + return; } - ctx.timing.structureMs = performance.now() - t0; - // Classify node roles (incremental: only reclassify changed files' nodes) - const t1 = performance.now(); + const { db, fileSymbols, rootDir, discoveredDirs, allSymbols, isFullBuild } = ctx; + const relDirs = new Set(); + for (const absDir of discoveredDirs) { + relDirs.add(normalizePath(path.relative(rootDir, absDir))); + } + try { + const { buildStructure: buildStructureFn } = (await import( + '../../../../features/structure.js' + )) as { + buildStructure: ( + db: PipelineContext['db'], + fileSymbols: Map, + rootDir: string, + lineCountMap: Map, + directories: Set, + changedFiles: string[] | null, + ) => void; + }; + const changedFilePaths = isFullBuild ? null : [...allSymbols.keys()]; + buildStructureFn(db, fileSymbols, rootDir, ctx.lineCountMap, relDirs, changedFilePaths); + } catch (err) { + debug(`Structure analysis failed: ${(err as Error).message}`); + } +} + +/** Convert a `NativeDatabase.classifyRoles*` result into the JS summary shape. 
*/ +function nativeRoleSummaryToRecord( + nativeResult: NonNullable< + ReturnType['classifyRolesFull']> + >, +): Record { + return { + entry: nativeResult.entry, + core: nativeResult.core, + utility: nativeResult.utility, + adapter: nativeResult.adapter, + dead: nativeResult.dead, + 'dead-leaf': nativeResult.deadLeaf, + 'dead-entry': nativeResult.deadEntry, + 'dead-ffi': nativeResult.deadFfi, + 'dead-unresolved': nativeResult.deadUnresolved, + 'test-only': nativeResult.testOnly, + leaf: nativeResult.leaf, + }; +} + +async function classifyRoles( + ctx: PipelineContext, + changedFileList: string[] | null, +): Promise { + const useNativeReads = ctx.engineName === 'native' && !!ctx.nativeDb; try { let roleSummary: Record | null = null; @@ -103,24 +120,9 @@ export async function buildStructure(ctx: PipelineContext): Promise { changedFileList && changedFileList.length > 0 ? ctx.nativeDb.classifyRolesIncremental(changedFileList) : ctx.nativeDb.classifyRolesFull(); - if (nativeResult) { - roleSummary = { - entry: nativeResult.entry, - core: nativeResult.core, - utility: nativeResult.utility, - adapter: nativeResult.adapter, - dead: nativeResult.dead, - 'dead-leaf': nativeResult.deadLeaf, - 'dead-entry': nativeResult.deadEntry, - 'dead-ffi': nativeResult.deadFfi, - 'dead-unresolved': nativeResult.deadUnresolved, - 'test-only': nativeResult.testOnly, - leaf: nativeResult.leaf, - }; - } + if (nativeResult) roleSummary = nativeRoleSummaryToRecord(nativeResult); } - // Fall back to JS path if (!roleSummary) { const { classifyNodeRoles } = (await import('../../../../features/structure.js')) as { classifyNodeRoles: ( @@ -141,6 +143,37 @@ export async function buildStructure(ctx: PipelineContext): Promise { } catch (err) { debug(`Role classification failed: ${(err as Error).message}`); } +} + +export async function buildStructure(ctx: PipelineContext): Promise { + const { allSymbols, isFullBuild } = ctx; + + populateLineCountMap(ctx); + + const changedFileList = isFullBuild ? 
null : [...allSymbols.keys()]; + + // For small incremental builds on large codebases, use a fast path that + // updates only the changed files' metrics via targeted SQL instead of + // loading ALL definitions from DB (~8ms) and recomputing ALL metrics (~15ms). + // Gate: ≤smallFilesThreshold changed files AND significantly more existing files (>20) to + // avoid triggering on small test fixtures where directory metrics matter. + const existingFileCount = !isFullBuild ? countExistingFiles(ctx) : 0; + const useSmallIncrementalFastPath = + !isFullBuild && + changedFileList != null && + changedFileList.length <= ctx.config.build.smallFilesThreshold && + existingFileCount > 20; + + if (!isFullBuild && !useSmallIncrementalFastPath) { + loadUnchangedFilesFromDb(ctx); + } + + const t0 = performance.now(); + await buildDirectoryStructure(ctx, changedFileList, useSmallIncrementalFastPath); + ctx.timing.structureMs = performance.now() - t0; + + const t1 = performance.now(); + await classifyRoles(ctx, changedFileList); ctx.timing.rolesMs = performance.now() - t1; } diff --git a/src/domain/graph/builder/stages/detect-changes.ts b/src/domain/graph/builder/stages/detect-changes.ts index cc51155dc..222d92e42 100644 --- a/src/domain/graph/builder/stages/detect-changes.ts +++ b/src/domain/graph/builder/stages/detect-changes.ts @@ -162,14 +162,14 @@ function tryJournalTier( return { changed, removed: [...removedSet], isFullBuild: false }; } -function mtimeAndHashTiers( +/** Tier 1: mtime+size triage. Returns the files that still need hashing. 
*/ +function tierMtimeSize( existing: Map, allFiles: string[], rootDir: string, - removed: string[], -): ChangeResult { +): { needsHash: NeedsHashItem[]; skipped: number } { const needsHash: NeedsHashItem[] = []; - const skipped: string[] = []; + let skipped = 0; for (const file of allFiles) { const relPath = normalizePath(path.relative(rootDir, file)); @@ -183,16 +183,17 @@ function mtimeAndHashTiers( const storedMtime = record.mtime || 0; const storedSize = record.size || 0; if (storedSize > 0 && stat.mtime === storedMtime && stat.size === storedSize) { - skipped.push(relPath); + skipped++; continue; } needsHash.push({ file, relPath, stat }); } - if (needsHash.length > 0) { - debug(`Tier 1: ${skipped.length} skipped by mtime+size, ${needsHash.length} need hash check`); - } + return { needsHash, skipped }; +} +/** Tier 2: hash candidates from tier 1, classifying changed vs metadata-only. */ +function tierHash(existing: Map, needsHash: NeedsHashItem[]): ChangedFile[] { const changed: ChangedFile[] = []; for (const item of needsHash) { let content: string | undefined; @@ -217,11 +218,26 @@ function mtimeAndHashTiers( }); } } + return changed; +} + +function mtimeAndHashTiers( + existing: Map, + allFiles: string[], + rootDir: string, + removed: string[], +): ChangeResult { + const { needsHash, skipped } = tierMtimeSize(existing, allFiles, rootDir); + if (needsHash.length > 0) { + debug(`Tier 1: ${skipped} skipped by mtime+size, ${needsHash.length} need hash check`); + } + + const changed = tierHash(existing, needsHash); - const parseChanged = changed.filter((c) => !c.metadataOnly); if (needsHash.length > 0) { + const parseChangedLen = changed.filter((c) => !c.metadataOnly).length; debug( - `Tier 2: ${parseChanged.length} actually changed, ${changed.length - parseChanged.length} metadata-only`, + `Tier 2: ${parseChangedLen} actually changed, ${changed.length - parseChangedLen} metadata-only`, ); } @@ -512,61 +528,43 @@ function handleIncrementalBuild(ctx: 
PipelineContext): void { purgeAndAddReverseDeps(ctx, changePaths, reverseDeps); } -/** - * Read-only pre-flight check for the native orchestrator. - * - * Returns true iff every collected source file has matching mtime+size in - * `file_hashes` and no DB-tracked file has been removed. When true, the - * caller can short-circuit before invoking the native orchestrator — - * matching WASM's ~20 ms early-exit path and avoiding the ~2s flat - * per-call native rebuild overhead seen in CI (#1054). - * - * Intentionally Tier-0/Tier-1 only (journal + mtime/size). Tier-2 content - * hashing is left to the native side: when this returns false the caller - * falls through to the orchestrator, which performs its own complete - * detection and is the source of truth. - * - * Conservatively returns false when CFG or dataflow analysis is enabled - * but the corresponding tables are empty — otherwise the fast-skip would - * silently suppress the pending-analysis pass that the JS path runs via - * `runPendingAnalysis`, and CFG/dataflow data would never populate on - * repos where source files don't change between builds. - * - * Pure read of `db` and the filesystem — never mutates either. - */ -export function detectNoChanges( - db: BetterSqlite3Database, - allFiles: string[], - rootDir: string, - opts?: Record, -): boolean { - // Diagnostic logging gated by env var — used by the bench gate to surface - // why the fast-skip is not firing on CI runners (#1066). Off by default to - // avoid noise on every regular incremental build. +/** Diagnostic logger gated by env var, used by both `detectNoChanges` branches. */ +function makeFastSkipLogger(): (reason: string) => void { const diag = process.env.CODEGRAPH_FAST_SKIP_DIAG === '1'; - const log = (reason: string): void => { + return (reason: string): void => { if (diag) info(`[fast-skip] ${reason}`); }; +} - let hasTable = false; +/** + * Load the `file_hashes` table for the no-change pre-flight. 
Returns null + * if the table is missing or empty (both → caller must fall through). + */ +function loadFileHashesForPreflight( + db: BetterSqlite3Database, + log: (reason: string) => void, +): Map | null { try { db.prepare('SELECT 1 FROM file_hashes LIMIT 1').get(); - hasTable = true; } catch { - /* table missing — first build */ - } - if (!hasTable) { log('false: file_hashes table missing'); - return false; + return null; } - const rows = db.prepare('SELECT file, hash, mtime, size FROM file_hashes').all() as FileHashRow[]; if (rows.length === 0) { log('false: file_hashes table empty'); - return false; + return null; } - const existing = new Map(rows.map((r) => [r.file, r])); + return new Map(rows.map((r) => [r.file, r])); +} +/** Returns true iff every file in `allFiles` matches a stored mtime+size record. */ +function allFilesMatchStoredStat( + existing: Map, + allFiles: string[], + rootDir: string, + log: (reason: string) => void, +): boolean { const currentFiles = new Set(); for (const file of allFiles) { currentFiles.add(normalizePath(path.relative(rootDir, file))); @@ -603,21 +601,66 @@ export function detectNoChanges( return false; } } + return true; +} - // Pending-analysis guard: if CFG/dataflow is enabled but the corresponding - // table is empty (analysis newly enabled, or tables wiped between builds), - // fall through so the orchestrator / JS pipeline can run runPendingAnalysis. - // Mirrors the check at the top of runPendingAnalysis (see line ~244). 
- if (opts) { - if (opts.cfg !== false && hasEmptyAnalysisTable(db, 'cfg_blocks')) { - log('false: pending-analysis guard — cfg_blocks is empty'); - return false; - } - if (opts.dataflow !== false && hasEmptyAnalysisTable(db, 'dataflow')) { - log('false: pending-analysis guard — dataflow is empty'); - return false; - } +/** + * Pending-analysis guard: if CFG/dataflow is enabled but the corresponding + * table is empty (analysis newly enabled, or tables wiped between builds), + * fall through so the orchestrator / JS pipeline can run runPendingAnalysis. + * Mirrors the check at the top of runPendingAnalysis. + */ +function passesPendingAnalysisGuard( + db: BetterSqlite3Database, + opts: Record | undefined, + log: (reason: string) => void, +): boolean { + if (!opts) return true; + if (opts.cfg !== false && hasEmptyAnalysisTable(db, 'cfg_blocks')) { + log('false: pending-analysis guard — cfg_blocks is empty'); + return false; } + if (opts.dataflow !== false && hasEmptyAnalysisTable(db, 'dataflow')) { + log('false: pending-analysis guard — dataflow is empty'); + return false; + } + return true; +} + +/** + * Read-only pre-flight check for the native orchestrator. + * + * Returns true iff every collected source file has matching mtime+size in + * `file_hashes` and no DB-tracked file has been removed. When true, the + * caller can short-circuit before invoking the native orchestrator — + * matching WASM's ~20 ms early-exit path and avoiding the ~2s flat + * per-call native rebuild overhead seen in CI (#1054). + * + * Intentionally Tier-0/Tier-1 only (journal + mtime/size). Tier-2 content + * hashing is left to the native side: when this returns false the caller + * falls through to the orchestrator, which performs its own complete + * detection and is the source of truth. 
+ * + * Conservatively returns false when CFG or dataflow analysis is enabled + * but the corresponding tables are empty — otherwise the fast-skip would + * silently suppress the pending-analysis pass that the JS path runs via + * `runPendingAnalysis`, and CFG/dataflow data would never populate on + * repos where source files don't change between builds. + * + * Pure read of `db` and the filesystem — never mutates either. + */ +export function detectNoChanges( + db: BetterSqlite3Database, + allFiles: string[], + rootDir: string, + opts?: Record, +): boolean { + const log = makeFastSkipLogger(); + const existing = loadFileHashesForPreflight(db, log); + if (!existing) return false; + + if (!allFilesMatchStoredStat(existing, allFiles, rootDir, log)) return false; + if (!passesPendingAnalysisGuard(db, opts, log)) return false; log(`true: all checks passed (${allFiles.length} files)`); return true; diff --git a/src/domain/graph/builder/stages/finalize.ts b/src/domain/graph/builder/stages/finalize.ts index d59fe016a..ab2e1d429 100644 --- a/src/domain/graph/builder/stages/finalize.ts +++ b/src/domain/graph/builder/stages/finalize.ts @@ -136,82 +136,72 @@ function persistBuildMetadata( } } -/** - * Run advisory checks on full builds: orphaned embeddings, stale embeddings, - * and unused exports. Informational only — does not affect correctness. - */ -function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNow: Date): void { - // Batched native path: single napi call for all 3 advisory checks - if (ctx.engineName === 'native' && ctx.nativeDb?.runAdvisoryChecks) { - const result = ctx.nativeDb.runAdvisoryChecks(hasEmbeddings); - if (result.orphanedEmbeddings > 0) { - warn( - `${result.orphanedEmbeddings} embeddings are orphaned (nodes changed). 
Run "codegraph embed" to refresh.`, - ); - } - if (result.embedBuiltAt) { - const embedTime = new Date(result.embedBuiltAt).getTime(); - if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { - warn( - 'Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.', - ); - } - } - if (result.unusedExports > 0) { - warn( - `${result.unusedExports} exported symbol${result.unusedExports > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`, - ); +/** Format the "X exports have zero consumers" warning, with correct plural agreement. */ +function unusedExportsMessage(count: number): string { + return `${count} exported symbol${count > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`; +} + +/** Run all three advisory checks via the batched native FFI. */ +function runAdvisoryChecksNative( + ctx: PipelineContext, + hasEmbeddings: boolean, + buildNow: Date, +): void { + const result = ctx.nativeDb!.runAdvisoryChecks!(hasEmbeddings); + if (result.orphanedEmbeddings > 0) { + warn( + `${result.orphanedEmbeddings} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, + ); + } + if (result.embedBuiltAt) { + const embedTime = new Date(result.embedBuiltAt).getTime(); + if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { + warn('Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.'); } - return; } + if (result.unusedExports > 0) { + warn(unusedExportsMessage(result.unusedExports)); + } +} - const { db } = ctx; - - // Orphaned embeddings warning - if (hasEmbeddings) { - try { - const orphaned = ( - db - .prepare( - 'SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)', - ) - .get() as { c: number } - ).c; - if (orphaned > 0) { - warn( - `${orphaned} embeddings are orphaned (nodes changed). 
Run "codegraph embed" to refresh.`, - ); - } - } catch { - /* ignore - embeddings table may have been dropped */ +function checkOrphanedEmbeddings(ctx: PipelineContext): void { + try { + const orphaned = ( + ctx.db + .prepare('SELECT COUNT(*) as c FROM embeddings WHERE node_id NOT IN (SELECT id FROM nodes)') + .get() as { c: number } + ).c; + if (orphaned > 0) { + warn( + `${orphaned} embeddings are orphaned (nodes changed). Run "codegraph embed" to refresh.`, + ); } + } catch { + /* ignore - embeddings table may have been dropped */ } +} - // Stale embeddings warning (built before current graph rebuild) - if (hasEmbeddings) { - try { - const embedBuiltAt = ( - db.prepare("SELECT value FROM embedding_meta WHERE key = 'built_at'").get() as - | { value: string } - | undefined - )?.value; - if (embedBuiltAt) { - const embedTime = new Date(embedBuiltAt).getTime(); - if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { - warn( - 'Embeddings were built before the last graph rebuild. Run "codegraph embed" to update.', - ); - } - } - } catch { - /* ignore - embedding_meta table may not exist */ +function checkStaleEmbeddings(ctx: PipelineContext, buildNow: Date): void { + try { + const embedBuiltAt = ( + ctx.db.prepare("SELECT value FROM embedding_meta WHERE key = 'built_at'").get() as + | { value: string } + | undefined + )?.value; + if (!embedBuiltAt) return; + const embedTime = new Date(embedBuiltAt).getTime(); + if (!Number.isNaN(embedTime) && embedTime < buildNow.getTime()) { + warn('Embeddings were built before the last graph rebuild. 
Run "codegraph embed" to update.'); } + } catch { + /* ignore - embedding_meta table may not exist */ } +} - // Unused exports warning +function checkUnusedExports(ctx: PipelineContext): void { try { const unusedCount = ( - db + ctx.db .prepare( `SELECT COUNT(*) as c FROM nodes WHERE exported = 1 AND kind != 'file' @@ -224,16 +214,28 @@ function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNo ) .get() as { c: number } ).c; - if (unusedCount > 0) { - warn( - `${unusedCount} exported symbol${unusedCount > 1 ? 's have' : ' has'} zero cross-file consumers. Run "codegraph exports --unused" to inspect.`, - ); - } + if (unusedCount > 0) warn(unusedExportsMessage(unusedCount)); } catch { /* exported column may not exist on older DBs */ } } +/** + * Run advisory checks on full builds: orphaned embeddings, stale embeddings, + * and unused exports. Informational only — does not affect correctness. + */ +function runAdvisoryChecks(ctx: PipelineContext, hasEmbeddings: boolean, buildNow: Date): void { + if (ctx.engineName === 'native' && ctx.nativeDb?.runAdvisoryChecks) { + runAdvisoryChecksNative(ctx, hasEmbeddings, buildNow); + return; + } + if (hasEmbeddings) { + checkOrphanedEmbeddings(ctx); + checkStaleEmbeddings(ctx, buildNow); + } + checkUnusedExports(ctx); +} + export async function finalize(ctx: PipelineContext): Promise { const { allSymbols, rootDir, isFullBuild, hasEmbeddings, opts } = ctx; diff --git a/src/domain/graph/builder/stages/insert-nodes.ts b/src/domain/graph/builder/stages/insert-nodes.ts index 88e403ec9..09aad25d8 100644 --- a/src/domain/graph/builder/stages/insert-nodes.ts +++ b/src/domain/graph/builder/stages/insert-nodes.ts @@ -92,23 +92,69 @@ function marshalSymbolBatches(allSymbols: Map): InsertN return batches; } +/** A single file_hashes row. 
*/ +interface FileHashRecord { + file: string; + hash: string; + mtime: number; + size: number; +} + +/** Resolve the (hash, mtime, size) tuple for a relPath, reading from disk if needed. */ +function resolveHashFromPrecomputed( + relPath: string, + precomputed: PrecomputedFileData, + rootDir: string, + caller: string, +): FileHashRecord | null { + if (precomputed.hash) { + let mtime: number; + let size: number; + if (precomputed.stat) { + mtime = precomputed.stat.mtime; + size = precomputed.stat.size; + } else { + const rawStat = fileStat(path.join(rootDir, relPath)); + mtime = rawStat ? rawStat.mtime : 0; + size = rawStat ? rawStat.size : 0; + } + return { file: relPath, hash: precomputed.hash, mtime, size }; + } + + const absPath = path.join(rootDir, relPath); + let code: string | null; + try { + code = readFileSafe(absPath); + } catch (e) { + debug(`${caller}: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); + code = null; + } + if (code === null) return null; + const stat = fileStat(absPath); + return { + file: relPath, + hash: fileHash(code), + mtime: stat ? stat.mtime : 0, + size: stat ? stat.size : 0, + }; +} + /** - * Build file hash entries for every collected file, including those that - * produced zero symbols (empty files, parsers that silently no-op'd, or - * optional-language extensions whose grammar wasn't installed). Iterating the - * symbol map instead would skip such files and leave them missing from - * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on - * any subsequent no-op rebuild (#1068). + * Walk every collected file once and yield a `FileHashRecord` for it, plus one + * record per metadata-only update. Shared by `buildFileHashes` (native path) + * and `updateFileHashes` (JS fallback) so the iteration and hash-resolution + * logic stays in one place. * - * Exported for unit testing. + * Files marked `_reverseDepOnly` are skipped — their hashes are already + * correct in the DB. 
*/ -export function buildFileHashes( +function* iterFileHashRecords( filesToParse: FileToParse[], precomputedData: Map, metadataUpdates: MetadataUpdate[], rootDir: string, -): Array<{ file: string; hash: string; mtime: number; size: number }> { - const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = []; + caller: string, +): Generator { const seen = new Set(); for (const item of filesToParse) { @@ -117,47 +163,53 @@ export function buildFileHashes( seen.add(relPath); const precomputed = precomputedData.get(relPath); - if (precomputed?._reverseDepOnly) { - continue; // file unchanged, hash already correct - } - if (precomputed?.hash) { - let mtime: number; - let size: number; - if (precomputed.stat) { - mtime = precomputed.stat.mtime; - size = precomputed.stat.size; - } else { - const rawStat = fileStat(path.join(rootDir, relPath)); - mtime = rawStat ? rawStat.mtime : 0; - size = rawStat ? rawStat.size : 0; - } - fileHashes.push({ file: relPath, hash: precomputed.hash, mtime, size }); - } else { - const absPath = path.join(rootDir, relPath); - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`buildFileHashes: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); - code = null; - } - if (code !== null) { - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - fileHashes.push({ file: relPath, hash: fileHash(code), mtime, size }); - } - } + if (precomputed?._reverseDepOnly) continue; + + const record = resolveHashFromPrecomputed( + relPath, + precomputed ?? ({} as PrecomputedFileData), + rootDir, + caller, + ); + if (record) yield record; } - // Also include metadata-only updates (self-heal mtime/size without re-parse) + // Metadata-only updates (self-heal mtime/size without re-parse) for (const item of metadataUpdates) { - const mtime = item.stat ? item.stat.mtime : 0; - const size = item.stat ? 
item.stat.size : 0; - fileHashes.push({ file: item.relPath, hash: item.hash, mtime, size }); + yield { + file: item.relPath, + hash: item.hash, + mtime: item.stat ? item.stat.mtime : 0, + size: item.stat ? item.stat.size : 0, + }; } +} - return fileHashes; +/** + * Build file hash entries for every collected file, including those that + * produced zero symbols (empty files, parsers that silently no-op'd, or + * optional-language extensions whose grammar wasn't installed). Iterating the + * symbol map instead would skip such files and leave them missing from + * `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on + * any subsequent no-op rebuild (#1068). + * + * Exported for unit testing. + */ +export function buildFileHashes( + filesToParse: FileToParse[], + precomputedData: Map, + metadataUpdates: MetadataUpdate[], + rootDir: string, +): FileHashRecord[] { + return [ + ...iterFileHashRecords( + filesToParse, + precomputedData, + metadataUpdates, + rootDir, + 'buildFileHashes', + ), + ]; } // ── Native fast-path ───────────────────────────────────────────────── @@ -260,36 +312,38 @@ function insertDefinitionsAndExports( // ── JS fallback: Phase 2+3 ────────────────────────────────────────── -function insertChildrenAndEdges( +/** Build the in-memory `name|kind|line` → node-id map for a single file. */ +function loadFileNodeIdMap(db: BetterSqlite3Database, relPath: string): Map { + const map = new Map(); + for (const row of bulkNodeIdsByFile(db, relPath)) { + map.set(`${row.name}|${row.kind}|${row.line}`, row.id); + } + return map; +} + +/** + * First pass: for every file, emit file→def containment edges and collect + * the child-node insertion rows. 
+ */ +function collectChildRowsAndFileEdges( db: BetterSqlite3Database, allSymbols: Map, + childRows: unknown[][], + edgeRows: unknown[][], ): void { - const childRows: unknown[][] = []; - const edgeRows: unknown[][] = []; - for (const [relPath, symbols] of allSymbols) { - // First pass: collect file→def edges and child rows - const nodeIdMap = new Map(); - for (const row of bulkNodeIdsByFile(db, relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } - + const nodeIdMap = loadFileNodeIdMap(db, relPath); const fileId = nodeIdMap.get(`${relPath}|file|0`); for (const def of symbols.definitions) { const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); - // Containment edge: file -> definition if (fileId && defId) { edgeRows.push([fileId, defId, 'contains', 1.0, 0]); } - - if (!def.children?.length) continue; - if (!defId) continue; + if (!def.children?.length || !defId) continue; for (const child of def.children) { - // Child node - const qualifiedName = `${def.name}.${child.name}`; childRows.push([ child.name, child.kind, @@ -297,39 +351,55 @@ function insertChildrenAndEdges( child.line, child.endLine || null, defId, - qualifiedName, + `${def.name}.${child.name}`, def.name, child.visibility || null, ]); } } } +} - // Insert children first (so they exist for edge lookup) - batchInsertNodes(db, childRows); - - // Now re-fetch IDs to include newly-inserted children, then add child edges +/** + * Second pass (after child nodes have been inserted): emit def→child + * containment edges and child→def `parameter_of` edges. 
+ */ +function collectChildEdges( + db: BetterSqlite3Database, + allSymbols: Map, + edgeRows: unknown[][], +): void { for (const [relPath, symbols] of allSymbols) { - const nodeIdMap = new Map(); - for (const row of bulkNodeIdsByFile(db, relPath)) { - nodeIdMap.set(`${row.name}|${row.kind}|${row.line}`, row.id); - } + const nodeIdMap = loadFileNodeIdMap(db, relPath); for (const def of symbols.definitions) { if (!def.children?.length) continue; const defId = nodeIdMap.get(`${def.name}|${def.kind}|${def.line}`); if (!defId) continue; for (const child of def.children) { const childId = nodeIdMap.get(`${child.name}|${child.kind}|${child.line}`); - if (childId) { - edgeRows.push([defId, childId, 'contains', 1.0, 0]); - if (child.kind === 'parameter') { - edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]); - } + if (!childId) continue; + edgeRows.push([defId, childId, 'contains', 1.0, 0]); + if (child.kind === 'parameter') { + edgeRows.push([childId, defId, 'parameter_of', 1.0, 0]); } } } } +} + +function insertChildrenAndEdges( + db: BetterSqlite3Database, + allSymbols: Map, +): void { + const childRows: unknown[][] = []; + const edgeRows: unknown[][] = []; + collectChildRowsAndFileEdges(db, allSymbols, childRows, edgeRows); + + // Insert children first (so they exist for edge lookup) + batchInsertNodes(db, childRows); + + collectChildEdges(db, allSymbols, edgeRows); batchInsertEdges(db, edgeRows); } @@ -348,50 +418,14 @@ function updateFileHashes( // Iterate every collected file (#1068): files that produced zero symbols // (empty, parser no-op, or grammar-missing optional language) still need a // hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects. - const seen = new Set(); - for (const item of filesToParse) { - const relPath = item.relPath ?? 
normalizePath(path.relative(rootDir, item.file)); - if (seen.has(relPath)) continue; - seen.add(relPath); - - const precomputed = precomputedData.get(relPath); - if (precomputed?._reverseDepOnly) { - // no-op: file unchanged, hash already correct - } else if (precomputed?.hash) { - let mtime: number; - let size: number; - if (precomputed.stat) { - mtime = precomputed.stat.mtime; - size = precomputed.stat.size; - } else { - const rawStat = fileStat(path.join(rootDir, relPath)); - mtime = rawStat ? rawStat.mtime : 0; - size = rawStat ? rawStat.size : 0; - } - upsertHash.run(relPath, precomputed.hash, mtime, size); - } else { - const absPath = path.join(rootDir, relPath); - let code: string | null; - try { - code = readFileSafe(absPath); - } catch (e) { - debug(`updateFileHashes: readFileSafe failed for ${relPath}: ${toErrorMessage(e)}`); - code = null; - } - if (code !== null) { - const stat = fileStat(absPath); - const mtime = stat ? stat.mtime : 0; - const size = stat ? stat.size : 0; - upsertHash.run(relPath, fileHash(code), mtime, size); - } - } - } - - // Also update metadata-only entries (self-heal mtime/size without re-parse) - for (const item of metadataUpdates) { - const mtime = item.stat ? item.stat.mtime : 0; - const size = item.stat ? item.stat.size : 0; - upsertHash.run(item.relPath, item.hash, mtime, size); + for (const record of iterFileHashRecords( + filesToParse, + precomputedData, + metadataUpdates, + rootDir, + 'updateFileHashes', + )) { + upsertHash.run(record.file, record.hash, record.mtime, record.size); } } From 40d418d149d06c127d7ce430f4863d5b72b93edc Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:50:10 -0600 Subject: [PATCH 12/27] refactor(graph): extract helpers in cycles and journal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docs check acknowledged — no doc-relevant changes (internal helper extraction). 
--- src/domain/graph/cycles.ts | 100 +++++++++++------------ src/domain/graph/journal.ts | 153 ++++++++++++++++++++---------------- 2 files changed, 135 insertions(+), 118 deletions(-) diff --git a/src/domain/graph/cycles.ts b/src/domain/graph/cycles.ts index 4ccc872f2..bb4d61168 100644 --- a/src/domain/graph/cycles.ts +++ b/src/domain/graph/cycles.ts @@ -3,6 +3,45 @@ import { loadNative } from '../../infrastructure/native.js'; import { isTestFile } from '../../infrastructure/test-filter.js'; import type { BetterSqlite3Database } from '../../types.js'; +type Edge = { source: string; target: string }; +type DbEdge = { source_id: number; target_id: number }; + +/** + * Build a label-based edge list from DB rows, filtering to known nodes and + * deduplicating. Self-loops are skipped (Tarjan treats them as trivial SCCs). + */ +function buildLabelEdges(dbEdges: DbEdge[], idToLabel: Map): Edge[] { + const edges: Edge[] = []; + const seen = new Set(); + for (const e of dbEdges) { + if (e.source_id === e.target_id) continue; + const src = idToLabel.get(e.source_id); + const tgt = idToLabel.get(e.target_id); + if (src === undefined || tgt === undefined) continue; + const key = `${src}\0${tgt}`; + if (seen.has(key)) continue; + seen.add(key); + edges.push({ source: src, target: tgt }); + } + return edges; +} + +function buildFileLevelEdges(db: BetterSqlite3Database, noTests: boolean): Edge[] { + let nodes = getFileNodesAll(db); + if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); + const idToLabel = new Map(); + for (const n of nodes) idToLabel.set(n.id, n.file); + return buildLabelEdges(getImportEdges(db), idToLabel); +} + +function buildCallableEdges(db: BetterSqlite3Database, noTests: boolean): Edge[] { + let nodes = getCallableNodes(db); + if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); + const idToLabel = new Map(); + for (const n of nodes) idToLabel.set(n.id, `${n.name}|${n.file}`); + return buildLabelEdges(getCallEdges(db), idToLabel); +} + 
/** * Find cycles using Tarjan's SCC algorithm. * @@ -16,66 +55,20 @@ export function findCycles( const fileLevel = opts.fileLevel !== false; const noTests = opts.noTests || false; - const edges: Array<{ source: string; target: string }> = []; - const seen = new Set(); - - if (fileLevel) { - let nodes = getFileNodesAll(db); - if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); - const nodeIds = new Set(); - const idToFile = new Map(); - for (const n of nodes) { - nodeIds.add(n.id); - idToFile.set(n.id, n.file); - } - for (const e of getImportEdges(db)) { - if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue; - if (e.source_id === e.target_id) continue; - const src = idToFile.get(e.source_id)!; - const tgt = idToFile.get(e.target_id)!; - const key = `${src}\0${tgt}`; - if (seen.has(key)) continue; - seen.add(key); - edges.push({ source: src, target: tgt }); - } - } else { - let nodes = getCallableNodes(db); - if (noTests) nodes = nodes.filter((n) => !isTestFile(n.file)); - const nodeIds = new Set(); - const idToLabel = new Map(); - for (const n of nodes) { - nodeIds.add(n.id); - idToLabel.set(n.id, `${n.name}|${n.file}`); - } - for (const e of getCallEdges(db)) { - if (!nodeIds.has(e.source_id) || !nodeIds.has(e.target_id)) continue; - if (e.source_id === e.target_id) continue; - const src = idToLabel.get(e.source_id)!; - const tgt = idToLabel.get(e.target_id)!; - const key = `${src}\0${tgt}`; - if (seen.has(key)) continue; - seen.add(key); - edges.push({ source: src, target: tgt }); - } - } + const edges = fileLevel ? 
buildFileLevelEdges(db, noTests) : buildCallableEdges(db, noTests); const native = loadNative(); if (native) { return native.detectCycles(edges) as string[][]; } - return tarjanFromEdges(edges); } -export function findCyclesJS(edges: Array<{ source: string; target: string }>): string[][] { +export function findCyclesJS(edges: Edge[]): string[][] { return tarjanFromEdges(edges); } -/** - * Run Tarjan's SCC on a flat edge list. Returns SCCs with length > 1 (cycles). - * Uses a simple adjacency-list Map instead of a full CodeGraph. - */ -function tarjanFromEdges(edges: Array<{ source: string; target: string }>): string[][] { +function buildAdjacency(edges: Edge[]): { adj: Map; allNodes: Set } { const adj = new Map(); const allNodes = new Set(); for (const { source, target } of edges) { @@ -88,6 +81,15 @@ function tarjanFromEdges(edges: Array<{ source: string; target: string }>): stri } list.push(target); } + return { adj, allNodes }; +} + +/** + * Run Tarjan's SCC on a flat edge list. Returns SCCs with length > 1 (cycles). + * Uses a simple adjacency-list Map instead of a full CodeGraph. + */ +function tarjanFromEdges(edges: Edge[]): string[][] { + const { adj, allNodes } = buildAdjacency(edges); let index = 0; const stack: string[] = []; diff --git a/src/domain/graph/journal.ts b/src/domain/graph/journal.ts index 900e33546..d20c7dab9 100644 --- a/src/domain/graph/journal.ts +++ b/src/domain/graph/journal.ts @@ -91,62 +91,69 @@ function trySteal(lockPath: string): AcquiredLock | null { return { fd, nonce }; } -function acquireJournalLock(lockPath: string): AcquiredLock { - const start = Date.now(); - for (;;) { - const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; +/** + * Try to create the lockfile fresh via `wx`. Returns the acquired lock on + * success, `null` if another holder exists, or throws on unexpected errors. 
+ * + * If the stamp write fails (ENOSPC, I/O error) we release the empty file — + * leaving it would look stale to concurrent waiters and admit double-acquire. + */ +function tryFreshAcquire(lockPath: string): AcquiredLock | null { + const nonce = `${process.pid}-${crypto.randomBytes(8).toString('hex')}`; + let fd: number; + try { + fd = fs.openSync(lockPath, 'wx'); + } catch (e) { + if ((e as NodeJS.ErrnoException).code === 'EEXIST') return null; + throw e; + } + try { + fs.writeSync(fd, `${process.pid}\n${nonce}\n`); + } catch { try { - const fd = fs.openSync(lockPath, 'wx'); - try { - fs.writeSync(fd, `${process.pid}\n${nonce}\n`); - } catch { - // Stamp write failed (ENOSPC, I/O error). An empty lockfile would - // look stale to concurrent waiters (Number('') === 0, isPidAlive(0) - // returns false), so they'd steal our live lock. Release and retry. - try { - fs.closeSync(fd); - } catch { - /* ignore */ - } - try { - fs.unlinkSync(lockPath); - } catch { - /* ignore */ - } - if (Date.now() - start > LOCK_TIMEOUT_MS) { - throw new Error( - `Failed to acquire journal lock at ${lockPath} within ${LOCK_TIMEOUT_MS}ms`, - ); - } - sleepSync(LOCK_RETRY_MS); - continue; - } - return { fd, nonce }; - } catch (e) { - if ((e as NodeJS.ErrnoException).code !== 'EEXIST') throw e; + fs.closeSync(fd); + } catch { + /* ignore */ } - - let holderAlive = true; try { - const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); - holderAlive = isPidAlive(Number(pidContent)); + fs.unlinkSync(lockPath); } catch { - /* unreadable — fall through to age check */ + /* ignore */ } + return null; + } + return { fd, nonce }; +} - let shouldSteal = !holderAlive; - if (holderAlive) { - try { - const stat = fs.statSync(lockPath); - if (Date.now() - stat.mtimeMs > LOCK_STALE_MS) { - shouldSteal = true; - } - } catch { - /* stat failed — keep retrying */ - } - } +/** + * Decide whether the current lock holder is stale and should be stolen. 
+ * Returns true if the PID is dead, or if the lockfile mtime exceeds the + * staleness threshold. + */ +function isLockStale(lockPath: string): boolean { + let holderAlive = true; + try { + const pidContent = fs.readFileSync(lockPath, 'utf-8').split('\n')[0]!.trim(); + holderAlive = isPidAlive(Number(pidContent)); + } catch { + /* unreadable — fall through to age check */ + } + if (!holderAlive) return true; + try { + const stat = fs.statSync(lockPath); + return Date.now() - stat.mtimeMs > LOCK_STALE_MS; + } catch { + return false; + } +} - if (shouldSteal) { +function acquireJournalLock(lockPath: string): AcquiredLock { + const start = Date.now(); + for (;;) { + const fresh = tryFreshAcquire(lockPath); + if (fresh) return fresh; + + if (isLockStale(lockPath)) { const stolen = trySteal(lockPath); if (stolen) return stolen; // Steal failed or lost the race — fall through to timeout check & retry. @@ -227,27 +234,20 @@ interface JournalResult { removed?: string[]; } -export function readJournal(rootDir: string): JournalResult { - const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); - let content: string; - try { - content = fs.readFileSync(journalPath, 'utf-8'); - } catch { - return { valid: false }; - } - - const lines = content.split('\n'); - if (lines.length === 0 || !lines[0]!.startsWith(HEADER_PREFIX)) { +function parseJournalHeader(firstLine: string | undefined): number | null { + if (!firstLine || !firstLine.startsWith(HEADER_PREFIX)) { debug('Journal has malformed or missing header'); - return { valid: false }; + return null; } - - const timestamp = Number(lines[0]!.slice(HEADER_PREFIX.length).trim()); + const timestamp = Number(firstLine.slice(HEADER_PREFIX.length).trim()); if (!Number.isFinite(timestamp) || timestamp <= 0) { debug('Journal has invalid timestamp'); - return { valid: false }; + return null; } + return timestamp; +} +function parseJournalBody(lines: string[]): { changed: string[]; removed: string[] } { const changed: 
string[] = []; const removed: string[] = []; const seenChanged = new Set(); @@ -263,14 +263,29 @@ export function readJournal(rootDir: string): JournalResult { seenRemoved.add(filePath); removed.push(filePath); } - } else { - if (!seenChanged.has(line)) { - seenChanged.add(line); - changed.push(line); - } + } else if (!seenChanged.has(line)) { + seenChanged.add(line); + changed.push(line); } } + return { changed, removed }; +} + +export function readJournal(rootDir: string): JournalResult { + const journalPath = path.join(rootDir, '.codegraph', JOURNAL_FILENAME); + let content: string; + try { + content = fs.readFileSync(journalPath, 'utf-8'); + } catch { + return { valid: false }; + } + + const lines = content.split('\n'); + const timestamp = parseJournalHeader(lines[0]); + if (timestamp === null) return { valid: false }; + + const { changed, removed } = parseJournalBody(lines); return { valid: true, timestamp, changed, removed }; } From b3c36f41e0d410eb23022fd4197c7bf4ff82cc57 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 12:54:24 -0600 Subject: [PATCH 13/27] refactor(core-rs): collapse walker mutual recursion into single-entry traversal --- crates/codegraph-core/src/dataflow.rs | 73 ++++++++++----------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index 26ea2d21c..091b44dd0 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -971,6 +971,12 @@ struct DataflowOutput { mutations: Vec, } +/// Single-entry DFS traversal for dataflow extraction. +/// +/// Dispatches the current node to the appropriate handler, then recurses into +/// named children by calling `visit` directly. Children are walked inline to +/// avoid a `visit` <-> `visit_children` mutual-recursion cycle (single entry +/// point, single recursive call site). 
#[allow(clippy::too_many_arguments)] fn visit( node: &Node, @@ -985,66 +991,43 @@ fn visit( } let t = node.kind(); + let mut entered_scope = false; - // Enter function scope + // Dispatch to handler for this node kind. Children are always visited + // afterwards via the loop below — handlers must not recurse themselves. if is_function_node(rules, t) { enter_scope(node, rules, source, scope_stack, &mut out.parameters); - visit_children(node, rules, source, scope_stack, out, depth); - scope_stack.pop(); - return; - } - - // Return statements - if rules.return_node.is_some_and(|r| r == t) { + entered_scope = true; + } else if rules.return_node.is_some_and(|r| r == t) { handle_return_stmt(node, rules, source, scope_stack, &mut out.returns, depth); - visit_children(node, rules, source, scope_stack, out, depth); - return; - } - - // Variable declarations (single or multi-type) - if rules.var_declarator_node.is_some_and(|v| v == t) + } else if rules.var_declarator_node.is_some_and(|v| v == t) || (!rules.var_declarator_nodes.is_empty() && rules.var_declarator_nodes.contains(&t)) { handle_var_declarator(node, rules, source, scope_stack, &mut out.assignments); - visit_children(node, rules, source, scope_stack, out, depth); - return; - } - - // Call expressions - if is_call_node(rules, t) { + } else if is_call_node(rules, t) { handle_call_expr(node, rules, source, scope_stack, &mut out.arg_flows); - visit_children(node, rules, source, scope_stack, out, depth); - return; - } - - // Assignment expressions - if rules.assignment_node.is_some_and(|a| a == t) { - handle_assignment(node, rules, source, scope_stack, &mut out.assignments, &mut out.mutations); - visit_children(node, rules, source, scope_stack, out, depth); - return; - } - - // Mutation detection via expression_statement - if t == rules.expression_stmt_node { + } else if rules.assignment_node.is_some_and(|a| a == t) { + handle_assignment( + node, + rules, + source, + scope_stack, + &mut out.assignments, + &mut 
out.mutations, + ); + } else if t == rules.expression_stmt_node { handle_expr_stmt_mutation(node, rules, source, scope_stack, &mut out.mutations); } - visit_children(node, rules, source, scope_stack, out, depth); -} - -/// Visit all named children of a node (shared DFS recursion helper). -fn visit_children( - node: &Node, - rules: &DataflowRules, - source: &[u8], - scope_stack: &mut Vec, - out: &mut DataflowOutput, - depth: usize, -) { + // Recurse into named children inline — no helper indirection, no cycle. let cursor = &mut node.walk(); for child in node.named_children(cursor) { visit(&child, rules, source, scope_stack, out, depth + 1); } + + if entered_scope { + scope_stack.pop(); + } } /// Handle a return statement: extract expression and referenced names. From b49cab5dd94e96bc6b7b1e278df6f179f662800f Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:15:47 -0600 Subject: [PATCH 14/27] refactor(core-rs): decompose pipeline, read queries, and edge builders docs check acknowledged - Rust internal helper extraction, no user-facing changes --- crates/codegraph-core/src/build_pipeline.rs | 1026 +++++++------ crates/codegraph-core/src/edge_builder.rs | 243 +-- crates/codegraph-core/src/graph_algorithms.rs | 109 +- crates/codegraph-core/src/import_edges.rs | 236 +-- .../codegraph-core/src/import_resolution.rs | 125 +- crates/codegraph-core/src/read_queries.rs | 1347 ++++++++++------- crates/codegraph-core/src/structure.rs | 401 +++-- 7 files changed, 2001 insertions(+), 1486 deletions(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index dba6e7f28..8691611d6 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -96,21 +96,23 @@ fn relative_path(root_dir: &str, abs_path: &str) -> String { } } -/// Run the full build pipeline in Rust. -/// -/// Called from `NativeDatabase.build_graph()` via napi. 
-pub fn run_pipeline( +/// Deserialized pipeline inputs assembled in Stage 1. +struct PipelineSetup { + config: BuildConfig, + napi_aliases: crate::types::PathAliases, + opts: BuildOpts, + incremental: bool, + include_dataflow: bool, + include_ast: bool, + force_full_rebuild: bool, +} + +fn pipeline_setup( conn: &Connection, - root_dir: &str, config_json: &str, aliases_json: &str, opts_json: &str, -) -> Result { - let total_start = Instant::now(); - let mut timing = PipelineTiming::default(); - - // ── Stage 1: Deserialize config ──────────────────────────────────── - let t0 = Instant::now(); +) -> Result { let config: BuildConfig = serde_json::from_str(config_json).map_err(|e| format!("config parse error: {e}"))?; let aliases: BuildPathAliases = @@ -122,9 +124,315 @@ pub fn run_pipeline( let incremental = opts.incremental.unwrap_or(config.build.incremental); let include_dataflow = opts.dataflow.unwrap_or(true); let include_ast = opts.ast.unwrap_or(true); - - // Check engine/schema/version mismatch for forced full rebuild let force_full_rebuild = check_version_mismatch(conn); + + Ok(PipelineSetup { + config, + napi_aliases, + opts, + incremental, + include_dataflow, + include_ast, + force_full_rebuild, + }) +} + +/// Build a no-op early-exit result when no source files changed and we are +/// in an incremental build with no removals. Mirrors the early-exit branch +/// in `run_pipeline` exactly so it can be lifted out without behaviour change. 
+fn early_exit_result( + file_count: usize, + timing: PipelineTiming, + conn: &Connection, + root_dir: &str, + metadata_updates: &[change_detection::MetadataUpdate], +) -> BuildPipelineResult { + change_detection::heal_metadata(conn, metadata_updates); + journal::write_journal_header(root_dir, now_ms()); + BuildPipelineResult { + phases: timing, + node_count: 0, + edge_count: 0, + file_count, + early_exit: true, + changed_files: Some(vec![]), + changed_count: 0, + removed_count: 0, + is_full_build: false, + structure_handled: true, + analysis_complete: true, + } +} + +/// Save reverse-dep edges (and reverse-deps of removed files) before purging +/// changed files. Mirrors the JS save-then-purge sequence in `build-edges.ts` +/// (#1012). Returns `(saved_reverse_dep_edges, removal_reverse_deps)` so the +/// pipeline can reconnect them after Stage 5 and reclassify roles in Stage 8. +fn save_and_purge_changed( + conn: &Connection, + parse_changes: &[&change_detection::ChangedFile], + change_result: &change_detection::ChangeResult, + opts: &BuildOpts, + root_dir: &str, +) -> (Vec, Vec) { + let mut saved_reverse_dep_edges: Vec = Vec::new(); + let mut removal_reverse_deps: Vec = Vec::new(); + + if change_result.is_full_build { + let has_embeddings = change_detection::has_embeddings(conn); + change_detection::clear_all_graph_data(conn, has_embeddings); + return (saved_reverse_dep_edges, removal_reverse_deps); + } + + let changed_paths: Vec = parse_changes.iter().map(|c| c.rel_path.clone()).collect(); + + if !opts.no_reverse_deps.unwrap_or(false) { + saved_reverse_dep_edges = change_detection::save_reverse_dep_edges(conn, &changed_paths); + + if !change_result.removed.is_empty() { + let removed_set: HashSet = change_result.removed.iter().cloned().collect(); + removal_reverse_deps = + change_detection::find_reverse_dependencies(conn, &removed_set, root_dir) + .into_iter() + .collect(); + } + } + + let files_to_purge: Vec = change_result + .removed + .iter() + 
.chain(parse_changes.iter().map(|c| &c.rel_path)) + .cloned() + .collect(); + change_detection::purge_changed_files(conn, &files_to_purge, &[]); + + (saved_reverse_dep_edges, removal_reverse_deps) +} + +/// Parse a changed-file slice in parallel and key the results by relative path. +fn parse_and_index_files( + parse_changes: &[&change_detection::ChangedFile], + root_dir: &str, + include_dataflow: bool, + include_ast: bool, +) -> HashMap { + let files_to_parse: Vec = + parse_changes.iter().map(|c| c.abs_path.clone()).collect(); + let parsed = + parallel::parse_files_parallel(&files_to_parse, root_dir, include_dataflow, include_ast); + let mut file_symbols: HashMap = HashMap::new(); + for mut sym in parsed { + let rel = relative_path(root_dir, &sym.file); + sym.file = rel.clone(); + file_symbols.insert(rel, sym); + } + file_symbols +} + +/// Build the batched import-resolution input set and run resolution, returning +/// `(batch_resolved, known_files)`. Mirrors stage 6 of `run_pipeline`. 
+fn resolve_pipeline_imports( + file_symbols: &HashMap, + collect_files: &[String], + root_dir: &str, + napi_aliases: &crate::types::PathAliases, +) -> (HashMap, HashSet) { + let mut batch_inputs: Vec = Vec::new(); + for (rel_path, symbols) in file_symbols { + let abs_file = Path::new(root_dir).join(rel_path); + let abs_str = abs_file.to_str().unwrap_or("").replace('\\', "/"); + for imp in &symbols.imports { + batch_inputs.push(ImportResolutionInput { + from_file: abs_str.clone(), + import_source: imp.source.clone(), + }); + } + } + let known_files: HashSet = + collect_files.iter().map(|f| relative_path(root_dir, f)).collect(); + let resolved = + import_resolution::resolve_imports_batch(&batch_inputs, root_dir, napi_aliases, Some(&known_files)); + let mut batch_resolved: HashMap = HashMap::new(); + for r in &resolved { + let key = format!("{}|{}", r.from_file, r.import_source); + batch_resolved.insert(key, r.resolved_path.clone()); + } + (batch_resolved, known_files) +} + +/// Reconnect any saved reverse-dep edges to the new target node IDs (#1012). +fn reconnect_saved_reverse_dep_edges( + conn: &Connection, + saved: &[change_detection::SavedReverseDepEdge], +) { + if saved.is_empty() { + return; + } + let (reconnected, dropped) = change_detection::reconnect_reverse_dep_edges(conn, saved); + if dropped > 0 { + eprintln!( + "[codegraph] reconnect_reverse_dep_edges: {reconnected} reconnected, {dropped} dropped (target nodes not found)" + ); + } +} + +/// Stage 8 (structure): decide between the fast incremental path and a full +/// structure rebuild based on the same gates as the JS pipeline. The change +/// set is read from `file_symbols.keys()` because only truly-changed files +/// are present (reverse-deps are reconnected, not re-parsed). 
+fn run_structure_phase( + conn: &Connection, + file_symbols: &HashMap, + collect_directories: &HashSet, + root_dir: &str, + line_count_map: &HashMap, + parse_changes_len: usize, + is_full_build: bool, +) { + let changed_files: Vec = file_symbols.keys().cloned().collect(); + let existing_file_count = structure::get_existing_file_count(conn); + let use_fast_path = !is_full_build + && parse_changes_len <= FAST_PATH_MAX_CHANGED_FILES + && existing_file_count > FAST_PATH_MIN_EXISTING_FILES; + + if use_fast_path { + structure::update_changed_file_metrics(conn, &changed_files, line_count_map, file_symbols); + } else { + let changed_for_structure: Option> = if is_full_build { + None + } else { + Some(changed_files.clone()) + }; + structure::build_full_structure( + conn, + file_symbols, + collect_directories, + root_dir, + line_count_map, + changed_for_structure.as_deref(), + ); + } +} + +/// Stage 8 (roles): classify roles for the affected file set. Removal +/// reverse-deps need to be seeded explicitly because their fan-in/out can +/// no longer be discovered via neighbour expansion once the deleted file's +/// nodes are gone (#1027). +fn run_role_classification( + conn: &Connection, + file_symbols: &HashMap, + removal_reverse_deps: Vec, + is_full_build: bool, +) { + let changed_files: Vec = file_symbols.keys().cloned().collect(); + let changed_file_list: Option> = if is_full_build { + None + } else { + let mut files = changed_files; + if !removal_reverse_deps.is_empty() { + let existing: HashSet = files.iter().cloned().collect(); + for f in removal_reverse_deps { + if !existing.contains(&f) { + files.push(f); + } + } + } + Some(files) + }; + if let Some(ref files) = changed_file_list { + if !files.is_empty() { + let _ = roles_db::do_classify_incremental(conn, files); + } + } else { + let _ = roles_db::do_classify_full(conn); + } +} + +/// Stage 8b: persist AST, complexity, CFG, and dataflow data for the +/// analysis scope. 
Returns `(do_analysis, analysis_ok)` so the caller can +/// compute `analysis_complete`. +fn run_analysis_persistence( + conn: &Connection, + file_symbols: &HashMap, + analysis_scope: Option<&Vec>, + opts: &BuildOpts, + include_ast: bool, + include_dataflow: bool, + timing: &mut PipelineTiming, +) -> (bool, bool) { + let include_complexity = opts.complexity.unwrap_or(true); + let include_cfg = opts.cfg.unwrap_or(true); + let do_analysis = include_ast || include_dataflow || include_cfg || include_complexity; + if !do_analysis { + return (false, true); + } + + let analysis_file_set: HashSet<&str> = match analysis_scope { + Some(files) => files.iter().map(|s| s.as_str()).collect(), + None => file_symbols.keys().map(|s| s.as_str()).collect(), + }; + + let node_id_map = build_analysis_node_map(conn, &analysis_file_set); + let mut analysis_ok = true; + + if include_ast { + let t0 = Instant::now(); + let ast_batches = build_ast_batches(file_symbols, &analysis_file_set); + if ast_db::do_insert_ast_nodes(conn, &ast_batches).is_err() { + analysis_ok = false; + } + timing.ast_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + if include_complexity { + let t0 = Instant::now(); + if !write_complexity(conn, file_symbols, &analysis_file_set, &node_id_map) { + analysis_ok = false; + } + timing.complexity_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + if include_cfg { + let t0 = Instant::now(); + if !write_cfg(conn, file_symbols, &analysis_file_set, &node_id_map) { + analysis_ok = false; + } + timing.cfg_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + if include_dataflow { + let t0 = Instant::now(); + if !write_dataflow(conn, file_symbols, &analysis_file_set) { + analysis_ok = false; + } + timing.dataflow_ms = t0.elapsed().as_secs_f64() * 1000.0; + } + + (do_analysis, analysis_ok) +} + +/// Run the full build pipeline in Rust. +/// +/// Called from `NativeDatabase.build_graph()` via napi. 
+pub fn run_pipeline( + conn: &Connection, + root_dir: &str, + config_json: &str, + aliases_json: &str, + opts_json: &str, +) -> Result { + let total_start = Instant::now(); + let mut timing = PipelineTiming::default(); + + // ── Stage 1: Deserialize config ──────────────────────────────────── + let t0 = Instant::now(); + let setup = pipeline_setup(conn, config_json, aliases_json, opts_json)?; + let PipelineSetup { + config, + napi_aliases, + opts, + incremental, + include_dataflow, + include_ast, + force_full_rebuild, + } = setup; timing.setup_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 2: Collect files ───────────────────────────────────────── @@ -162,89 +470,27 @@ pub fn run_pipeline( // Early exit: no changes if !change_result.is_full_build && parse_changes.is_empty() && change_result.removed.is_empty() { - // Heal metadata if needed - change_detection::heal_metadata(conn, &change_result.metadata_updates); - journal::write_journal_header(root_dir, now_ms()); - return Ok(BuildPipelineResult { - phases: timing, - node_count: 0, - edge_count: 0, - file_count: collect_result.files.len(), - early_exit: true, - changed_files: Some(vec![]), - changed_count: 0, - removed_count: 0, - is_full_build: false, - structure_handled: true, - analysis_complete: true, - }); + return Ok(early_exit_result( + collect_result.files.len(), + timing, + conn, + root_dir, + &change_result.metadata_updates, + )); } - // Save reverse-dep → changed-file edges before purge so we can reconnect - // them to new node IDs after Stage 5 (#1012). This matches the WASM/JS - // strategy and lets us skip re-parsing reverse-dep files entirely: - // parse/insert/structure/roles/analysis all scope to truly-changed files. - let mut saved_reverse_dep_edges: Vec = Vec::new(); - // Files that import a removed file. Save+reconnect doesn't apply (the - // target node is gone for good), but their role records go stale because - // edges to the deleted file's nodes get purged in Stage 3. 
Reclassify them - // in Stage 8 so fan-out reflects reality. (#1027 review) - let mut removal_reverse_deps: Vec = Vec::new(); - - // Handle full build: clear all graph data - if change_result.is_full_build { - let has_embeddings = change_detection::has_embeddings(conn); - change_detection::clear_all_graph_data(conn, has_embeddings); - } else { - // Incremental: save reverse-dep edges (if reverse-dep tracking is enabled), - // then purge changed files only. - let changed_paths: Vec = - parse_changes.iter().map(|c| c.rel_path.clone()).collect(); - - if !opts.no_reverse_deps.unwrap_or(false) { - saved_reverse_dep_edges = - change_detection::save_reverse_dep_edges(conn, &changed_paths); - - if !change_result.removed.is_empty() { - let removed_set: HashSet = - change_result.removed.iter().cloned().collect(); - removal_reverse_deps = - change_detection::find_reverse_dependencies(conn, &removed_set, root_dir) - .into_iter() - .collect(); - } - } - - let files_to_purge: Vec = change_result - .removed - .iter() - .chain(parse_changes.iter().map(|c| &c.rel_path)) - .cloned() - .collect(); - // Pass empty reverse_dep_files: purge already deletes both directions - // for changed files (which removes the saved reverse-dep → changed-file - // edges from the live table), and other outgoing edges from reverse-dep - // files remain valid and must NOT be deleted — they will be reconnected - // to new target IDs after insert. - change_detection::purge_changed_files(conn, &files_to_purge, &[]); - } + // Stage 3b: save reverse-dep edges (incremental) or clear all (full), + // then purge changed files. Returns the saved edges for Stage 7 + // reconnect and the removal reverse-dep set for Stage 8 reclassification. + let (saved_reverse_dep_edges, removal_reverse_deps) = + save_and_purge_changed(conn, &parse_changes, &change_result, &opts, root_dir); // ── Stage 4: Parse files ─────────────────────────────────────────── // Only truly-changed files are parsed. 
Reverse-dep files are not re-parsed — // their edges to changed files are reconstructed via save+reconnect (#1012). let t0 = Instant::now(); - let files_to_parse: Vec = - parse_changes.iter().map(|c| c.abs_path.clone()).collect(); - let parsed = - parallel::parse_files_parallel(&files_to_parse, root_dir, include_dataflow, include_ast); - - // Build file symbols map (relative path → FileSymbols) - let mut file_symbols: HashMap = HashMap::new(); - for mut sym in parsed { - let rel = relative_path(root_dir, &sym.file); - sym.file = rel.clone(); - file_symbols.insert(rel, sym); - } + let mut file_symbols = + parse_and_index_files(&parse_changes, root_dir, include_dataflow, include_ast); timing.parse_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 5: Insert nodes ────────────────────────────────────────── @@ -257,44 +503,13 @@ pub fn run_pipeline( &file_hashes, &change_result.removed, ); - // Also heal metadata-only updates change_detection::heal_metadata(conn, &change_result.metadata_updates); timing.insert_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 6: Resolve imports ─────────────────────────────────────── let t0 = Instant::now(); - let mut batch_inputs: Vec = Vec::new(); - for (rel_path, symbols) in &file_symbols { - let abs_file = Path::new(root_dir).join(rel_path); - // Normalize to forward slashes so batch_resolved keys match Stage 6b lookups on Windows. 
- let abs_str = abs_file.to_str().unwrap_or("").replace('\\', "/"); - for imp in &symbols.imports { - batch_inputs.push(ImportResolutionInput { - from_file: abs_str.clone(), - import_source: imp.source.clone(), - }); - } - } - - let known_files: HashSet = collect_result - .files - .iter() - .map(|f| relative_path(root_dir, f)) - .collect(); - - let resolved = import_resolution::resolve_imports_batch( - &batch_inputs, - root_dir, - &napi_aliases, - Some(&known_files), - ); - - // Build batch_resolved map: "absFile|importSource" -> resolved path - let mut batch_resolved: HashMap = HashMap::new(); - for r in &resolved { - let key = format!("{}|{}", r.from_file, r.import_source); - batch_resolved.insert(key, r.resolved_path.clone()); - } + let (mut batch_resolved, known_files) = + resolve_pipeline_imports(&file_symbols, &collect_result.files, root_dir, &napi_aliases); timing.resolve_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 6b: Re-parse barrel candidates (incremental only) ───────── @@ -332,20 +547,7 @@ pub fn run_pipeline( // internal logic. We load nodes from DB and pass to the edge builder. build_and_insert_call_edges(conn, &file_symbols, &import_ctx, !change_result.is_full_build); - // Reconnect saved reverse-dep edges to new node IDs (#1012). Mirrors - // `reconnectReverseDepEdges` in build-edges.ts — for each saved edge, - // look up the new target node and recreate the edge with the original - // source_id (still valid; reverse-dep nodes were never purged). 
- if !saved_reverse_dep_edges.is_empty() { - let (reconnected, dropped) = - change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges); - if dropped > 0 { - eprintln!( - "[codegraph] reconnect_reverse_dep_edges: {reconnected} reconnected, {dropped} dropped (target nodes not found)" - ); - } - } - + reconnect_saved_reverse_dep_edges(conn, &saved_reverse_dep_edges); timing.edges_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 8: Structure + roles ───────────────────────────────────── @@ -354,128 +556,41 @@ pub fn run_pipeline( // file_symbols only contains truly-changed files (reverse-deps are not // re-parsed; their edges are reconnected via save+reconnect — #1012), so // analysis_scope == changed_files. - let changed_files: Vec = file_symbols.keys().cloned().collect(); let analysis_scope: Option> = if change_result.is_full_build { None } else { - Some(changed_files.clone()) + Some(file_symbols.keys().cloned().collect()) }; - - let existing_file_count = structure::get_existing_file_count(conn); - let use_fast_path = - !change_result.is_full_build && parse_changes.len() <= FAST_PATH_MAX_CHANGED_FILES && existing_file_count > FAST_PATH_MIN_EXISTING_FILES; - - if use_fast_path { - structure::update_changed_file_metrics( - conn, - &changed_files, - &line_count_map, - &file_symbols, - ); - } else { - // Full structure: directory nodes, contains edges, file + directory metrics. 
- let changed_for_structure: Option> = if change_result.is_full_build { - None - } else { - Some(changed_files.clone()) - }; - structure::build_full_structure( - conn, - &file_symbols, - &collect_result.directories, - root_dir, - &line_count_map, - changed_for_structure.as_deref(), - ); - } + run_structure_phase( + conn, + &file_symbols, + &collect_result.directories, + root_dir, + &line_count_map, + parse_changes.len(), + change_result.is_full_build, + ); timing.structure_ms = t0.elapsed().as_secs_f64() * 1000.0; let t0 = Instant::now(); - // Role classification needs the truly-changed files plus reverse-deps of - // any removed files. `do_classify_incremental` expands to neighbours via - // the edges table, so reverse-deps of *changed* files are picked up - // automatically when their fan-in/fan-out is affected. Reverse-deps of - // *removed* files have to be added explicitly — the deleted file's nodes - // are gone, so neighbour expansion can't reach the importer. Without this - // seed, removal-only builds skip role classification entirely. (#1027) - let changed_file_list: Option> = if change_result.is_full_build { - None - } else { - let mut files = changed_files; - if !removal_reverse_deps.is_empty() { - let existing: HashSet = files.iter().cloned().collect(); - for f in removal_reverse_deps { - if !existing.contains(&f) { - files.push(f); - } - } - } - Some(files) - }; - if let Some(ref files) = changed_file_list { - if !files.is_empty() { - let _ = roles_db::do_classify_incremental(conn, files); - } - } else { - let _ = roles_db::do_classify_full(conn); - } + run_role_classification( + conn, + &file_symbols, + removal_reverse_deps, + change_result.is_full_build, + ); timing.roles_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 8b: Analysis persistence (AST, complexity, CFG, dataflow) ── - // Write analysis data from parsed file_symbols directly to DB tables, - // eliminating the JS runPostNativeAnalysis step and its WASM re-parse. 
- let include_complexity = opts.complexity.unwrap_or(true); - let include_cfg = opts.cfg.unwrap_or(true); - let do_analysis = include_ast || include_dataflow || include_cfg || include_complexity; - - let mut analysis_ok = true; - if do_analysis { - // Determine which files to analyze (excludes reverse-dep files) - let analysis_file_set: HashSet<&str> = match &analysis_scope { - Some(files) => files.iter().map(|s| s.as_str()).collect(), - None => file_symbols.keys().map(|s| s.as_str()).collect(), - }; - - // Build node ID lookup: (file, name, line) -> node_id - let node_id_map = build_analysis_node_map(conn, &analysis_file_set); - - // AST nodes - if include_ast { - let t0 = Instant::now(); - let ast_batches = build_ast_batches(&file_symbols, &analysis_file_set); - if ast_db::do_insert_ast_nodes(conn, &ast_batches).is_err() { - analysis_ok = false; - } - timing.ast_ms = t0.elapsed().as_secs_f64() * 1000.0; - } - - // Complexity metrics - if include_complexity { - let t0 = Instant::now(); - if !write_complexity(conn, &file_symbols, &analysis_file_set, &node_id_map) { - analysis_ok = false; - } - timing.complexity_ms = t0.elapsed().as_secs_f64() * 1000.0; - } - - // CFG blocks + edges - if include_cfg { - let t0 = Instant::now(); - if !write_cfg(conn, &file_symbols, &analysis_file_set, &node_id_map) { - analysis_ok = false; - } - timing.cfg_ms = t0.elapsed().as_secs_f64() * 1000.0; - } - - // Dataflow edges - if include_dataflow { - let t0 = Instant::now(); - if !write_dataflow(conn, &file_symbols, &analysis_file_set) { - analysis_ok = false; - } - timing.dataflow_ms = t0.elapsed().as_secs_f64() * 1000.0; - } - } + let (do_analysis, analysis_ok) = run_analysis_persistence( + conn, + &file_symbols, + analysis_scope.as_ref(), + &opts, + include_ast, + include_dataflow, + &mut timing, + ); // ── Stage 9: Finalize ────────────────────────────────────────────── let t0 = Instant::now(); @@ -971,24 +1086,73 @@ fn build_file_hash_entries( /// miss transitively-required nodes 
(e.g. a call site whose receiver type /// is declared in a file that isn't a direct import target). /// -/// Full builds always load every node — there is no smaller set anyway. -fn build_and_insert_call_edges( +/// Constant list of builtin JS receivers excluded from method-resolution +/// (callers of `console.log` etc. shouldn't get linked to a user-defined +/// `log` somewhere else). Mirrors `BUILTIN_RECEIVERS` in `build-edges.ts`. +fn builtin_call_receivers() -> Vec { + [ + "console", "Math", "JSON", "Object", "Array", "String", "Number", + "Boolean", "Date", "RegExp", "Map", "Set", "WeakMap", "WeakSet", + "Promise", "Symbol", "Error", "TypeError", "RangeError", "Proxy", + "Reflect", "Intl", "globalThis", "window", "document", "process", + "Buffer", "require", + ] + .into_iter() + .map(String::from) + .collect() +} + +const EDGE_NODE_KIND_FILTER: &str = "kind IN ('function','method','class','interface','struct','type','module','enum','trait','record','constant')"; + +/// For the scoped (incremental, small-batch) path of the edge builder, +/// compute the set of files that must be loaded: changed/reverse-dep files +/// plus their direct import targets plus barrel-only files plus the +/// ultimate definition files barrel chains resolve to. Mirrors the JS +/// `relevantFiles` accumulation in `loadNodes` (#976, greptile P1). 
+fn compute_edge_relevant_files( + file_symbols: &HashMap, + import_ctx: &crate::import_edges::ImportEdgeContext, +) -> HashSet { + let mut relevant_files: HashSet = file_symbols.keys().cloned().collect(); + for (rel_path, symbols) in file_symbols { + let abs_file = Path::new(&import_ctx.root_dir).join(rel_path); + let abs_str = abs_file.to_str().unwrap_or(""); + for imp in &symbols.imports { + let resolved = import_ctx.get_resolved(abs_str, &imp.source); + if resolved.is_empty() { + continue; + } + relevant_files.insert(resolved.clone()); + if import_ctx.is_barrel_file(&resolved) { + for name in &imp.names { + let clean_name = name.strip_prefix("* as ").unwrap_or(name); + let mut visited = HashSet::new(); + if let Some(ultimate) = + import_ctx.resolve_barrel_export(&resolved, clean_name, &mut visited) + { + relevant_files.insert(ultimate); + } + } + } + } + } + for barrel_path in &import_ctx.barrel_only_files { + relevant_files.insert(barrel_path.clone()); + } + relevant_files +} + +/// Load all candidate edge nodes either scoped via a temp _edge_files table +/// (incremental small-batch) or globally (full build). Returns a flat +/// `Vec` suitable for the native edge builder. 
+fn load_edge_node_set( conn: &Connection, file_symbols: &HashMap, - import_ctx: &ImportEdgeContext, + import_ctx: &crate::import_edges::ImportEdgeContext, is_incremental: bool, -) { - use crate::edge_builder::*; +) -> Vec { + use crate::edge_builder::NodeInfo; - let node_kind_filter = "kind IN ('function','method','class','interface','struct','type','module','enum','trait','record','constant')"; - - // Gate parity with `loadNodes` in `src/domain/graph/builder/stages/build-edges.ts`: - // isFullBuild = false - // && fileSymbols.size <= smallFilesThreshold (5) - // && existingFileCount > FAST_PATH_MIN_EXISTING_FILES (20) - // Small fixtures skip the scoped path entirely — the savings are - // negligible at that scale and the scoped set can miss nodes that the - // edge builder needs for receiver-type resolution (#976). let existing_file_count: i64 = conn .query_row( "SELECT COUNT(*) FROM nodes WHERE kind = 'file'", @@ -1000,174 +1164,156 @@ fn build_and_insert_call_edges( && file_symbols.len() <= crate::constants::FAST_PATH_MAX_CHANGED_FILES && existing_file_count > crate::constants::FAST_PATH_MIN_EXISTING_FILES; - let all_nodes: Vec = if scope_eligible { - // Build the scoped set: changed/reverse-dep files + their resolved - // import targets + any barrel files on the path + the **ultimate** - // source files that barrel chains resolve to. The FileEdgeInput - // construction below (see `imported_names` at ~L1035) rewrites - // `target_file` to the ultimate definition file via - // `resolve_barrel_export`; if that file isn't in `relevant_files` - // the edge builder's `nodes_by_name_and_file` lookup returns - // nothing and the call edge is silently dropped (greptile P1). 
- let mut relevant_files: HashSet = file_symbols.keys().cloned().collect(); - for (rel_path, symbols) in file_symbols { - let abs_file = Path::new(&import_ctx.root_dir).join(rel_path); - let abs_str = abs_file.to_str().unwrap_or(""); - for imp in &symbols.imports { - let resolved = import_ctx.get_resolved(abs_str, &imp.source); - if resolved.is_empty() { - continue; - } - relevant_files.insert(resolved.clone()); - // If the resolved target is a barrel, walk the re-export - // chain and add every ultimate definition file that a - // named import could resolve to. - if import_ctx.is_barrel_file(&resolved) { - for name in &imp.names { - let clean_name = name.strip_prefix("* as ").unwrap_or(name); - let mut visited = HashSet::new(); - if let Some(ultimate) = import_ctx.resolve_barrel_export( - &resolved, - clean_name, - &mut visited, - ) { - relevant_files.insert(ultimate); - } - } - } - } + if !scope_eligible { + return load_all_edge_nodes(conn); + } + + let relevant_files = compute_edge_relevant_files(file_symbols, import_ctx); + if relevant_files.is_empty() { + return Vec::new(); + } + + let _ = conn.execute_batch( + "CREATE TEMP TABLE IF NOT EXISTS _edge_files (file TEXT NOT NULL);\n CREATE INDEX IF NOT EXISTS _edge_files_file_idx ON _edge_files (file);", + ); + let _ = conn.execute("DELETE FROM temp._edge_files", []); + { + let mut ins = match conn.prepare("INSERT INTO temp._edge_files (file) VALUES (?1)") { + Ok(s) => s, + Err(_) => return Vec::new(), + }; + for f in &relevant_files { + let _ = ins.execute(rusqlite::params![f]); } - for barrel_path in &import_ctx.barrel_only_files { - relevant_files.insert(barrel_path.clone()); + } + + let sql = format!( + "SELECT n.id, n.name, n.kind, n.file, n.line FROM nodes n \ + INNER JOIN temp._edge_files ef ON n.file = ef.file \ + WHERE n.{EDGE_NODE_KIND_FILTER}", + ); + let nodes: Vec = match conn.prepare(&sql) { + Ok(mut stmt) => stmt + .query_map([], read_edge_node_info) + .map(|rows| rows.filter_map(|r| 
r.ok()).collect()) + .unwrap_or_default(), + Err(_) => Vec::new(), + }; + let _ = conn.execute("DROP TABLE IF EXISTS temp._edge_files", []); + nodes +} + +/// Load every candidate edge node from the DB (full-build path). +fn load_all_edge_nodes(conn: &Connection) -> Vec { + let sql = format!( + "SELECT id, name, kind, file, line FROM nodes WHERE {EDGE_NODE_KIND_FILTER}", + ); + match conn.prepare(&sql) { + Ok(mut stmt) => stmt + .query_map([], read_edge_node_info) + .map(|rows| rows.filter_map(|r| r.ok()).collect()) + .unwrap_or_default(), + Err(_) => Vec::new(), + } +} + +/// Row-mapper for the `SELECT id, name, kind, file, line FROM nodes ...` +/// shape used by both scoped and full edge-node loads. +fn read_edge_node_info(row: &rusqlite::Row) -> rusqlite::Result { + Ok(crate::edge_builder::NodeInfo { + id: row.get::<_, i64>(0)? as u32, + name: row.get(1)?, + kind: row.get(2)?, + file: row.get(3)?, + line: row.get::<_, i64>(4)? as u32, + }) +} + +/// Load all `file`-kind node IDs into a flat map (one query instead of one +/// per file). The `name = file` guard avoids accidentally overwriting the +/// map entry when an unrelated row happens to share the file path (#1028). +fn load_file_node_id_map(conn: &Connection) -> HashMap { + let mut map = HashMap::new(); + if let Ok(mut stmt) = conn.prepare( + "SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file", + ) { + if let Ok(rows) = + stmt.query_map([], |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32))) + { + for r in rows.flatten() { + map.insert(r.0, r.1); + } } + } + map +} - if relevant_files.is_empty() { - Vec::new() - } else { - // Schema qualification matches the existing `_analysis_files` - // pattern below: unqualified CREATE (temp schema is the - // default for TEMP tables), qualified `temp.` for every - // subsequent op. Index the file column so the INNER JOIN is - // a lookup rather than a table scan (greptile P2). 
- let _ = conn.execute_batch( - "CREATE TEMP TABLE IF NOT EXISTS _edge_files (file TEXT NOT NULL);\n CREATE INDEX IF NOT EXISTS _edge_files_file_idx ON _edge_files (file);", - ); - let _ = conn.execute("DELETE FROM temp._edge_files", []); - { - let mut ins = - match conn.prepare("INSERT INTO temp._edge_files (file) VALUES (?1)") { - Ok(s) => s, - Err(_) => return, - }; - for f in &relevant_files { - let _ = ins.execute(rusqlite::params![f]); +/// Resolve a file's imports to the list of `ImportedName` entries the edge +/// builder consumes. Walks barrel chains to the ultimate definition file so +/// the edge builder's name-lookup can find the right target (#976 P1). +fn collect_imported_names_for_file( + abs_str: &str, + symbols: &FileSymbols, + import_ctx: &crate::import_edges::ImportEdgeContext, +) -> Vec { + use crate::edge_builder::ImportedName; + let mut imported_names: Vec = Vec::new(); + for imp in &symbols.imports { + let resolved_path = import_ctx.get_resolved(abs_str, &imp.source); + for name in &imp.names { + let clean_name = name.strip_prefix("* as ").unwrap_or(name).to_string(); + let mut target_file = resolved_path.clone(); + if import_ctx.is_barrel_file(&resolved_path) { + let mut visited = HashSet::new(); + if let Some(actual) = + import_ctx.resolve_barrel_export(&resolved_path, &clean_name, &mut visited) + { + target_file = actual; } } - - let sql = format!( - "SELECT n.id, n.name, n.kind, n.file, n.line FROM nodes n \ - INNER JOIN temp._edge_files ef ON n.file = ef.file \ - WHERE n.{node_kind_filter}", - ); - let nodes: Vec = match conn.prepare(&sql) { - Ok(mut stmt) => stmt - .query_map([], |row| { - Ok(NodeInfo { - id: row.get::<_, i64>(0)? as u32, - name: row.get(1)?, - kind: row.get(2)?, - file: row.get(3)?, - line: row.get::<_, i64>(4)? 
as u32, - }) - }) - .map(|rows| rows.filter_map(|r| r.ok()).collect()) - .unwrap_or_default(), - Err(_) => Vec::new(), - }; - let _ = conn.execute("DROP TABLE IF EXISTS temp._edge_files", []); - nodes - } - } else { - let sql = format!("SELECT id, name, kind, file, line FROM nodes WHERE {node_kind_filter}"); - match conn.prepare(&sql) { - Ok(mut stmt) => stmt - .query_map([], |row| { - Ok(NodeInfo { - id: row.get::<_, i64>(0)? as u32, - name: row.get(1)?, - kind: row.get(2)?, - file: row.get(3)?, - line: row.get::<_, i64>(4)? as u32, - }) - }) - .map(|rows| rows.filter_map(|r| r.ok()).collect()) - .unwrap_or_default(), - Err(_) => Vec::new(), + imported_names.push(ImportedName { + name: clean_name, + file: target_file, + }); } - }; + } + imported_names +} + +/// Insert the edges produced by the native edge builder into the edges table. +fn insert_call_edge_rows(conn: &Connection, edges: &[crate::edge_builder::ComputedEdge]) { + if edges.is_empty() { + return; + } + let edge_rows: Vec = edges + .iter() + .map(|e| crate::edges_db::EdgeRow { + source_id: e.source_id, + target_id: e.target_id, + kind: e.kind.clone(), + confidence: e.confidence, + dynamic: e.dynamic, + }) + .collect(); + let _ = crate::edges_db::do_insert_edges(conn, &edge_rows); +} + +/// Full builds always load every node — there is no smaller set anyway. 
+fn build_and_insert_call_edges( + conn: &Connection, + file_symbols: &HashMap, + import_ctx: &ImportEdgeContext, + is_incremental: bool, +) { + use crate::edge_builder::*; + let all_nodes = load_edge_node_set(conn, file_symbols, import_ctx, is_incremental); if all_nodes.is_empty() { return; } - let builtin_receivers: Vec = vec![ - "console", - "Math", - "JSON", - "Object", - "Array", - "String", - "Number", - "Boolean", - "Date", - "RegExp", - "Map", - "Set", - "WeakMap", - "WeakSet", - "Promise", - "Symbol", - "Error", - "TypeError", - "RangeError", - "Proxy", - "Reflect", - "Intl", - "globalThis", - "window", - "document", - "process", - "Buffer", - "require", - ] - .into_iter() - .map(String::from) - .collect(); - - // Pre-load every file node ID into a HashMap with one query, replacing - // the per-file `query_row` cycle that paid a fresh sqlite3_prepare for - // each entry in `file_symbols` (#1013). - // - // The `name = file` predicate matches the legacy per-row lookup - // (`WHERE name = ? AND file = ?` with both binds set to `rel_path`). - // For file-kind nodes `name` and `file` are conventionally identical, - // but keeping the guard prevents an unrelated row from silently - // overwriting the map entry for `file` (#1028 review). - let file_node_ids: HashMap = { - let mut map = HashMap::new(); - if let Ok(mut stmt) = conn.prepare( - "SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file", - ) { - if let Ok(rows) = stmt.query_map([], |row| { - Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? 
as u32)) - }) { - for r in rows.flatten() { - map.insert(r.0, r.1); - } - } - } - map - }; + let builtin_receivers = builtin_call_receivers(); + let file_node_ids = load_file_node_id_map(conn); // Build FileEdgeInput entries for the native edge builder let mut file_entries: Vec = Vec::new(); @@ -1175,35 +1321,14 @@ fn build_and_insert_call_edges( if import_ctx.barrel_only_files.contains(rel_path) { continue; } - let file_node_id: u32 = match file_node_ids.get(rel_path) { Some(&id) => id, None => continue, }; - // Build imported names from resolved imports - let mut imported_names: Vec = Vec::new(); let abs_file = Path::new(&import_ctx.root_dir).join(rel_path); let abs_str = abs_file.to_str().unwrap_or(""); - for imp in &symbols.imports { - let resolved_path = import_ctx.get_resolved(abs_str, &imp.source); - for name in &imp.names { - let clean_name = name.strip_prefix("* as ").unwrap_or(name).to_string(); - let mut target_file = resolved_path.clone(); - if import_ctx.is_barrel_file(&resolved_path) { - let mut visited = HashSet::new(); - if let Some(actual) = - import_ctx.resolve_barrel_export(&resolved_path, &clean_name, &mut visited) - { - target_file = actual; - } - } - imported_names.push(ImportedName { - name: clean_name, - file: target_file, - }); - } - } + let imported_names = collect_imported_names_for_file(abs_str, symbols, import_ctx); let type_map: Vec = symbols .type_map @@ -1217,7 +1342,7 @@ fn build_and_insert_call_edges( file_entries.push(FileEdgeInput { file: rel_path.clone(), - file_node_id: file_node_id, + file_node_id, definitions: symbols .definitions .iter() @@ -1252,23 +1377,8 @@ fn build_and_insert_call_edges( }); } - // Call the native edge builder let computed_edges = build_call_edges(file_entries, all_nodes, builtin_receivers); - - // Insert edges - if !computed_edges.is_empty() { - let edge_rows: Vec = computed_edges - .iter() - .map(|e| crate::edges_db::EdgeRow { - source_id: e.source_id, - target_id: e.target_id, - kind: e.kind.clone(), 
- confidence: e.confidence, - dynamic: e.dynamic, - }) - .collect(); - let _ = crate::edges_db::do_insert_edges(conn, &edge_rows); - } + insert_call_edge_rows(conn, &computed_edges); } // ── Analysis persistence helpers ───────────────────────────────────────── diff --git a/crates/codegraph-core/src/edge_builder.rs b/crates/codegraph-core/src/edge_builder.rs index 7fb3beac6..2d0499f73 100644 --- a/crates/codegraph-core/src/edge_builder.rs +++ b/crates/codegraph-core/src/edge_builder.rs @@ -549,120 +549,161 @@ pub fn build_import_edges( ); let mut edges = Vec::new(); - + let normalized_root = root_dir.replace('\\', "/"); for file_input in &files { - let abs_file = format!("{}/{}", root_dir.replace('\\', "/"), file_input.file); - + let abs_file = format!("{normalized_root}/{}", file_input.file); for imp in &file_input.imports { - // Barrel-only files: only emit reexport edges - if file_input.is_barrel_only && !imp.reexport { - continue; - } + process_single_import(&mut edges, file_input, imp, &abs_file, &ctx); + } + } + edges +} + +// ── build_import_edges helpers ────────────────────────────────────────── + +/// Strip a `"* as "` / `"*\tas "` prefix from an import name so the bare +/// symbol can be looked up against the target's exports. JS equivalent: +/// `name.replace(/^\*\s+as\s+/, '')`. +fn strip_star_as_prefix(name: &str) -> &str { + if name.starts_with("* as ") || name.starts_with("*\tas ") { + &name[5..] 
+ } else { + name + } +} - // Look up resolved path - let resolve_key = format!("{}|{}", abs_file, imp.source); - let resolved_path = match ctx.resolved.get(resolve_key.as_str()) { - Some(p) => *p, - None => continue, - }; - - // Look up target file node ID - let target_node_id = match ctx.file_node_map.get(resolved_path) { - Some(id) => *id, - None => continue, - }; - - // Determine edge kind - let edge_kind = if imp.reexport { - "reexports" - } else if imp.type_only { - "imports-type" - } else if imp.dynamic_import { - "dynamic-imports" - } else { - "imports" - }; +/// Classify an import into its edge kind: reexports / imports-type / +/// dynamic-imports / imports. Mirrors the JS classifier in `build-edges.ts`. +fn classify_import_edge_kind(imp: &ImportInfo) -> &'static str { + if imp.reexport { + "reexports" + } else if imp.type_only { + "imports-type" + } else if imp.dynamic_import { + "dynamic-imports" + } else { + "imports" + } +} +/// For a `type` import targeting a barrel or resolved file, emit one +/// symbol-level `imports-type` edge per named symbol so the target symbols +/// receive fan-in credit and aren't misclassified as dead code. 
+fn emit_type_only_symbol_edges( + edges: &mut Vec, + file_input: &ImportEdgeFileInput, + imp: &ImportInfo, + resolved_path: &str, + ctx: &ImportEdgeContext, +) { + if !imp.type_only || ctx.symbol_node_map.is_empty() { + return; + } + for name in &imp.names { + let clean_name = strip_star_as_prefix(name); + let barrel_target = if ctx.barrel_set.contains(resolved_path) { + let mut visited = HashSet::new(); + barrel_resolution::resolve_barrel_export(ctx, resolved_path, clean_name, &mut visited) + } else { + None + }; + let sym_id = barrel_target + .as_deref() + .and_then(|f| ctx.symbol_node_map.get(&(clean_name, f))) + .or_else(|| ctx.symbol_node_map.get(&(clean_name, resolved_path))); + if let Some(&id) = sym_id { edges.push(ComputedEdge { source_id: file_input.file_node_id, - target_id: target_node_id, - kind: edge_kind.to_string(), + target_id: id, + kind: "imports-type".to_string(), confidence: 1.0, dynamic: 0, }); + } + } +} - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. - if imp.type_only && !ctx.symbol_node_map.is_empty() { - for name in &imp.names { - let clean_name = if name.starts_with("* as ") || name.starts_with("*\tas ") { - &name[5..] 
- } else { - name.as_str() - }; - // Try barrel resolution first, then fall back to the resolved path - let barrel_target = if ctx.barrel_set.contains(resolved_path) { - let mut visited = HashSet::new(); - barrel_resolution::resolve_barrel_export(&ctx, resolved_path, clean_name, &mut visited) - } else { - None - }; - let sym_id = barrel_target - .as_deref() - .and_then(|f| ctx.symbol_node_map.get(&(clean_name, f))) - .or_else(|| ctx.symbol_node_map.get(&(clean_name, resolved_path))); - if let Some(&id) = sym_id { - edges.push(ComputedEdge { - source_id: file_input.file_node_id, - target_id: id, - kind: "imports-type".to_string(), - confidence: 1.0, - dynamic: 0, - }); - } - } - } - - // Barrel resolution: if not reexport and target is a barrel file - if !imp.reexport && ctx.barrel_set.contains(resolved_path) { - let mut resolved_sources: HashSet = HashSet::new(); - for name in &imp.names { - let clean_name = if name.starts_with("* as ") || name.starts_with("*\tas ") { - // Strip "* as " or "*\tas " prefix (both exactly 5 bytes) - // JS equivalent: name.replace(/^\*\s+as\s+/, '') - &name[5..] 
- } else { - name.as_str() - }; - - let mut visited = HashSet::new(); - let actual = barrel_resolution::resolve_barrel_export(&ctx, resolved_path, clean_name, &mut visited); - - if let Some(actual_source) = actual { - if actual_source != resolved_path && !resolved_sources.contains(&actual_source) { - if let Some(&actual_node_id) = ctx.file_node_map.get(actual_source.as_str()) { - let barrel_kind = match edge_kind { - "imports-type" => "imports-type", - "dynamic-imports" => "dynamic-imports", - _ => "imports", - }; - edges.push(ComputedEdge { - source_id: file_input.file_node_id, - target_id: actual_node_id, - kind: barrel_kind.to_string(), - confidence: 0.9, - dynamic: 0, - }); - } - resolved_sources.insert(actual_source); - } - } - } - } +/// For a non-reexport import targeting a barrel file, walk the barrel +/// chain for each named symbol and emit a barrel-through edge to the +/// ultimate definition file. Deduplicates target files via +/// `resolved_sources`. +fn emit_barrel_through_edges( + edges: &mut Vec, + file_input: &ImportEdgeFileInput, + imp: &ImportInfo, + resolved_path: &str, + edge_kind: &str, + ctx: &ImportEdgeContext, +) { + if imp.reexport || !ctx.barrel_set.contains(resolved_path) { + return; + } + let barrel_kind = match edge_kind { + "imports-type" => "imports-type", + "dynamic-imports" => "dynamic-imports", + _ => "imports", + }; + let mut resolved_sources: HashSet = HashSet::new(); + for name in &imp.names { + let clean_name = strip_star_as_prefix(name); + let mut visited = HashSet::new(); + let actual = barrel_resolution::resolve_barrel_export( + ctx, + resolved_path, + clean_name, + &mut visited, + ); + let actual_source = match actual { + Some(s) => s, + None => continue, + }; + if actual_source == resolved_path || resolved_sources.contains(&actual_source) { + continue; + } + if let Some(&actual_node_id) = ctx.file_node_map.get(actual_source.as_str()) { + edges.push(ComputedEdge { + source_id: file_input.file_node_id, + target_id: 
actual_node_id, + kind: barrel_kind.to_string(), + confidence: 0.9, + dynamic: 0, + }); } + resolved_sources.insert(actual_source); } +} - edges +/// Process a single import from a file, emitting the primary file-to-file +/// edge plus any type-symbol and barrel-through edges. +fn process_single_import( + edges: &mut Vec, + file_input: &ImportEdgeFileInput, + imp: &ImportInfo, + abs_file: &str, + ctx: &ImportEdgeContext, +) { + if file_input.is_barrel_only && !imp.reexport { + return; + } + let resolve_key = format!("{abs_file}|{}", imp.source); + let resolved_path = match ctx.resolved.get(resolve_key.as_str()) { + Some(p) => *p, + None => return, + }; + let target_node_id = match ctx.file_node_map.get(resolved_path) { + Some(id) => *id, + None => return, + }; + let edge_kind = classify_import_edge_kind(imp); + edges.push(ComputedEdge { + source_id: file_input.file_node_id, + target_id: target_node_id, + kind: edge_kind.to_string(), + confidence: 1.0, + dynamic: 0, + }); + emit_type_only_symbol_edges(edges, file_input, imp, resolved_path, ctx); + emit_barrel_through_edges(edges, file_input, imp, resolved_path, edge_kind, ctx); } #[cfg(test)] diff --git a/crates/codegraph-core/src/graph_algorithms.rs b/crates/codegraph-core/src/graph_algorithms.rs index a30c269ff..4d08a4d67 100644 --- a/crates/codegraph-core/src/graph_algorithms.rs +++ b/crates/codegraph-core/src/graph_algorithms.rs @@ -70,6 +70,58 @@ impl<'a> DirectedGraph<'a> { } } +// ─── Traversal helpers ─────────────────────────────────────────────── + +/// Pick the neighbor set used by `bfs_traversal` for the requested direction. +/// "backward" → predecessors, "both" → predecessors + successors, +/// anything else → successors. Mirrors the JS direction enum. 
+fn bfs_neighbors_for_direction<'a>( + graph: &'a DirectedGraph<'a>, + current: &str, + direction: &str, +) -> Vec<&'a str> { + match direction { + "backward" => graph + .predecessors + .get(current) + .map(|v| v.as_slice()) + .unwrap_or(&[]) + .to_vec(), + "both" => { + let mut all: Vec<&str> = Vec::new(); + if let Some(succ) = graph.successors.get(current) { + all.extend(succ.iter()); + } + if let Some(pred) = graph.predecessors.get(current) { + all.extend(pred.iter()); + } + all + } + _ => graph + .successors + .get(current) + .map(|v| v.as_slice()) + .unwrap_or(&[]) + .to_vec(), + } +} + +/// Walk the parent pointers produced by a BFS back from `terminal` to the +/// start node and return the path as a `Vec` (start → terminal). +fn reconstruct_bfs_path<'a>( + parent: &HashMap<&'a str, Option<&'a str>>, + terminal: &'a str, +) -> Vec { + let mut path: Vec = Vec::new(); + let mut node: Option<&str> = Some(terminal); + while let Some(n) = node { + path.push(n.to_string()); + node = parent.get(n).copied().flatten(); + } + path.reverse(); + path +} + // ─── BFS ───────────────────────────────────────────────────────────── /// BFS traversal on a directed graph built from edges. 
@@ -102,33 +154,7 @@ pub fn bfs_traversal( if depth >= max_depth { continue; } - - let neighbors: Vec<&str> = match dir { - "backward" => graph - .predecessors - .get(current) - .map(|v| v.as_slice()) - .unwrap_or(&[]) - .to_vec(), - "both" => { - let mut all: Vec<&str> = Vec::new(); - if let Some(succ) = graph.successors.get(current) { - all.extend(succ.iter()); - } - if let Some(pred) = graph.predecessors.get(current) { - all.extend(pred.iter()); - } - all - } - _ => graph - .successors - .get(current) - .map(|v| v.as_slice()) - .unwrap_or(&[]) - .to_vec(), - }; - - for n in neighbors { + for n in bfs_neighbors_for_direction(&graph, current, dir) { if !depths.contains_key(n) { depths.insert(n, depth + 1); queue.push_back(n); @@ -166,24 +192,19 @@ pub fn shortest_path(edges: Vec, from_id: String, to_id: String) -> V queue.push_back(from_id.as_str()); while let Some(current) = queue.pop_front() { - if let Some(neighbors) = graph.successors.get(current) { - for &neighbor in neighbors { - if parent.contains_key(neighbor) { - continue; - } - parent.insert(neighbor, Some(current)); - if neighbor == to_id.as_str() { - let mut path: Vec = Vec::new(); - let mut node: Option<&str> = Some(neighbor); - while let Some(n) = node { - path.push(n.to_string()); - node = parent.get(n).copied().flatten(); - } - path.reverse(); - return path; - } - queue.push_back(neighbor); + let neighbors = match graph.successors.get(current) { + Some(n) => n, + None => continue, + }; + for &neighbor in neighbors { + if parent.contains_key(neighbor) { + continue; + } + parent.insert(neighbor, Some(current)); + if neighbor == to_id.as_str() { + return reconstruct_bfs_path(&parent, neighbor); } + queue.push_back(neighbor); } } diff --git a/crates/codegraph-core/src/import_edges.rs b/crates/codegraph-core/src/import_edges.rs index 458476923..f000a808c 100644 --- a/crates/codegraph-core/src/import_edges.rs +++ b/crates/codegraph-core/src/import_edges.rs @@ -276,16 +276,144 @@ fn 
collect_type_only_lookup_pairs(ctx: &ImportEdgeContext) -> HashSet<(String, S /// - `reexports` for re-exports /// /// Also creates barrel-through edges (confidence 0.9) for imports targeting barrel files. +/// Classify an `ImportInfo` into the edge kind name used in the edges +/// table: reexports / imports-type / dynamic-imports / imports. +fn classify_import_kind(imp: &crate::types::Import) -> &'static str { + if imp.reexport.unwrap_or(false) { + "reexports" + } else if imp.type_only.unwrap_or(false) { + "imports-type" + } else if imp.dynamic_import.unwrap_or(false) { + "dynamic-imports" + } else { + "imports" + } +} + +/// For a `type` import, emit one symbol-level `imports-type` edge per name +/// so the target symbols receive fan-in credit and aren't classified dead. +fn emit_type_only_symbol_rows( + edges: &mut Vec, + file_node_id: i64, + imp: &crate::types::Import, + resolved_path: &str, + ctx: &ImportEdgeContext, + symbol_node_ids: &HashMap<(String, String), i64>, +) { + if !imp.type_only.unwrap_or(false) { + return; + } + for name in &imp.names { + let clean_name = name.strip_prefix("* as ").unwrap_or(name); + let mut target_file = resolved_path.to_string(); + if ctx.is_barrel_file(resolved_path) { + let mut visited = HashSet::new(); + if let Some(actual) = + ctx.resolve_barrel_export(resolved_path, clean_name, &mut visited) + { + target_file = actual; + } + } + if let Some(&sym_id) = symbol_node_ids.get(&(clean_name.to_string(), target_file)) { + edges.push(EdgeRow { + source_id: file_node_id, + target_id: sym_id, + kind: "imports-type".to_string(), + confidence: 1.0, + dynamic: 0, + }); + } + } +} + +/// For a non-reexport import targeting a barrel file, emit `imports`-like +/// edges to each ultimate definition file reached through the barrel chain. 
+fn emit_barrel_through_rows( + edges: &mut Vec, + file_node_id: i64, + imp: &crate::types::Import, + resolved_path: &str, + edge_kind: &str, + ctx: &ImportEdgeContext, + file_node_ids: &HashMap, +) { + let is_reexport = imp.reexport.unwrap_or(false); + if is_reexport || !ctx.is_barrel_file(resolved_path) { + return; + } + let through_kind = match edge_kind { + "imports-type" => "imports-type", + "dynamic-imports" => "dynamic-imports", + _ => "imports", + }; + let mut resolved_sources: HashSet = HashSet::new(); + for name in &imp.names { + let clean_name = name.strip_prefix("* as ").unwrap_or(name); + let mut visited = HashSet::new(); + let actual_source = + match ctx.resolve_barrel_export(resolved_path, clean_name, &mut visited) { + Some(s) => s, + None => continue, + }; + if actual_source == resolved_path || !resolved_sources.insert(actual_source.clone()) { + continue; + } + if let Some(&actual_id) = file_node_ids.get(&actual_source) { + edges.push(EdgeRow { + source_id: file_node_id, + target_id: actual_id, + kind: through_kind.to_string(), + confidence: 0.9, + dynamic: 0, + }); + } + } +} + +/// Emit all edges produced by a single import on a single source file. 
+fn emit_edges_for_import( + edges: &mut Vec, + file_node_id: i64, + abs_str: &str, + imp: &crate::types::Import, + is_barrel_only: bool, + ctx: &ImportEdgeContext, + file_node_ids: &HashMap, + symbol_node_ids: &HashMap<(String, String), i64>, +) { + let is_reexport = imp.reexport.unwrap_or(false); + if is_barrel_only && !is_reexport { + return; + } + let resolved_path = ctx.get_resolved(abs_str, &imp.source); + let target_id = match file_node_ids.get(&resolved_path) { + Some(&id) => id, + None => return, + }; + let edge_kind = classify_import_kind(imp); + edges.push(EdgeRow { + source_id: file_node_id, + target_id, + kind: edge_kind.to_string(), + confidence: 1.0, + dynamic: 0, + }); + emit_type_only_symbol_rows(edges, file_node_id, imp, &resolved_path, ctx, symbol_node_ids); + emit_barrel_through_rows( + edges, + file_node_id, + imp, + &resolved_path, + edge_kind, + ctx, + file_node_ids, + ); +} + pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec { let mut edges = Vec::new(); - // Pre-load all file node IDs once. Previously this was N x query_row, - // each of which ran a fresh sqlite3_prepare/step/finalize cycle (#1013). let file_node_ids = load_file_node_ids(conn); - // Only the symbols actually referenced by type-only imports are needed — - // skip the lookup entirely when no type-only imports exist (the common - // case), and otherwise issue a chunked `(name, file) IN (...)` query so - // memory stays bounded even on large monorepos (#1028 review). 
let needed_symbol_pairs = collect_type_only_lookup_pairs(ctx); let symbol_node_ids = if needed_symbol_pairs.is_empty() { HashMap::new() @@ -304,92 +432,16 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec id, - None => continue, - }; - - let edge_kind = if is_reexport { - "reexports" - } else if imp.type_only.unwrap_or(false) { - "imports-type" - } else if imp.dynamic_import.unwrap_or(false) { - "dynamic-imports" - } else { - "imports" - }; - - edges.push(EdgeRow { - source_id: file_node_id, - target_id, - kind: edge_kind.to_string(), - confidence: 1.0, - dynamic: 0, - }); - - // Type-only imports: create symbol-level edges so the target symbols - // get fan-in credit and aren't falsely classified as dead code. - if imp.type_only.unwrap_or(false) { - for name in &imp.names { - let clean_name = name.strip_prefix("* as ").unwrap_or(name); - let mut target_file = resolved_path.clone(); - if ctx.is_barrel_file(&resolved_path) { - let mut visited = HashSet::new(); - if let Some(actual) = ctx.resolve_barrel_export(&resolved_path, clean_name, &mut visited) { - target_file = actual; - } - } - if let Some(&sym_id) = - symbol_node_ids.get(&(clean_name.to_string(), target_file)) - { - edges.push(EdgeRow { - source_id: file_node_id, - target_id: sym_id, - kind: "imports-type".to_string(), - confidence: 1.0, - dynamic: 0, - }); - } - } - } - - // Build barrel-through edges if the target is a barrel file - if !is_reexport && ctx.is_barrel_file(&resolved_path) { - let mut resolved_sources = HashSet::new(); - for name in &imp.names { - let clean_name = name.strip_prefix("* as ").unwrap_or(name); - let mut visited = HashSet::new(); - if let Some(actual_source) = - ctx.resolve_barrel_export(&resolved_path, clean_name, &mut visited) - { - if actual_source != resolved_path - && resolved_sources.insert(actual_source.clone()) - { - if let Some(&actual_id) = file_node_ids.get(&actual_source) { - let through_kind = match edge_kind { - "imports-type" => 
"imports-type", - "dynamic-imports" => "dynamic-imports", - _ => "imports", - }; - edges.push(EdgeRow { - source_id: file_node_id, - target_id: actual_id, - kind: through_kind.to_string(), - confidence: 0.9, - dynamic: 0, - }); - } - } - } - } - } + emit_edges_for_import( + &mut edges, + file_node_id, + abs_str, + imp, + is_barrel_only, + ctx, + &file_node_ids, + &symbol_node_ids, + ); } } diff --git a/crates/codegraph-core/src/import_resolution.rs b/crates/codegraph-core/src/import_resolution.rs index 67d63137d..c701d9c4a 100644 --- a/crates/codegraph-core/src/import_resolution.rs +++ b/crates/codegraph-core/src/import_resolution.rs @@ -134,53 +134,59 @@ pub fn resolve_import_path( } /// Inner implementation with optional known_files cache. -fn resolve_import_path_inner( - from_file: &str, +/// Convert an absolute path candidate into a root-relative, normalized +/// path string. Used as the success exit of every probe in +/// `resolve_import_path_inner`. +fn relativize_to_root(candidate: &str, root_dir: &str) -> String { + let root = Path::new(root_dir); + if let Ok(rel) = Path::new(candidate).strip_prefix(root) { + normalize_path(&rel.display().to_string()) + } else { + normalize_path(candidate) + } +} + +/// Resolve a non-relative (alias or bare) import source. Returns the +/// resolved path or the raw source if no alias matches (bare specifier). 
+fn resolve_non_relative_import( import_source: &str, root_dir: &str, aliases: &PathAliases, known_files: Option<&HashSet>, ) -> String { - // Try alias resolution for non-relative imports - if !import_source.starts_with('.') { - if let Some(alias_resolved) = - resolve_via_alias(import_source, aliases, root_dir, known_files) - { - let root = Path::new(root_dir); - if let Ok(rel) = Path::new(&alias_resolved).strip_prefix(root) { - return normalize_path(&rel.display().to_string()); - } - return normalize_path(&alias_resolved); - } - // Bare specifier (e.g., "lodash") — return as-is - return import_source.to_string(); + if let Some(alias_resolved) = resolve_via_alias(import_source, aliases, root_dir, known_files) { + return relativize_to_root(&alias_resolved, root_dir); } + import_source.to_string() +} - // Relative import — normalize immediately to remove `.` / `..` segments - let dir = Path::new(from_file).parent().unwrap_or(Path::new("")); - let resolved = clean_path(&dir.join(import_source)); - let resolved_str = resolved.display().to_string().replace('\\', "/"); - - // .js → .ts remap - if resolved_str.ends_with(".js") { - let ts_candidate = resolved_str.replace(".js", ".ts"); - if file_exists(&ts_candidate, known_files, root_dir) { - let root = Path::new(root_dir); - if let Ok(rel) = Path::new(&ts_candidate).strip_prefix(root) { - return normalize_path(&rel.display().to_string()); - } - } - let tsx_candidate = resolved_str.replace(".js", ".tsx"); - if file_exists(&tsx_candidate, known_files, root_dir) { - let root = Path::new(root_dir); - if let Ok(rel) = Path::new(&tsx_candidate).strip_prefix(root) { - return normalize_path(&rel.display().to_string()); - } +/// Probe the `.js → .ts/.tsx` remap candidates and return the first +/// existing file's relative path, if any. 
+fn probe_js_to_ts_remap( + resolved_str: &str, + root_dir: &str, + known_files: Option<&HashSet>, +) -> Option { + if !resolved_str.ends_with(".js") { + return None; + } + for replacement in [".ts", ".tsx"] { + let candidate = resolved_str.replace(".js", replacement); + if file_exists(&candidate, known_files, root_dir) { + return Some(relativize_to_root(&candidate, root_dir)); } } + None +} - // Extension probing - let extensions = [ +/// Probe known extensions (TS/JS/Python plus index files) for an existing +/// match against the normalized relative path stem. +fn probe_known_extensions( + resolved_str: &str, + root_dir: &str, + known_files: Option<&HashSet>, +) -> Option { + const EXTENSIONS: &[&str] = &[ ".ts", ".tsx", ".js", @@ -193,31 +199,40 @@ fn resolve_import_path_inner( "/index.js", "/__init__.py", ]; - for ext in &extensions { - let candidate = format!("{}{}", resolved_str, ext); + for ext in EXTENSIONS { + let candidate = format!("{resolved_str}{ext}"); if file_exists(&candidate, known_files, root_dir) { - let root = Path::new(root_dir); - if let Ok(rel) = Path::new(&candidate).strip_prefix(root) { - return normalize_path(&rel.display().to_string()); - } + return Some(relativize_to_root(&candidate, root_dir)); } } + None +} - // Exact match - if file_exists(&resolved_str, known_files, root_dir) { - let root = Path::new(root_dir); - if let Ok(rel) = Path::new(&resolved_str).strip_prefix(root) { - return normalize_path(&rel.display().to_string()); - } +fn resolve_import_path_inner( + from_file: &str, + import_source: &str, + root_dir: &str, + aliases: &PathAliases, + known_files: Option<&HashSet>, +) -> String { + if !import_source.starts_with('.') { + return resolve_non_relative_import(import_source, root_dir, aliases, known_files); } - // Fallback: return relative path - let root = Path::new(root_dir); - if let Ok(rel) = resolved.strip_prefix(root) { - normalize_path(&rel.display().to_string()) - } else { - normalize_path(&resolved_str) + let dir = 
Path::new(from_file).parent().unwrap_or(Path::new("")); + let resolved = clean_path(&dir.join(import_source)); + let resolved_str = resolved.display().to_string().replace('\\', "/"); + + if let Some(p) = probe_js_to_ts_remap(&resolved_str, root_dir, known_files) { + return p; + } + if let Some(p) = probe_known_extensions(&resolved_str, root_dir, known_files) { + return p; + } + if file_exists(&resolved_str, known_files, root_dir) { + return relativize_to_root(&resolved_str, root_dir); } + relativize_to_root(&resolved.display().to_string().replace('\\', "/"), root_dir) } /// Compute proximity-based confidence for call resolution. diff --git a/crates/codegraph-core/src/read_queries.rs b/crates/codegraph-core/src/read_queries.rs index 405feacc2..11d03b2c1 100644 --- a/crates/codegraph-core/src/read_queries.rs +++ b/crates/codegraph-core/src/read_queries.rs @@ -112,6 +112,706 @@ const VALID_ROLES: &[&str] = &[ "dead-unresolved", ]; +// ── fn_deps internal types ────────────────────────────────────────────── + +/// Matched candidate node from the initial relevance ranking step of `fn_deps`. +struct FnDepsMatchedNode { + id: i32, + name: String, + kind: String, + file: String, + line: Option, + end_line: Option, + role: Option, + fan_in: i32, +} + +/// Caller node with id retained for BFS reuse. Differs from the public +/// `FnDepsCallerNode` which strips the id from the output. +struct FnDepsCallerWithId { + id: i32, + name: String, + kind: String, + file: String, + line: Option, + via_hierarchy: Option, +} + +// ── fn_deps helpers ───────────────────────────────────────────────────── + +/// Build the SQL + params for fn_deps' initial candidate-node lookup. 
+fn build_fn_deps_match_query( + name: &str, + kind: Option<&str>, + file: Option<&str>, +) -> (String, Vec>) { + let default_kinds: Vec = vec![ + "function".to_string(), + "method".to_string(), + "class".to_string(), + "constant".to_string(), + ]; + let kinds: Vec = match kind { + Some(k) => vec![k.to_string()], + None => default_kinds, + }; + + let mut sql = String::from( + "SELECT n.id, n.name, n.kind, n.file, n.line, n.end_line, n.role, \ + COALESCE(fi.cnt, 0) AS fan_in \ + FROM nodes n \ + LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi \ + ON fi.target_id = n.id \ + WHERE n.name LIKE ?1", + ); + let mut params_v: Vec> = vec![Box::new(format!("%{name}%"))]; + let mut idx = 2; + + if !kinds.is_empty() { + let placeholders: Vec = kinds + .iter() + .enumerate() + .map(|(i, _)| format!("?{}", idx + i)) + .collect(); + sql.push_str(&format!(" AND n.kind IN ({})", placeholders.join(", "))); + for k in &kinds { + params_v.push(Box::new(k.clone())); + } + idx += kinds.len(); + } + if let Some(f) = file { + sql.push_str(&format!(" AND n.file LIKE ?{idx} ESCAPE '\\'")); + params_v.push(Box::new(format!("%{}%", escape_like(f)))); + } + + (sql, params_v) +} + +/// Score a matched node by relevance to the user query. Mirrors the JS +/// `findMatchingNodes` ranking in `domain/queries.ts`. 
+fn fn_deps_relevance_score(node: &FnDepsMatchedNode, lower_query: &str) -> f64 { + let lower_name = node.name.to_lowercase(); + let bare_name = lower_name.rsplit('.').next().unwrap_or(&lower_name); + let match_score = if lower_name == lower_query || bare_name == lower_query { + 100.0 + } else if lower_name.starts_with(lower_query) || bare_name.starts_with(lower_query) { + 60.0 + } else if lower_name.contains(&format!(".{lower_query}")) + || lower_name.contains(&format!("{lower_query}.")) + { + 40.0 + } else { + 10.0 + }; + let fan_in_bonus = ((node.fan_in as f64 + 1.0).log2() * 5.0).min(25.0); + match_score + fan_in_bonus +} + +/// Fetch the direct callees of a node (other nodes called by `node_id`). +fn fetch_fn_deps_callees( + conn: &rusqlite::Connection, + node_id: i32, + no_tests: bool, +) -> napi::Result> { + let mut stmt = conn + .prepare_cached( + "SELECT DISTINCT n.id, n.name, n.kind, n.file, n.line \ + FROM edges e JOIN nodes n ON e.target_id = n.id \ + WHERE e.source_id = ?1 AND e.kind = 'calls'", + ) + .map_err(|e| napi::Error::from_reason(format!("fn_deps callees prepare: {e}")))?; + let rows = stmt + .query_map(params![node_id], |row| { + Ok(FnDepsNode { + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + }) + }) + .map_err(|e| napi::Error::from_reason(format!("fn_deps callees: {e}")))?; + let mut out: Vec = rows + .collect::, _>>() + .map_err(|e| napi::Error::from_reason(format!("fn_deps callees collect: {e}")))?; + if no_tests { + out.retain(|c| !is_test_file(&c.file)); + } + Ok(out) +} + +/// Fetch the direct callers of a node. Retains `id` for BFS reuse. 
+fn fetch_fn_deps_direct_callers( + conn: &rusqlite::Connection, + node_id: i32, +) -> napi::Result> { + let mut stmt = conn + .prepare_cached( + "SELECT n.id, n.name, n.kind, n.file, n.line \ + FROM edges e JOIN nodes n ON e.source_id = n.id \ + WHERE e.target_id = ?1 AND e.kind = 'calls'", + ) + .map_err(|e| napi::Error::from_reason(format!("fn_deps callers prepare: {e}")))?; + let rows = stmt + .query_map(params![node_id], |row| { + Ok(FnDepsCallerWithId { + id: row.get("id")?, + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + via_hierarchy: None, + }) + }) + .map_err(|e| napi::Error::from_reason(format!("fn_deps callers: {e}")))?; + rows.collect::, _>>() + .map_err(|e| napi::Error::from_reason(format!("fn_deps callers collect: {e}"))) +} + +/// For a method node `Cls.foo`, expand callers via method-hierarchy resolution: +/// other classes that also define a method named `foo` and the callers of those +/// hierarchy peers. Appends to the supplied `callers` vector. Mirrors the JS +/// hierarchy expansion in `domain/queries.ts::findMethodHierarchyCallers`. 
+fn expand_method_hierarchy_callers( + conn: &rusqlite::Connection, + node: &FnDepsMatchedNode, + callers: &mut Vec, +) -> napi::Result<()> { + if node.kind != "method" || !node.name.contains('.') { + return Ok(()); + } + let method_name = match node.name.split('.').last() { + Some(n) => n, + None => return Ok(()), + }; + let pattern = format!("%.{method_name}"); + let related: Vec<(i32, String)> = { + let mut stmt = conn + .prepare_cached( + "SELECT n.id, n.name FROM nodes n \ + LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi \ + ON fi.target_id = n.id \ + WHERE n.name LIKE ?1 AND n.kind = 'method'", + ) + .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy prepare: {e}")))?; + let rows = stmt + .query_map(params![pattern], |row| { + Ok((row.get::<_, i32>("id")?, row.get::<_, String>("name")?)) + }) + .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy: {e}")))?; + rows.collect::, _>>() + .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy collect: {e}")))? 
+ }; + for (rm_id, rm_name) in &related { + if *rm_id == node.id { + continue; + } + let mut stmt = conn + .prepare_cached( + "SELECT n.id, n.name, n.kind, n.file, n.line \ + FROM edges e JOIN nodes n ON e.source_id = n.id \ + WHERE e.target_id = ?1 AND e.kind = 'calls'", + ) + .map_err(|e| { + napi::Error::from_reason(format!("fn_deps hierarchy callers prepare: {e}")) + })?; + let rows = stmt + .query_map(params![rm_id], |row| { + Ok(FnDepsCallerWithId { + id: row.get("id")?, + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + via_hierarchy: Some(rm_name.clone()), + }) + }) + .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy callers: {e}")))?; + let extra: Vec = rows.collect::, _>>().map_err(|e| { + napi::Error::from_reason(format!("fn_deps hierarchy callers collect: {e}")) + })?; + callers.extend(extra); + } + Ok(()) +} + +/// BFS over caller chains starting from `initial_frontier` up to `depth` +/// hops. Returns transitive caller groups, one per depth level. Mirrors the +/// JS `bfsTransitiveCallers` helper in `domain/queries.ts`. 
+fn bfs_transitive_callers( + conn: &rusqlite::Connection, + node_id: i32, + initial_frontier: Vec, + depth: usize, + no_tests: bool, +) -> napi::Result> { + if depth <= 1 { + return Ok(Vec::new()); + } + let mut visited: HashSet = HashSet::new(); + visited.insert(node_id); + let mut frontier: Vec = initial_frontier; + let mut groups: Vec = Vec::new(); + + for d in 2..=depth { + let unvisited: Vec<&FnDepsCallerWithId> = + frontier.iter().filter(|f| !visited.contains(&f.id)).collect(); + for f in &unvisited { + visited.insert(f.id); + } + if unvisited.is_empty() { + break; + } + let mut next_frontier: Vec = Vec::new(); + let mut next_ids: HashSet = HashSet::new(); + for f in &unvisited { + let mut stmt = conn + .prepare_cached( + "SELECT n.id, n.name, n.kind, n.file, n.line \ + FROM edges e JOIN nodes n ON e.source_id = n.id \ + WHERE e.target_id = ?1 AND e.kind = 'calls'", + ) + .map_err(|e| napi::Error::from_reason(format!("fn_deps bfs prepare: {e}")))?; + let rows = stmt + .query_map(params![f.id], |row| { + Ok(FnDepsCallerWithId { + id: row.get("id")?, + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + via_hierarchy: None, + }) + }) + .map_err(|e| napi::Error::from_reason(format!("fn_deps bfs: {e}")))?; + let upstream: Vec = rows + .collect::, _>>() + .map_err(|e| napi::Error::from_reason(format!("fn_deps bfs collect: {e}")))?; + for u in upstream { + if no_tests && is_test_file(&u.file) { + continue; + } + if !visited.contains(&u.id) && !next_ids.contains(&u.id) { + next_ids.insert(u.id); + next_frontier.push(u); + } + } + } + if !next_frontier.is_empty() { + groups.push(FnDepsTransitiveGroup { + depth: d as i32, + callers: next_frontier + .iter() + .map(|n| FnDepsNode { + name: n.name.clone(), + kind: n.kind.clone(), + file: n.file.clone(), + line: n.line, + }) + .collect(), + }); + } + frontier = next_frontier; + } + Ok(groups) +} + +/// Cached file-hash lookup: probes `file_hashes` for `file` and 
memoizes the +/// result in `cache` so repeated lookups in the same `fn_deps` call avoid +/// redundant prepared-statement execution. +fn fn_deps_cached_file_hash( + conn: &rusqlite::Connection, + cache: &mut HashMap>, + file: &str, +) -> Option { + if let Some(v) = cache.get(file) { + return v.clone(); + } + let hash: Option = conn + .prepare_cached("SELECT hash FROM file_hashes WHERE file = ?1") + .ok() + .and_then(|mut stmt| stmt.query_row(params![file], |row| row.get(0)).ok()); + cache.insert(file.to_string(), hash.clone()); + hash +} + +// ── get_graph_stats helpers ───────────────────────────────────────────── + +fn fetch_nodes_by_kind( + conn: &rusqlite::Connection, + no_tests_filter: &str, +) -> napi::Result> { + let sql = format!( + "SELECT kind, COUNT(*) as c FROM nodes WHERE 1=1 {no_tests_filter} GROUP BY kind", + ); + let mut stmt = conn + .prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats nodes_by_kind: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok(KindCount { + kind: row.get::<_, String>(0)?, + count: row.get::<_, i32>(1)?, + }) + }) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats nodes_by_kind query: {e}")) + })?; + rows.collect::, _>>().map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats nodes_by_kind collect: {e}")) + }) +} + +fn fetch_edges_by_kind( + conn: &rusqlite::Connection, + no_tests: bool, +) -> napi::Result> { + let sql = if no_tests { + format!( + "SELECT e.kind, COUNT(*) as c FROM edges e \ + JOIN nodes ns ON e.source_id = ns.id \ + JOIN nodes nt ON e.target_id = nt.id \ + WHERE 1=1 {} {} GROUP BY e.kind", + test_filter_clauses("ns.file"), + test_filter_clauses("nt.file"), + ) + } else { + "SELECT kind, COUNT(*) as c FROM edges GROUP BY kind".to_string() + }; + let mut stmt = conn + .prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats edges_by_kind: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok(KindCount { + 
kind: row.get::<_, String>(0)?, + count: row.get::<_, i32>(1)?, + }) + }) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats edges_by_kind query: {e}")) + })?; + rows.collect::, _>>().map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats edges_by_kind collect: {e}")) + }) +} + +fn fetch_role_counts( + conn: &rusqlite::Connection, + no_tests_filter: &str, +) -> napi::Result> { + let sql = format!( + "SELECT role, COUNT(*) as c FROM nodes WHERE role IS NOT NULL {no_tests_filter} GROUP BY role", + ); + let mut stmt = conn + .prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats role_counts: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok(RoleCount { + role: row.get::<_, String>(0)?, + count: row.get::<_, i32>(1)?, + }) + }) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats role_counts query: {e}")))?; + rows.collect::, _>>().map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats role_counts collect: {e}")) + }) +} + +fn fetch_quality_metrics( + conn: &rusqlite::Connection, + tf_file: &str, + tf_n_file: &str, +) -> napi::Result { + let callable_total: i32 = { + let sql = format!( + "SELECT COUNT(*) FROM nodes WHERE kind IN ('function', 'method') {tf_file}", + ); + conn.prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats callable_total: {e}")))? + .query_row([], |row| row.get(0)) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats callable_total query: {e}")) + })? + }; + let callable_with_callers: i32 = { + let sql = format!( + "SELECT COUNT(DISTINCT e.target_id) FROM edges e \ + JOIN nodes n ON e.target_id = n.id \ + WHERE e.kind = 'calls' AND n.kind IN ('function', 'method') {tf_n_file}", + ); + conn.prepare_cached(&sql) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats callable_with_callers: {e}")) + })? 
+ .query_row([], |row| row.get(0)) + .map_err(|e| { + napi::Error::from_reason(format!( + "get_graph_stats callable_with_callers query: {e}" + )) + })? + }; + let call_edges: i32 = conn + .prepare_cached("SELECT COUNT(*) FROM edges WHERE kind = 'calls'") + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats call_edges: {e}")))? + .query_row([], |row| row.get(0)) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats call_edges query: {e}")) + })?; + let high_conf_call_edges: i32 = conn + .prepare_cached("SELECT COUNT(*) FROM edges WHERE kind = 'calls' AND confidence >= 0.7") + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats high_conf: {e}")))? + .query_row([], |row| row.get(0)) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats high_conf query: {e}")) + })?; + Ok(QualityMetrics { + callable_total, + callable_with_callers, + call_edges, + high_conf_call_edges, + }) +} + +fn fetch_file_hotspots( + conn: &rusqlite::Connection, + tf_n_file: &str, +) -> napi::Result> { + let sql = format!( + "SELECT n.file, \ + (SELECT COUNT(*) FROM edges WHERE target_id = n.id) as fan_in, \ + (SELECT COUNT(*) FROM edges WHERE source_id = n.id) as fan_out \ + FROM nodes n WHERE n.kind = 'file' {tf_n_file} \ + ORDER BY (SELECT COUNT(*) FROM edges WHERE target_id = n.id) \ + + (SELECT COUNT(*) FROM edges WHERE source_id = n.id) DESC \ + LIMIT 5", + ); + let mut stmt = conn + .prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok(FileHotspot { + file: row.get(0)?, + fan_in: row.get(1)?, + fan_out: row.get(2)?, + }) + }) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots query: {e}")))?; + rows.collect::, _>>() + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots collect: {e}"))) +} + +fn fetch_complexity_summary( + conn: &rusqlite::Connection, + tf_n_file: &str, +) -> napi::Result> { + if 
!has_table(conn, "function_complexity") { + return Ok(None); + } + let sql = format!( + "SELECT fc.cognitive, fc.cyclomatic, fc.max_nesting, fc.maintainability_index \ + FROM function_complexity fc JOIN nodes n ON fc.node_id = n.id \ + WHERE n.kind IN ('function','method') {tf_n_file}", + ); + let mut stmt = conn + .prepare_cached(&sql) + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats complexity: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok(( + row.get::<_, i32>(0)?, + row.get::<_, i32>(1)?, + row.get::<_, i32>(2)?, + row.get::<_, f64>(3).unwrap_or(0.0), + )) + }) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats complexity query: {e}")) + })?; + let data: Vec<(i32, i32, i32, f64)> = rows.collect::, _>>().map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats complexity collect: {e}")) + })?; + if data.is_empty() { + return Ok(None); + } + let n = data.len() as f64; + let sum_cog: i32 = data.iter().map(|d| d.0).sum(); + let sum_cyc: i32 = data.iter().map(|d| d.1).sum(); + let max_cog = data.iter().map(|d| d.0).max().unwrap_or(0); + let max_cyc = data.iter().map(|d| d.1).max().unwrap_or(0); + let sum_mi: f64 = data.iter().map(|d| d.3).sum(); + let min_mi = data.iter().map(|d| d.3).fold(f64::INFINITY, f64::min); + Ok(Some(ComplexitySummary { + analyzed: data.len() as i32, + avg_cognitive: (sum_cog as f64 / n * 10.0).round() / 10.0, + avg_cyclomatic: (sum_cyc as f64 / n * 10.0).round() / 10.0, + max_cognitive: max_cog, + max_cyclomatic: max_cyc, + avg_mi: (sum_mi / n * 10.0).round() / 10.0, + min_mi: (min_mi * 10.0).round() / 10.0, + })) +} + +// ── find_nodes_for_triage helpers ─────────────────────────────────────── + +fn validate_triage_kind(kind: Option<&str>) -> napi::Result<()> { + if let Some(k) = kind { + if !EVERY_SYMBOL_KIND.contains(&k) { + return Err(napi::Error::from_reason(format!( + "Invalid kind: {k} (expected one of {})", + EVERY_SYMBOL_KIND.join(", ") + ))); + } + } + Ok(()) +} + +fn 
validate_triage_role(role: Option<&str>) -> napi::Result<()> { + if let Some(r) = role { + if !VALID_ROLES.contains(&r) { + return Err(napi::Error::from_reason(format!( + "Invalid role: {r} (expected one of {})", + VALID_ROLES.join(", ") + ))); + } + } + Ok(()) +} + +fn build_triage_query( + kind: Option<&str>, + role: Option<&str>, + file: Option<&str>, + no_tests: bool, +) -> (String, Vec>) { + let kinds_to_use: Vec<&str> = match kind { + Some(k) => vec![k], + None => vec!["function", "method", "class"], + }; + let kind_placeholders: Vec = kinds_to_use + .iter() + .enumerate() + .map(|(i, _)| format!("?{}", i + 1)) + .collect(); + + let mut sql = format!( + "SELECT n.id, n.name, n.kind, n.file, n.line, n.end_line, \ + n.parent_id, n.exported, n.qualified_name, n.scope, n.visibility, n.role, \ + COALESCE(fi.cnt, 0) AS fan_in, \ + COALESCE(fc.cognitive, 0) AS cognitive, \ + COALESCE(fc.maintainability_index, 0) AS mi, \ + COALESCE(fc.cyclomatic, 0) AS cyclomatic, \ + COALESCE(fc.max_nesting, 0) AS max_nesting, \ + COALESCE(fcc.commit_count, 0) AS churn \ + FROM nodes n \ + LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi ON fi.target_id = n.id \ + LEFT JOIN function_complexity fc ON fc.node_id = n.id \ + LEFT JOIN file_commit_counts fcc ON n.file = fcc.file \ + WHERE n.kind IN ({kinds})", + kinds = kind_placeholders.join(", "), + ); + + let mut param_values: Vec> = Vec::new(); + for k in &kinds_to_use { + param_values.push(Box::new(k.to_string())); + } + let mut idx = kinds_to_use.len() + 1; + + if no_tests { + sql.push_str(&format!(" {}", test_filter_clauses("n.file"))); + } + if let Some(f) = file { + sql.push_str(&format!(" AND n.file LIKE ?{idx} ESCAPE '\\'")); + param_values.push(Box::new(format!("%{}%", escape_like(f)))); + idx += 1; + } + if let Some(r) = role { + if r == "dead" { + sql.push_str(&format!(" AND n.role LIKE ?{idx}")); + param_values.push(Box::new("dead%".to_string())); + } else { + 
sql.push_str(&format!(" AND n.role = ?{idx}")); + param_values.push(Box::new(r.to_string())); + } + } + sql.push_str(" ORDER BY n.file, n.line"); + (sql, param_values) +} + +fn read_triage_row(row: &rusqlite::Row) -> rusqlite::Result { + Ok(NativeTriageNodeRow { + id: row.get("id")?, + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + end_line: row.get("end_line")?, + parent_id: row.get("parent_id")?, + exported: row.get("exported")?, + qualified_name: row.get("qualified_name")?, + scope: row.get("scope")?, + visibility: row.get("visibility")?, + role: row.get("role")?, + fan_in: row.get("fan_in")?, + cognitive: row.get("cognitive")?, + mi: row.get("mi")?, + cyclomatic: row.get("cyclomatic")?, + max_nesting: row.get("max_nesting")?, + churn: row.get("churn")?, + }) +} + +fn fetch_embedding_info(conn: &rusqlite::Connection) -> napi::Result> { + if !has_table(conn, "embeddings") { + return Ok(None); + } + let count: i32 = conn + .prepare_cached("SELECT COUNT(*) FROM embeddings") + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats embeddings: {e}")))? 
+ .query_row([], |row| row.get(0)) + .unwrap_or(0); + if count == 0 { + return Ok(None); + } + if !has_table(conn, "embedding_meta") { + return Ok(Some(EmbeddingInfo { + count, + model: None, + dim: None, + built_at: None, + })); + } + let mut model: Option = None; + let mut dim: Option = None; + let mut built_at: Option = None; + let mut stmt = conn + .prepare_cached("SELECT key, value FROM embedding_meta") + .map_err(|e| napi::Error::from_reason(format!("get_graph_stats embedding_meta: {e}")))?; + let rows = stmt + .query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) + }) + .map_err(|e| { + napi::Error::from_reason(format!("get_graph_stats embedding_meta query: {e}")) + })?; + for row in rows.flatten() { + let (k, v) = row; + match k.as_str() { + "model" => model = Some(v), + "dim" => dim = v.parse().ok(), + "built_at" => built_at = Some(v), + _ => {} + } + } + Ok(Some(EmbeddingInfo { + count, + model, + dim, + built_at, + })) +} + // ── Query Methods ─────────────────────────────────────────────────────── #[napi] @@ -456,116 +1156,27 @@ impl NativeDatabase { file: Option, no_tests: Option, ) -> napi::Result> { - // Validate kind - if let Some(ref k) = kind { - if !EVERY_SYMBOL_KIND.contains(&k.as_str()) { - return Err(napi::Error::from_reason(format!( - "Invalid kind: {k} (expected one of {})", - EVERY_SYMBOL_KIND.join(", ") - ))); - } - } - // Validate role - if let Some(ref r) = role { - if !VALID_ROLES.contains(&r.as_str()) { - return Err(napi::Error::from_reason(format!( - "Invalid role: {r} (expected one of {})", - VALID_ROLES.join(", ") - ))); - } - } + validate_triage_kind(kind.as_deref())?; + validate_triage_role(role.as_deref())?; let conn = self.conn()?; - - let kinds_to_use: Vec<&str> = match kind { - Some(ref k) => vec![k.as_str()], - None => vec!["function", "method", "class"], - }; - let kind_placeholders: Vec = kinds_to_use - .iter() - .enumerate() - .map(|(i, _)| format!("?{}", i + 1)) - .collect(); - - let mut sql = 
format!( - "SELECT n.id, n.name, n.kind, n.file, n.line, n.end_line, \ - n.parent_id, n.exported, n.qualified_name, n.scope, n.visibility, n.role, \ - COALESCE(fi.cnt, 0) AS fan_in, \ - COALESCE(fc.cognitive, 0) AS cognitive, \ - COALESCE(fc.maintainability_index, 0) AS mi, \ - COALESCE(fc.cyclomatic, 0) AS cyclomatic, \ - COALESCE(fc.max_nesting, 0) AS max_nesting, \ - COALESCE(fcc.commit_count, 0) AS churn \ - FROM nodes n \ - LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi ON fi.target_id = n.id \ - LEFT JOIN function_complexity fc ON fc.node_id = n.id \ - LEFT JOIN file_commit_counts fcc ON n.file = fcc.file \ - WHERE n.kind IN ({kinds})", - kinds = kind_placeholders.join(", "), + let (sql, param_values) = build_triage_query( + kind.as_deref(), + role.as_deref(), + file.as_deref(), + no_tests.unwrap_or(false), ); - let mut param_values: Vec> = Vec::new(); - for k in &kinds_to_use { - param_values.push(Box::new(k.to_string())); - } - let mut idx = kinds_to_use.len() + 1; - - if no_tests.unwrap_or(false) { - sql.push_str(&format!(" {}", test_filter_clauses("n.file"))); - } - if let Some(ref f) = file { - sql.push_str(&format!(" AND n.file LIKE ?{idx} ESCAPE '\\'")); - param_values.push(Box::new(format!("%{}%", escape_like(f)))); - idx += 1; - } - if let Some(ref r) = role { - if r == "dead" { - sql.push_str(&format!(" AND n.role LIKE ?{idx}")); - param_values.push(Box::new("dead%".to_string())); - } else { - sql.push_str(&format!(" AND n.role = ?{idx}")); - param_values.push(Box::new(r.clone())); - } - } - sql.push_str(" ORDER BY n.file, n.line"); - - let mut stmt = conn - .prepare_cached(&sql) - .map_err(|e| { - napi::Error::from_reason(format!("find_nodes_for_triage prepare: {e}")) - })?; + let mut stmt = conn.prepare_cached(&sql).map_err(|e| { + napi::Error::from_reason(format!("find_nodes_for_triage prepare: {e}")) + })?; let params_ref: Vec<&dyn rusqlite::types::ToSql> = param_values.iter().map(|p| 
p.as_ref()).collect(); let rows = stmt - .query_map(params_ref.as_slice(), |row| { - Ok(NativeTriageNodeRow { - id: row.get("id")?, - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - end_line: row.get("end_line")?, - parent_id: row.get("parent_id")?, - exported: row.get("exported")?, - qualified_name: row.get("qualified_name")?, - scope: row.get("scope")?, - visibility: row.get("visibility")?, - role: row.get("role")?, - fan_in: row.get("fan_in")?, - cognitive: row.get("cognitive")?, - mi: row.get("mi")?, - cyclomatic: row.get("cyclomatic")?, - max_nesting: row.get("max_nesting")?, - churn: row.get("churn")?, - }) - }) - .map_err(|e| { - napi::Error::from_reason(format!("find_nodes_for_triage: {e}")) - })?; + .query_map(params_ref.as_slice(), read_triage_row) + .map_err(|e| napi::Error::from_reason(format!("find_nodes_for_triage: {e}")))?; rows.collect::, _>>() - .map_err(|e| { - napi::Error::from_reason(format!("find_nodes_for_triage collect: {e}")) - }) + .map_err(|e| napi::Error::from_reason(format!("find_nodes_for_triage collect: {e}"))) } /// List function/method/class nodes. 
@@ -1293,221 +1904,20 @@ impl NativeDatabase { #[napi] pub fn get_graph_stats(&self, no_tests: bool) -> napi::Result { let conn = self.conn()?; - let tf = if no_tests { - test_filter_clauses("file") - } else { - String::new() - }; - let tf_n = if no_tests { - test_filter_clauses("n.file") - } else { - String::new() - }; + let tf = if no_tests { test_filter_clauses("file") } else { String::new() }; + let tf_n = if no_tests { test_filter_clauses("n.file") } else { String::new() }; - // ── Node counts by kind ──────────────────────────────────── - let nodes_by_kind = { - let sql = format!( - "SELECT kind, COUNT(*) as c FROM nodes WHERE 1=1 {} GROUP BY kind", - tf - ); - let mut stmt = conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats nodes_by_kind: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok(KindCount { - kind: row.get::<_, String>(0)?, - count: row.get::<_, i32>(1)?, - }) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats nodes_by_kind query: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats nodes_by_kind collect: {e}")))? 
- }; + let nodes_by_kind = fetch_nodes_by_kind(conn, &tf)?; let total_nodes: i32 = nodes_by_kind.iter().map(|k| k.count).sum(); - // ── Edge counts by kind ──────────────────────────────────── - let edges_by_kind = { - let sql = if no_tests { - format!( - "SELECT e.kind, COUNT(*) as c FROM edges e \ - JOIN nodes ns ON e.source_id = ns.id \ - JOIN nodes nt ON e.target_id = nt.id \ - WHERE 1=1 {} {} GROUP BY e.kind", - test_filter_clauses("ns.file"), - test_filter_clauses("nt.file"), - ) - } else { - "SELECT kind, COUNT(*) as c FROM edges GROUP BY kind".to_string() - }; - let mut stmt = conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats edges_by_kind: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok(KindCount { - kind: row.get::<_, String>(0)?, - count: row.get::<_, i32>(1)?, - }) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats edges_by_kind query: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats edges_by_kind collect: {e}")))? - }; + let edges_by_kind = fetch_edges_by_kind(conn, no_tests)?; let total_edges: i32 = edges_by_kind.iter().map(|k| k.count).sum(); - // ── Role counts ──────────────────────────────────────────── - let role_counts = { - let sql = format!( - "SELECT role, COUNT(*) as c FROM nodes WHERE role IS NOT NULL {} GROUP BY role", - tf - ); - let mut stmt = conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats role_counts: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok(RoleCount { - role: row.get::<_, String>(0)?, - count: row.get::<_, i32>(1)?, - }) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats role_counts query: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats role_counts collect: {e}")))? 
- }; - - // ── Quality metrics ──────────────────────────────────────── - let callable_total: i32 = { - let sql = format!( - "SELECT COUNT(*) FROM nodes WHERE kind IN ('function', 'method') {}", - tf - ); - conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats callable_total: {e}")))? - .query_row([], |row| row.get(0)) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats callable_total query: {e}")))? - }; - let callable_with_callers: i32 = { - let sql = format!( - "SELECT COUNT(DISTINCT e.target_id) FROM edges e \ - JOIN nodes n ON e.target_id = n.id \ - WHERE e.kind = 'calls' AND n.kind IN ('function', 'method') {}", - tf_n - ); - conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats callable_with_callers: {e}")))? - .query_row([], |row| row.get(0)) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats callable_with_callers query: {e}")))? - }; - let call_edges: i32 = conn - .prepare_cached("SELECT COUNT(*) FROM edges WHERE kind = 'calls'") - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats call_edges: {e}")))? - .query_row([], |row| row.get(0)) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats call_edges query: {e}")))?; - let high_conf_call_edges: i32 = conn - .prepare_cached("SELECT COUNT(*) FROM edges WHERE kind = 'calls' AND confidence >= 0.7") - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats high_conf: {e}")))? 
- .query_row([], |row| row.get(0)) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats high_conf query: {e}")))?; - - // ── Hotspots (top 5 files by coupling) ───────────────────── - let hotspots = { - let sql = format!( - "SELECT n.file, \ - (SELECT COUNT(*) FROM edges WHERE target_id = n.id) as fan_in, \ - (SELECT COUNT(*) FROM edges WHERE source_id = n.id) as fan_out \ - FROM nodes n WHERE n.kind = 'file' {} \ - ORDER BY (SELECT COUNT(*) FROM edges WHERE target_id = n.id) \ - + (SELECT COUNT(*) FROM edges WHERE source_id = n.id) DESC \ - LIMIT 5", - tf_n - ); - let mut stmt = conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok(FileHotspot { - file: row.get(0)?, - fan_in: row.get(1)?, - fan_out: row.get(2)?, - }) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots query: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats hotspots collect: {e}")))? 
- }; - - // ── Complexity summary ───────────────────────────────────── - let complexity = if has_table(conn, "function_complexity") { - let sql = format!( - "SELECT fc.cognitive, fc.cyclomatic, fc.max_nesting, fc.maintainability_index \ - FROM function_complexity fc JOIN nodes n ON fc.node_id = n.id \ - WHERE n.kind IN ('function','method') {}", - tf_n - ); - let mut stmt = conn.prepare_cached(&sql) - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats complexity: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok(( - row.get::<_, i32>(0)?, - row.get::<_, i32>(1)?, - row.get::<_, i32>(2)?, - row.get::<_, f64>(3).unwrap_or(0.0), - )) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats complexity query: {e}")))?; - let data: Vec<(i32, i32, i32, f64)> = rows - .collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats complexity collect: {e}")))?; - if data.is_empty() { - None - } else { - let n = data.len() as f64; - let sum_cog: i32 = data.iter().map(|d| d.0).sum(); - let sum_cyc: i32 = data.iter().map(|d| d.1).sum(); - let max_cog = data.iter().map(|d| d.0).max().unwrap_or(0); - let max_cyc = data.iter().map(|d| d.1).max().unwrap_or(0); - let sum_mi: f64 = data.iter().map(|d| d.3).sum(); - let min_mi = data.iter().map(|d| d.3).fold(f64::INFINITY, f64::min); - Some(ComplexitySummary { - analyzed: data.len() as i32, - avg_cognitive: (sum_cog as f64 / n * 10.0).round() / 10.0, - avg_cyclomatic: (sum_cyc as f64 / n * 10.0).round() / 10.0, - max_cognitive: max_cog, - max_cyclomatic: max_cyc, - avg_mi: (sum_mi / n * 10.0).round() / 10.0, - min_mi: (min_mi * 10.0).round() / 10.0, - }) - } - } else { - None - }; - - // ── Embeddings info ──────────────────────────────────────── - let embeddings = if has_table(conn, "embeddings") { - let count: i32 = conn - .prepare_cached("SELECT COUNT(*) FROM embeddings") - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats embeddings: {e}")))? 
- .query_row([], |row| row.get(0)) - .unwrap_or(0); - if count > 0 && has_table(conn, "embedding_meta") { - let mut model: Option = None; - let mut dim: Option = None; - let mut built_at: Option = None; - let mut stmt = conn - .prepare_cached("SELECT key, value FROM embedding_meta") - .map_err(|e| napi::Error::from_reason(format!("get_graph_stats embedding_meta: {e}")))?; - let rows = stmt.query_map([], |row| { - Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?)) - }).map_err(|e| napi::Error::from_reason(format!("get_graph_stats embedding_meta query: {e}")))?; - for row in rows { - if let Ok((k, v)) = row { - match k.as_str() { - "model" => model = Some(v), - "dim" => dim = v.parse().ok(), - "built_at" => built_at = Some(v), - _ => {} - } - } - } - Some(EmbeddingInfo { count, model, dim, built_at }) - } else if count > 0 { - Some(EmbeddingInfo { count, model: None, dim: None, built_at: None }) - } else { - None - } - } else { - None - }; + let role_counts = fetch_role_counts(conn, &tf)?; + let quality = fetch_quality_metrics(conn, &tf, &tf_n)?; + let hotspots = fetch_file_hotspots(conn, &tf_n)?; + let complexity = fetch_complexity_summary(conn, &tf_n)?; + let embeddings = fetch_embedding_info(conn)?; Ok(GraphStats { total_nodes, @@ -1515,12 +1925,7 @@ impl NativeDatabase { nodes_by_kind, edges_by_kind, role_counts, - quality: QualityMetrics { - callable_total, - callable_with_callers, - call_edges, - high_conf_call_edges, - }, + quality, hotspots, complexity, embeddings, @@ -1715,284 +2120,80 @@ impl NativeDatabase { let lower_query = name.to_lowercase(); // ── Step 1: Find matching nodes with fan-in (relevance ranking) ─── - let default_kinds = vec![ - "function".to_string(), - "method".to_string(), - "class".to_string(), - "constant".to_string(), - ]; - let kinds = if let Some(ref k) = kind { - vec![k.clone()] - } else { - default_kinds - }; - - let mut sql = String::from( - "SELECT n.id, n.name, n.kind, n.file, n.line, n.end_line, n.role, \ - 
COALESCE(fi.cnt, 0) AS fan_in \ - FROM nodes n \ - LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi \ - ON fi.target_id = n.id \ - WHERE n.name LIKE ?1", - ); - let mut param_values: Vec> = - vec![Box::new(format!("%{name}%"))]; - let mut idx = 2; - - if !kinds.is_empty() { - let placeholders: Vec = - kinds.iter().enumerate().map(|(i, _)| format!("?{}", idx + i)).collect(); - sql.push_str(&format!(" AND n.kind IN ({})", placeholders.join(", "))); - for k in &kinds { - param_values.push(Box::new(k.clone())); - } - idx += kinds.len(); - } - if let Some(ref f) = file { - sql.push_str(&format!(" AND n.file LIKE ?{idx} ESCAPE '\\'")); - param_values.push(Box::new(format!("%{}%", escape_like(f)))); - } - + let (sql, param_values) = + build_fn_deps_match_query(&name, kind.as_deref(), file.as_deref()); let params_ref: Vec<&dyn rusqlite::types::ToSql> = param_values.iter().map(|p| p.as_ref()).collect(); - struct MatchedNode { - id: i32, - name: String, - kind: String, - file: String, - line: Option, - end_line: Option, - role: Option, - fan_in: i32, - } - - let mut matched: Vec = { - let mut stmt = conn.prepare_cached(&sql) + let mut matched: Vec = { + let mut stmt = conn + .prepare_cached(&sql) .map_err(|e| napi::Error::from_reason(format!("fn_deps find_nodes prepare: {e}")))?; - let rows = stmt.query_map(params_ref.as_slice(), |row| { - Ok(MatchedNode { - id: row.get("id")?, - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - end_line: row.get("end_line")?, - role: row.get("role")?, - fan_in: row.get("fan_in")?, + let rows = stmt + .query_map(params_ref.as_slice(), |row| { + Ok(FnDepsMatchedNode { + id: row.get("id")?, + name: row.get("name")?, + kind: row.get("kind")?, + file: row.get("file")?, + line: row.get("line")?, + end_line: row.get("end_line")?, + role: row.get("role")?, + fan_in: row.get("fan_in")?, + }) }) - }).map_err(|e| 
napi::Error::from_reason(format!("fn_deps find_nodes: {e}")))?; + .map_err(|e| napi::Error::from_reason(format!("fn_deps find_nodes: {e}")))?; rows.collect::, _>>() .map_err(|e| napi::Error::from_reason(format!("fn_deps find_nodes collect: {e}")))? }; - // Filter test files if no_tests { matched.retain(|n| !is_test_file(&n.file)); } - - // Relevance scoring (mirrors JS findMatchingNodes) matched.sort_by(|a, b| { - let score = |node: &MatchedNode| -> f64 { - let lower_name = node.name.to_lowercase(); - let bare_name = lower_name.rsplit('.').next().unwrap_or(&lower_name); - let match_score = if lower_name == lower_query || bare_name == lower_query { - 100.0 - } else if lower_name.starts_with(&lower_query) || bare_name.starts_with(&lower_query) { - 60.0 - } else if lower_name.contains(&format!(".{lower_query}")) || lower_name.contains(&format!("{lower_query}.")) { - 40.0 - } else { - 10.0 - }; - let fan_in_bonus = ((node.fan_in as f64 + 1.0).log2() * 5.0).min(25.0); - match_score + fan_in_bonus - }; - score(b).partial_cmp(&score(a)).unwrap_or(std::cmp::Ordering::Equal) + fn_deps_relevance_score(b, &lower_query) + .partial_cmp(&fn_deps_relevance_score(a, &lower_query)) + .unwrap_or(std::cmp::Ordering::Equal) }); // ── Step 2: Build result for each matched node ──────────────────── let mut file_hash_cache: HashMap> = HashMap::new(); - let mut results = Vec::with_capacity(matched.len()); - for node in &matched { - // Callees - let callees: Vec = { - let mut stmt = conn.prepare_cached( - "SELECT DISTINCT n.id, n.name, n.kind, n.file, n.line \ - FROM edges e JOIN nodes n ON e.target_id = n.id \ - WHERE e.source_id = ?1 AND e.kind = 'calls'" - ).map_err(|e| napi::Error::from_reason(format!("fn_deps callees prepare: {e}")))?; - let rows = stmt.query_map(params![node.id], |row| { - Ok(FnDepsNode { - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - }) - }).map_err(|e| napi::Error::from_reason(format!("fn_deps callees: 
{e}")))?; - let mut v: Vec = rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("fn_deps callees collect: {e}")))?; - if no_tests { - v.retain(|c| !is_test_file(&c.file)); - } - v - }; - // Callers (direct) — query includes `id` for BFS reuse - struct CallerWithId { id: i32, name: String, kind: String, file: String, line: Option, via_hierarchy: Option } - let mut callers_with_id: Vec = { - let mut stmt = conn.prepare_cached( - "SELECT n.id, n.name, n.kind, n.file, n.line \ - FROM edges e JOIN nodes n ON e.source_id = n.id \ - WHERE e.target_id = ?1 AND e.kind = 'calls'" - ).map_err(|e| napi::Error::from_reason(format!("fn_deps callers prepare: {e}")))?; - let rows = stmt.query_map(params![node.id], |row| { - Ok(CallerWithId { - id: row.get("id")?, - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - via_hierarchy: None, - }) - }).map_err(|e| napi::Error::from_reason(format!("fn_deps callers: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("fn_deps callers collect: {e}")))? - }; - - // Method hierarchy resolution - if node.kind == "method" && node.name.contains('.') { - if let Some(method_name) = node.name.split('.').last() { - let pattern = format!("%.{method_name}"); - let related: Vec<(i32, String)> = { - let mut stmt = conn.prepare_cached( - "SELECT n.id, n.name FROM nodes n \ - LEFT JOIN (SELECT target_id, COUNT(*) AS cnt FROM edges WHERE kind = 'calls' GROUP BY target_id) fi \ - ON fi.target_id = n.id \ - WHERE n.name LIKE ?1 AND n.kind = 'method'" - ).map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy prepare: {e}")))?; - let rows = stmt.query_map(params![pattern], |row| { - Ok((row.get::<_, i32>("id")?, row.get::<_, String>("name")?)) - }).map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy: {e}")))?; - rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy collect: {e}")))? 
- }; - for (rm_id, rm_name) in &related { - if *rm_id == node.id { continue; } - let mut stmt = conn.prepare_cached( - "SELECT n.id, n.name, n.kind, n.file, n.line \ - FROM edges e JOIN nodes n ON e.source_id = n.id \ - WHERE e.target_id = ?1 AND e.kind = 'calls'" - ).map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy callers prepare: {e}")))?; - let rows = stmt.query_map(params![rm_id], |row| { - Ok(CallerWithId { - id: row.get("id")?, - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - via_hierarchy: Some(rm_name.clone()), - }) - }).map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy callers: {e}")))?; - let extra: Vec = rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("fn_deps hierarchy callers collect: {e}")))?; - callers_with_id.extend(extra); - } - } - } + for node in &matched { + let callees = fetch_fn_deps_callees(conn, node.id, no_tests)?; + let mut callers_with_id = fetch_fn_deps_direct_callers(conn, node.id)?; + expand_method_hierarchy_callers(conn, node, &mut callers_with_id)?; if no_tests { callers_with_id.retain(|c| !is_test_file(&c.file)); } - // Convert to FnDepsCallerNode for output (strip id) - let callers: Vec = callers_with_id.iter().map(|c| FnDepsCallerNode { - name: c.name.clone(), - kind: c.kind.clone(), - file: c.file.clone(), - line: c.line, - via_hierarchy: c.via_hierarchy.clone(), - }).collect(); - - // BFS transitive callers — reuse callers_with_id as initial frontier - let transitive_callers = if depth > 1 { - let mut visited = HashSet::new(); - visited.insert(node.id); - let initial_frontier: Vec = callers_with_id.iter().map(|c| CallerWithId { - id: c.id, name: c.name.clone(), kind: c.kind.clone(), file: c.file.clone(), line: c.line, via_hierarchy: c.via_hierarchy.clone(), - }).collect(); - let mut frontier: Vec = initial_frontier; - let mut groups: Vec = Vec::new(); - - for d in 2..=depth { - let unvisited: Vec<&CallerWithId> = 
frontier.iter() - .filter(|f| !visited.contains(&f.id)) - .collect(); - for f in &unvisited { - visited.insert(f.id); - } - if unvisited.is_empty() { break; } - - // Batch query: find all callers of the unvisited frontier - let mut next_frontier: Vec = Vec::new(); - let mut next_ids = HashSet::new(); - for f in &unvisited { - let mut stmt = conn.prepare_cached( - "SELECT n.id, n.name, n.kind, n.file, n.line \ - FROM edges e JOIN nodes n ON e.source_id = n.id \ - WHERE e.target_id = ?1 AND e.kind = 'calls'" - ).map_err(|e| napi::Error::from_reason(format!("fn_deps bfs prepare: {e}")))?; - let rows = stmt.query_map(params![f.id], |row| { - Ok(CallerWithId { - id: row.get("id")?, - name: row.get("name")?, - kind: row.get("kind")?, - file: row.get("file")?, - line: row.get("line")?, - via_hierarchy: None, - }) - }).map_err(|e| napi::Error::from_reason(format!("fn_deps bfs: {e}")))?; - let upstream: Vec = rows.collect::, _>>() - .map_err(|e| napi::Error::from_reason(format!("fn_deps bfs collect: {e}")))?; - for u in upstream { - if no_tests && is_test_file(&u.file) { continue; } - if !visited.contains(&u.id) && !next_ids.contains(&u.id) { - next_ids.insert(u.id); - next_frontier.push(u); - } - } - } - - if !next_frontier.is_empty() { - groups.push(FnDepsTransitiveGroup { - depth: d as i32, - callers: next_frontier.iter().map(|n| FnDepsNode { - name: n.name.clone(), - kind: n.kind.clone(), - file: n.file.clone(), - line: n.line, - }).collect(), - }); - } - frontier = next_frontier; - } - groups - } else { - Vec::new() - }; - - // File hash (cached) - let file_hash = if !file_hash_cache.contains_key(&node.file) { - let hash: Option = conn.prepare_cached( - "SELECT hash FROM file_hashes WHERE file = ?1" - ).ok().and_then(|mut stmt| { - stmt.query_row(params![node.file], |row| row.get(0)).ok() - }); - file_hash_cache.insert(node.file.clone(), hash.clone()); - hash - } else { - file_hash_cache.get(&node.file).cloned().flatten() - }; + let callers: Vec = callers_with_id + 
.iter() + .map(|c| FnDepsCallerNode { + name: c.name.clone(), + kind: c.kind.clone(), + file: c.file.clone(), + line: c.line, + via_hierarchy: c.via_hierarchy.clone(), + }) + .collect(); + + let initial_frontier: Vec = callers_with_id + .iter() + .map(|c| FnDepsCallerWithId { + id: c.id, + name: c.name.clone(), + kind: c.kind.clone(), + file: c.file.clone(), + line: c.line, + via_hierarchy: c.via_hierarchy.clone(), + }) + .collect(); + let transitive_callers = + bfs_transitive_callers(conn, node.id, initial_frontier, depth, no_tests)?; + + let file_hash = fn_deps_cached_file_hash(conn, &mut file_hash_cache, &node.file); results.push(FnDepsEntry { name: node.name.clone(), diff --git a/crates/codegraph-core/src/structure.rs b/crates/codegraph-core/src/structure.rs index ce5609640..b34307a8f 100644 --- a/crates/codegraph-core/src/structure.rs +++ b/crates/codegraph-core/src/structure.rs @@ -401,6 +401,111 @@ fn load_file_paths_in_dirs(conn: &Connection, dirs: &HashSet) -> Vec, + all_file_paths: &[String], + affected_dirs: Option<&HashSet>, +) { + let mut seen_files: HashSet = HashSet::new(); + let file_paths_iter = file_symbols + .keys() + .map(|s| s.as_str()) + .chain(all_file_paths.iter().map(|s| s.as_str())); + + for rel_path in file_paths_iter { + if !seen_files.insert(rel_path.to_string()) { + continue; + } + let dir = match parent_dir(rel_path) { + Some(d) => d, + None => continue, + }; + if let Some(ad) = affected_dirs { + if !ad.contains(&dir) { + continue; + } + } + let dir_id = match get_node_id(tx, &dir, "directory", &dir, 0) { + Some(id) => id, + None => continue, + }; + let file_id = match get_node_id(tx, rel_path, "file", rel_path, 0) { + Some(id) => id, + None => continue, + }; + let _ = stmt.execute(rusqlite::params![dir_id, file_id]); + } +} + +/// Emit `parent_dir → child_dir` contains edges for every entry in +/// `all_dirs` whose parent is in scope. 
+fn insert_dir_to_dir_contains_edges( + tx: &rusqlite::Transaction, + stmt: &mut rusqlite::Statement, + all_dirs: &HashSet, + affected_dirs: Option<&HashSet>, +) { + for dir in all_dirs { + let parent = match parent_dir(dir) { + Some(p) => p, + None => continue, + }; + if parent == *dir { + continue; + } + if let Some(ad) = affected_dirs { + if !ad.contains(&parent) { + continue; + } + } + let parent_id = match get_node_id(tx, &parent, "directory", &parent, 0) { + Some(id) => id, + None => continue, + }; + let child_id = match get_node_id(tx, dir, "directory", dir, 0) { + Some(id) => id, + None => continue, + }; + let _ = stmt.execute(rusqlite::params![parent_id, child_id]); + } +} + +/// Restore `parent → child` directory contains edges that were dropped by +/// cleanup for sibling subdirectories that aren't in `all_dirs` (no changed +/// file under them) but still exist in the DB. +fn restore_unchanged_dir_edges( + tx: &rusqlite::Transaction, + stmt: &mut rusqlite::Statement, + all_dirs: &HashSet, + affected_dirs: &HashSet, +) { + let db_child_dirs = load_child_dirs_in_affected(tx, affected_dirs); + for child_dir in &db_child_dirs { + if all_dirs.contains(child_dir.as_str()) { + continue; + } + let parent = match parent_dir(child_dir) { + Some(p) => p, + None => continue, + }; + if !affected_dirs.contains(&parent) { + continue; + } + if let (Some(p_id), Some(c_id)) = ( + get_node_id(tx, &parent, "directory", &parent, 0), + get_node_id(tx, child_dir, "directory", child_dir, 0), + ) { + let _ = stmt.execute(rusqlite::params![p_id, c_id]); + } + } +} + fn insert_contains_edges( conn: &Connection, file_symbols: &HashMap, @@ -422,96 +527,23 @@ fn insert_contains_edges( Err(_) => return, }; - // In incremental mode, we need ALL file paths in affected directories, - // not just the changed files in file_symbols. Load existing file nodes - // from the DB so unchanged files keep their dir→file containment edges. 
- let all_file_paths: Vec = if affected_dirs.is_some() { - load_file_paths_in_dirs(&tx, affected_dirs.as_ref().unwrap()) + let all_file_paths: Vec = if let Some(ref ad) = affected_dirs { + load_file_paths_in_dirs(&tx, ad) } else { Vec::new() }; - // Directory → file edges: iterate over file_symbols keys (covers - // changed/parsed files) plus DB-loaded paths (covers unchanged files - // in affected directories during incremental builds). - let mut seen_files: HashSet = HashSet::new(); - let file_paths_iter = file_symbols - .keys() - .map(|s| s.as_str()) - .chain(all_file_paths.iter().map(|s| s.as_str())); - - for rel_path in file_paths_iter { - if !seen_files.insert(rel_path.to_string()) { - continue; // deduplicate - } - let dir = match parent_dir(rel_path) { - Some(d) => d, - None => continue, - }; - // Skip unaffected directories in incremental mode - if let Some(ref ad) = affected_dirs { - if !ad.contains(&dir) { - continue; - } - } - let dir_id = match get_node_id(&tx, &dir, "directory", &dir, 0) { - Some(id) => id, - None => continue, - }; - let file_id = match get_node_id(&tx, rel_path, "file", rel_path, 0) { - Some(id) => id, - None => continue, - }; - let _ = stmt.execute(rusqlite::params![dir_id, file_id]); - } - - // Parent directory → child directory edges - for dir in all_dirs { - let parent = match parent_dir(dir) { - Some(p) => p, - None => continue, - }; - if parent == *dir { - continue; - } - if let Some(ref ad) = affected_dirs { - if !ad.contains(&parent) { - continue; - } - } - let parent_id = match get_node_id(&tx, &parent, "directory", &parent, 0) { - Some(id) => id, - None => continue, - }; - let child_id = match get_node_id(&tx, dir, "directory", dir, 0) { - Some(id) => id, - None => continue, - }; - let _ = stmt.execute(rusqlite::params![parent_id, child_id]); - } + insert_dir_to_file_contains_edges( + &tx, + &mut stmt, + file_symbols, + &all_file_paths, + affected_dirs.as_ref(), + ); + insert_dir_to_dir_contains_edges(&tx, &mut stmt, 
all_dirs, affected_dirs.as_ref()); - // Restore dir→dir edges for unchanged sibling subdirectories that - // were cleaned up but aren't in all_dirs (no changed file under them). if let Some(ref ad) = affected_dirs { - let db_child_dirs = load_child_dirs_in_affected(&tx, ad); - for child_dir in &db_child_dirs { - if all_dirs.contains(child_dir.as_str()) { - continue; // already handled above - } - let parent = match parent_dir(child_dir) { - Some(p) => p, - None => continue, - }; - if !ad.contains(&parent) { - continue; - } - if let (Some(p_id), Some(c_id)) = ( - get_node_id(&tx, &parent, "directory", &parent, 0), - get_node_id(&tx, child_dir, "directory", child_dir, 0), - ) { - let _ = stmt.execute(rusqlite::params![p_id, c_id]); - } - } + restore_unchanged_dir_edges(&tx, &mut stmt, all_dirs, ad); } } let _ = tx.commit(); @@ -646,97 +678,96 @@ fn compute_file_metrics( let _ = tx.commit(); } -fn compute_directory_metrics( - conn: &Connection, - file_symbols: &HashMap, - all_dirs: &HashSet, - import_edges: &[ImportEdge], -) { - // Load ALL file paths from DB so directory metrics account for unchanged - // files during incremental builds (file_symbols only has changed files). - let all_db_files: Vec = { - let mut v = Vec::new(); - if let Ok(mut stmt) = conn.prepare("SELECT name FROM nodes WHERE kind = 'file'") { - if let Ok(rows) = stmt.query_map([], |row| row.get::<_, String>(0)) { - for row in rows.flatten() { - v.push(row); - } +/// Load every file path stored as a `kind='file'` node in the DB. +fn load_all_file_paths_from_db(conn: &Connection) -> Vec { + let mut v = Vec::new(); + if let Ok(mut stmt) = conn.prepare("SELECT name FROM nodes WHERE kind = 'file'") { + if let Ok(rows) = stmt.query_map([], |row| row.get::<_, String>(0)) { + for row in rows.flatten() { + v.push(row); } } - v + } + v +} + +/// Walk a relative file path up through its ancestor directories, pushing +/// the file's path slice into each ancestor's bucket in `dir_files`. 
+fn record_file_in_ancestor_dirs<'a>( + rel_path: &'a str, + dir_files: &mut HashMap<&'a str, Vec<&'a str>>, +) { + let mut d = match parent_dir(rel_path) { + Some(p) => p, + None => return, }; + while !d.is_empty() && d != "." { + if let Some(files) = dir_files.get_mut(d.as_str()) { + files.push(rel_path); + } + d = match parent_dir(&d) { + Some(p) => p, + None => break, + }; + } +} - // Build dir→files map (transitive: each dir contains all files in all subdirs). - // Uses DB files as the complete set, supplemented by file_symbols for any - // files not yet in the DB (full build where nodes were just inserted). +/// Build the `dir → ancestor files` map. DB files are the authoritative set +/// for incremental builds; `file_symbols` adds anything newly-inserted that +/// hasn't yet shown up via the DB query (full-build first run). +fn build_dir_files_map<'a>( + all_dirs: &'a HashSet, + all_db_files: &'a [String], + file_symbols: &'a HashMap, +) -> HashMap<&'a str, Vec<&'a str>> { let mut dir_files: HashMap<&str, Vec<&str>> = HashMap::new(); for dir in all_dirs { dir_files.insert(dir.as_str(), Vec::new()); } let mut seen_files: HashSet<&str> = HashSet::new(); - // First: DB files (complete set for incremental builds) - for rel_path in &all_db_files { - if !seen_files.insert(rel_path.as_str()) { - continue; - } - let mut d = match parent_dir(rel_path) { - Some(p) => p, - None => continue, - }; - while !d.is_empty() && d != "." 
{ - if let Some(files) = dir_files.get_mut(d.as_str()) { - files.push(rel_path.as_str()); - } - d = match parent_dir(&d) { - Some(p) => p, - None => break, - }; + for rel_path in all_db_files { + if seen_files.insert(rel_path.as_str()) { + record_file_in_ancestor_dirs(rel_path.as_str(), &mut dir_files); } } - // Second: file_symbols keys (covers newly-inserted files in full builds) for rel_path in file_symbols.keys() { - if !seen_files.insert(rel_path.as_str()) { - continue; - } - let mut d = match parent_dir(rel_path) { - Some(p) => p, - None => continue, - }; - while !d.is_empty() && d != "." { - if let Some(files) = dir_files.get_mut(d.as_str()) { - files.push(rel_path.as_str()); - } - d = match parent_dir(&d) { - Some(p) => p, - None => break, - }; + if seen_files.insert(rel_path.as_str()) { + record_file_in_ancestor_dirs(rel_path.as_str(), &mut dir_files); } } + dir_files +} - // Build reverse map: file → set of ancestor directories +/// Invert `dir_files` to a `file → ancestor dirs` map. +fn build_file_to_ancestor_dirs<'a>( + dir_files: &'a HashMap<&'a str, Vec<&'a str>>, +) -> HashMap<&'a str, HashSet<&'a str>> { let mut file_to_ancestor_dirs: HashMap<&str, HashSet<&str>> = HashMap::new(); - for (dir, files) in &dir_files { + for (dir, files) in dir_files { for f in files { - file_to_ancestor_dirs - .entry(f) - .or_default() - .insert(dir); + file_to_ancestor_dirs.entry(*f).or_default().insert(*dir); } } + file_to_ancestor_dirs +} - // Count intra-directory, fan-in, and fan-out edges per directory - let mut dir_edge_counts: HashMap<&str, (i64, i64, i64)> = HashMap::new(); // (intra, fan_in, fan_out) +/// Tally intra-directory, fan-in, and fan-out edge counts per directory by +/// classifying each import edge against the ancestor sets of its endpoints. 
+fn count_directory_edges<'a>( + all_dirs: &'a HashSet, + file_to_ancestor_dirs: &HashMap<&'a str, HashSet<&'a str>>, + import_edges: &[ImportEdge], +) -> HashMap<&'a str, (i64, i64, i64)> { + let mut dir_edge_counts: HashMap<&str, (i64, i64, i64)> = HashMap::new(); for dir in all_dirs { dir_edge_counts.insert(dir.as_str(), (0, 0, 0)); } for edge in import_edges { let src_dirs = file_to_ancestor_dirs.get(edge.source_file.as_str()); let tgt_dirs = file_to_ancestor_dirs.get(edge.target_file.as_str()); - if src_dirs.is_none() && tgt_dirs.is_none() { continue; } - if let Some(src_dirs) = src_dirs { for dir in src_dirs { if let Some(counts) = dir_edge_counts.get_mut(dir) { @@ -758,10 +789,11 @@ fn compute_directory_metrics( } } } + dir_edge_counts +} - // Count symbols per directory. - // Use DB counts (covers all files including unchanged ones in incremental - // builds) and fall back to file_symbols for newly-inserted files. +/// Load per-file symbol counts from the DB (one query per build). +fn load_db_symbol_counts(conn: &Connection) -> HashMap { let mut db_symbol_counts: HashMap = HashMap::new(); if let Ok(mut stmt) = conn.prepare( "SELECT file, COUNT(*) FROM nodes \ @@ -776,26 +808,53 @@ fn compute_directory_metrics( } } } + db_symbol_counts +} + +/// Count distinct definitions in `file_symbols` for a single newly-inserted +/// file (used as a fallback when DB counts haven't been written yet). +fn count_distinct_definitions(sym: &FileSymbols) -> i64 { + let mut seen = HashSet::new(); + let mut count: i64 = 0; + for d in &sym.definitions { + let key = format!("{}|{}|{}", d.name, d.kind, d.line); + if seen.insert(key) { + count += 1; + } + } + count +} + +/// Compute per-directory symbol counts by summing DB counts for every file +/// under the directory, falling back to in-memory `file_symbols` for any +/// files not yet persisted. 
+fn compute_dir_symbol_counts<'a>( + dir_files: &HashMap<&'a str, Vec<&'a str>>, + db_symbol_counts: &HashMap, + file_symbols: &HashMap, +) -> HashMap<&'a str, i64> { let mut dir_symbol_counts: HashMap<&str, i64> = HashMap::new(); - for (dir, files) in &dir_files { + for (dir, files) in dir_files { let mut count: i64 = 0; for f in files { if let Some(&c) = db_symbol_counts.get(*f) { count += c; } else if let Some(sym) = file_symbols.get(*f) { - let mut seen = HashSet::new(); - for d in &sym.definitions { - let key = format!("{}|{}|{}", d.name, d.kind, d.line); - if seen.insert(key) { - count += 1; - } - } + count += count_distinct_definitions(sym); } } - dir_symbol_counts.insert(dir, count); + dir_symbol_counts.insert(*dir, count); } + dir_symbol_counts +} - // Write directory metrics +/// Write the directory metrics rows produced by the previous helpers. +fn write_directory_metric_rows( + conn: &Connection, + dir_files: &HashMap<&str, Vec<&str>>, + dir_symbol_counts: &HashMap<&str, i64>, + dir_edge_counts: &HashMap<&str, (i64, i64, i64)>, +) { let tx = match conn.unchecked_transaction() { Ok(tx) => tx, Err(_) => return, @@ -809,13 +868,11 @@ fn compute_directory_metrics( Ok(s) => s, Err(_) => return, }; - - for (dir, files) in &dir_files { + for (dir, files) in dir_files { let dir_id = match get_node_id(&tx, dir, "directory", dir, 0) { Some(id) => id, None => continue, }; - let file_count = files.len() as i64; let symbol_count = dir_symbol_counts.get(dir).copied().unwrap_or(0); let (intra, fan_in, fan_out) = dir_edge_counts.get(dir).copied().unwrap_or((0, 0, 0)); @@ -825,7 +882,6 @@ fn compute_directory_metrics( } else { None }; - let _ = upsert.execute(rusqlite::params![ dir_id, symbol_count, @@ -839,6 +895,25 @@ fn compute_directory_metrics( let _ = tx.commit(); } +fn compute_directory_metrics( + conn: &Connection, + file_symbols: &HashMap, + all_dirs: &HashSet, + import_edges: &[ImportEdge], +) { + // Load ALL file paths from DB so directory metrics account for 
unchanged + // files during incremental builds (file_symbols only has changed files). + let all_db_files = load_all_file_paths_from_db(conn); + let dir_files = build_dir_files_map(all_dirs, &all_db_files, file_symbols); + let file_to_ancestor_dirs = build_file_to_ancestor_dirs(&dir_files); + let dir_edge_counts = + count_directory_edges(all_dirs, &file_to_ancestor_dirs, import_edges); + let db_symbol_counts = load_db_symbol_counts(conn); + let dir_symbol_counts = + compute_dir_symbol_counts(&dir_files, &db_symbol_counts, file_symbols); + write_directory_metric_rows(conn, &dir_files, &dir_symbol_counts, &dir_edge_counts); +} + #[cfg(test)] mod tests { use super::*; From d2eab30ebf0b5462f6842d45b41ae5a17a6de513 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:24:03 -0600 Subject: [PATCH 15/27] refactor(parser): extract LANGUAGE_REGISTRY iteration and worker boundary helpers --- src/domain/parser.ts | 178 ++++++++++----- src/domain/wasm-worker-entry.ts | 384 +++++++++++++++++++------------- 2 files changed, 354 insertions(+), 208 deletions(-) diff --git a/src/domain/parser.ts b/src/domain/parser.ts index bb53192c9..38ebc035a 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -322,12 +322,15 @@ export function getParser(parsers: Map, filePath: string) * without _tree", which was the source of #1036 — a single file missing one * analysis triggered a full-build re-parse of every WASM-parseable file. */ -export async function ensureWasmTrees( +/** + * Select files from `fileSymbols` that still need analysis data and are + * parseable by an installed WASM grammar. Pure (no I/O) — safe to unit-test. + */ +function collectBackfillPending( fileSymbols: Map, rootDir: string, needsFn?: (relPath: string, symbols: any) => boolean, -): Promise { - // Collect files that still need analysis data and are parseable by WASM. 
+): Array<{ relPath: string; absPath: string; symbols: any }> { const pending: Array<{ relPath: string; absPath: string; symbols: any }> = []; for (const [relPath, symbols] of fileSymbols) { if (symbols._tree) continue; // legacy path — leave existing trees alone @@ -335,6 +338,15 @@ export async function ensureWasmTrees( if (needsFn && !needsFn(relPath, symbols)) continue; pending.push({ relPath, absPath: path.join(rootDir, relPath), symbols }); } + return pending; +} + +export async function ensureWasmTrees( + fileSymbols: Map, + rootDir: string, + needsFn?: (relPath: string, symbols: any) => boolean, +): Promise { + const pending = collectBackfillPending(fileSymbols, rootDir, needsFn); if (pending.length === 0) return; const pool = getWasmWorkerPool(); @@ -352,30 +364,37 @@ export async function ensureWasmTrees( } } -/** - * Merge pre-computed analysis data from a worker result onto existing symbols. - * Only fills gaps — never overwrites fields the caller already populated. - * Used to patch native-parsed symbols with worker-produced astNodes / dataflow / - * per-definition complexity and cfg. - */ -function mergeAnalysisData(symbols: any, worker: ExtractorOutput): void { +/** Fill gap-only scalar metadata (`_langId`, `_lineCount`) from the worker output. */ +function mergeScalarMetadata(symbols: any, worker: ExtractorOutput): void { if (!symbols._langId && worker._langId) symbols._langId = worker._langId; if (!symbols._lineCount && worker._lineCount) symbols._lineCount = worker._lineCount; +} + +/** Fill gap-only analysis arrays (`astNodes`, `dataflow`) from the worker output. 
*/ +function mergeAnalysisArrays(symbols: any, worker: ExtractorOutput): void { if (!Array.isArray(symbols.astNodes) && Array.isArray(worker.astNodes)) { symbols.astNodes = worker.astNodes; } if (!symbols.dataflow && worker.dataflow) symbols.dataflow = worker.dataflow; - if (worker.typeMap && worker.typeMap.size > 0) { - if (!symbols.typeMap || !(symbols.typeMap instanceof Map)) { - symbols.typeMap = new Map(worker.typeMap); - } else { - for (const [k, v] of worker.typeMap) { - if (!symbols.typeMap.has(k)) symbols.typeMap.set(k, v); - } - } +} + +/** Merge worker typeMap into existing symbols.typeMap with first-wins semantics. */ +function mergeTypeMap(symbols: any, worker: ExtractorOutput): void { + if (!worker.typeMap || worker.typeMap.size === 0) return; + if (!symbols.typeMap || !(symbols.typeMap instanceof Map)) { + symbols.typeMap = new Map(worker.typeMap); + return; + } + for (const [k, v] of worker.typeMap) { + if (!symbols.typeMap.has(k)) symbols.typeMap.set(k, v); } +} + +/** Patch existing definitions with worker complexity/cfg when absent. */ +function mergeDefinitionAnalysis(symbols: any, worker: ExtractorOutput): void { const existingDefs: any[] = Array.isArray(symbols.definitions) ? symbols.definitions : []; const workerDefs: any[] = Array.isArray(worker.definitions) ? worker.definitions : []; + if (existingDefs.length === 0 || workerDefs.length === 0) return; // Index existing defs by (kind, name, line) — mirrors engine.ts matching key. const byKey = new Map(); for (const d of existingDefs) byKey.set(`${d.kind}|${d.name}|${d.line}`, d); @@ -389,6 +408,19 @@ function mergeAnalysisData(symbols: any, worker: ExtractorOutput): void { } } +/** + * Merge pre-computed analysis data from a worker result onto existing symbols. + * Only fills gaps — never overwrites fields the caller already populated. + * Used to patch native-parsed symbols with worker-produced astNodes / dataflow / + * per-definition complexity and cfg. 
+ */ +function mergeAnalysisData(symbols: any, worker: ExtractorOutput): void { + mergeScalarMetadata(symbols, worker); + mergeAnalysisArrays(symbols, worker); + mergeTypeMap(symbols, worker); + mergeDefinitionAnalysis(symbols, worker); +} + /** * Check whether the required WASM grammar files exist on disk. */ @@ -603,24 +635,36 @@ function patchDefinitions(definitions: any[]): void { } } +/** + * Field renames applied to each import record to bridge older native binaries + * that emit snake_case names. Each `[camel, snake]` pair becomes: + * `if (imp[camel] === undefined) imp[camel] = imp[snake];` + * Defined as data so the loop body stays trivially linear in cognitive complexity. + */ +const IMPORT_FIELD_RENAMES: ReadonlyArray = [ + ['typeOnly', 'type_only'], + ['wildcardReexport', 'wildcard_reexport'], + ['pythonImport', 'python_import'], + ['goImport', 'go_import'], + ['rustUse', 'rust_use'], + ['javaImport', 'java_import'], + ['csharpUsing', 'csharp_using'], + ['rubyRequire', 'ruby_require'], + ['phpUse', 'php_use'], + ['cInclude', 'c_include'], + ['kotlinImport', 'kotlin_import'], + ['swiftImport', 'swift_import'], + ['scalaImport', 'scala_import'], + ['bashSource', 'bash_source'], + ['dynamicImport', 'dynamic_import'], +]; + /** Patch import fields for backward compat with older native binaries. 
*/ function patchImports(imports: any[]): void { for (const i of imports) { - if (i.typeOnly === undefined) i.typeOnly = i.type_only; - if (i.wildcardReexport === undefined) i.wildcardReexport = i.wildcard_reexport; - if (i.pythonImport === undefined) i.pythonImport = i.python_import; - if (i.goImport === undefined) i.goImport = i.go_import; - if (i.rustUse === undefined) i.rustUse = i.rust_use; - if (i.javaImport === undefined) i.javaImport = i.java_import; - if (i.csharpUsing === undefined) i.csharpUsing = i.csharp_using; - if (i.rubyRequire === undefined) i.rubyRequire = i.ruby_require; - if (i.phpUse === undefined) i.phpUse = i.php_use; - if (i.cInclude === undefined) i.cInclude = i.c_include; - if (i.kotlinImport === undefined) i.kotlinImport = i.kotlin_import; - if (i.swiftImport === undefined) i.swiftImport = i.swift_import; - if (i.scalaImport === undefined) i.scalaImport = i.scala_import; - if (i.bashSource === undefined) i.bashSource = i.bash_source; - if (i.dynamicImport === undefined) i.dynamicImport = i.dynamic_import; + for (const [camel, snake] of IMPORT_FIELD_RENAMES) { + if (i[camel] === undefined) i[camel] = i[snake]; + } } } @@ -1170,18 +1214,16 @@ export async function parseFilesWasmForBackfill( } /** - * Parse multiple files in bulk and return a Map. + * Run the native engine over `filePaths` and ingest the results into `result`. + * Returns the set of file paths the native engine successfully parsed and the + * TS/TSX files that need a typeMap backfill pass. */ -export async function parseFilesAuto( +function ingestNativeResults( + native: any, filePaths: string[], rootDir: string, - opts: ParseEngineOpts = {}, -): Promise> { - const { native } = resolveEngine(opts); - - if (!native) return parseFilesWasm(filePaths, rootDir); - - const result = new Map(); + result: Map, +): { nativeParsed: Set; needsTypeMap: { filePath: string; relPath: string }[] } { // Always extract all analysis data (dataflow + AST nodes) during native parse. 
// This eliminates the need for any downstream WASM re-parse or native standalone calls. const nativeResults = native.parseFilesFull @@ -1204,27 +1246,51 @@ export async function parseFilesAuto( needsTypeMap.push({ filePath: r.file, relPath }); } } - if (needsTypeMap.length > 0) { - await backfillTypeMapBatch(needsTypeMap, result); - } + return { nativeParsed, needsTypeMap }; +} - // Engine parity: native may silently drop files whose extensions are in - // SUPPORTED_EXTENSIONS (because a WASM grammar exists) but whose Rust - // extractor/grammar is missing or fails. WASM handles these — fall back so - // both engines process the same file set (#967). Restrict to installed WASM - // grammars so we don't warn about files that neither engine can parse. +/** + * Engine parity: native may silently drop files whose extensions are in + * SUPPORTED_EXTENSIONS (because a WASM grammar exists) but whose Rust + * extractor/grammar is missing or fails. WASM handles these — fall back so + * both engines process the same file set (#967). Restrict to installed WASM + * grammars so we don't warn about files that neither engine can parse. 
+ */ +async function backfillNativeDrops( + filePaths: string[], + nativeParsed: Set, + rootDir: string, + result: Map, +): Promise { const installedExts = getInstalledWasmExtensions(); const dropped = filePaths.filter( (f) => !nativeParsed.has(f) && installedExts.has(path.extname(f).toLowerCase()), ); - if (dropped.length > 0) { - warn(`Native engine dropped ${dropped.length} file(s); falling back to WASM for parity`); - const wasmResults = await parseFilesWasmForBackfill(dropped, rootDir); - for (const [relPath, symbols] of wasmResults) { - result.set(relPath, symbols); - } + if (dropped.length === 0) return; + warn(`Native engine dropped ${dropped.length} file(s); falling back to WASM for parity`); + const wasmResults = await parseFilesWasmForBackfill(dropped, rootDir); + for (const [relPath, symbols] of wasmResults) { + result.set(relPath, symbols); } +} +/** + * Parse multiple files in bulk and return a Map. + */ +export async function parseFilesAuto( + filePaths: string[], + rootDir: string, + opts: ParseEngineOpts = {}, +): Promise> { + const { native } = resolveEngine(opts); + if (!native) return parseFilesWasm(filePaths, rootDir); + + const result = new Map(); + const { nativeParsed, needsTypeMap } = ingestNativeResults(native, filePaths, rootDir, result); + if (needsTypeMap.length > 0) { + await backfillTypeMapBatch(needsTypeMap, result); + } + await backfillNativeDrops(filePaths, nativeParsed, rootDir, result); return result; } diff --git a/src/domain/wasm-worker-entry.ts b/src/domain/wasm-worker-entry.ts index ca02bca70..b9298e879 100644 --- a/src/domain/wasm-worker-entry.ts +++ b/src/domain/wasm-worker-entry.ts @@ -573,6 +573,90 @@ interface SetupResult { dataflowVisitor: Visitor | null; } +/** + * Build the AST-store visitor for `langId`. Returns `null` when AST is + * disabled or the language has no AST type map. db-free — passes an empty + * nodeIdMap. The main thread re-resolves parent node IDs in + * `features/ast.ts::collectFileAstRows`. 
+ */ +function buildAstVisitor( + langId: string, + defs: ExtractorOutput['definitions'], + relPath: string, + enabled: boolean, +): Visitor | null { + if (!enabled) return null; + const astTypeMap = AST_TYPE_MAPS.get(langId); + if (!astTypeMap) return null; + const stringConfig = AST_STRING_CONFIGS.get(langId); + return createAstStoreVisitor( + astTypeMap, + defs, + relPath, + new Map(), + stringConfig, + astStopRecurseKinds(langId), + ); +} + +/** + * Build the complexity visitor when enabled, the language has complexity + * rules, and at least one definition still lacks a `complexity` payload. + * Side-effect: extends `walkerOpts` with nesting-node types and a + * `getFunctionName` resolver suitable for this language. + */ +function buildComplexityVisitor( + langId: string, + defs: ExtractorOutput['definitions'], + enabled: boolean, + walkerOpts: WalkOptions, +): Visitor | null { + if (!enabled) return null; + const cRules = COMPLEXITY_RULES.get(langId); + if (!cRules || !defs.some((d) => hasFuncBody(d) && !d.complexity)) return null; + + const hRules = HALSTEAD_RULES.get(langId); + const visitor = createComplexityVisitor(cRules, hRules, { fileLevelWalk: true, langId }); + for (const t of cRules.nestingNodes) walkerOpts.nestingNodeTypes?.add(t); + const dfRules = DATAFLOW_RULES.get(langId); + walkerOpts.getFunctionName = (node: TreeSitterNode): string | null => { + const nameNode = node.childForFieldName('name'); + if (nameNode) return nameNode.text; + // dfRules shape varies per language; visitor-utils accepts any shape + if (dfRules) return getFuncName(node, dfRules as any); + return null; + }; + return visitor; +} + +/** Build the CFG visitor when enabled and at least one definition still lacks blocks. 
*/ +function buildCfgVisitor( + langId: string, + defs: ExtractorOutput['definitions'], + enabled: boolean, +): Visitor | null { + if (!enabled) return null; + const cfgRulesForLang = CFG_RULES.get(langId); + if (!cfgRulesForLang) return null; + const needsCfg = defs.some( + (d) => hasFuncBody(d) && d.cfg !== null && !Array.isArray(d.cfg?.blocks), + ); + if (!needsCfg) return null; + return createCfgVisitor(cfgRulesForLang); +} + +/** Build the dataflow visitor when enabled and `symbols.dataflow` is not yet populated. */ +function buildDataflowVisitor( + langId: string, + symbols: ExtractorOutput, + enabled: boolean, +): Visitor | null { + if (!enabled) return null; + const dfRules = DATAFLOW_RULES.get(langId); + if (!dfRules || symbols.dataflow) return null; + return createDataflowVisitor(dfRules); +} + function setupVisitorsLocal( symbols: ExtractorOutput, relPath: string, @@ -580,82 +664,158 @@ function setupVisitorsLocal( opts: WorkerParseRequest['opts'], ): SetupResult { const defs = symbols.definitions || []; - const visitors: Visitor[] = []; const walkerOpts: WalkOptions = { functionNodeTypes: new Set(), nestingNodeTypes: new Set(), getFunctionName: (_node: TreeSitterNode) => null, }; - // AST-store: db-free — pass an empty nodeIdMap. The main thread re-resolves - // parent node IDs in features/ast.ts::collectFileAstRows. 
- let astVisitor: Visitor | null = null; - if (opts.ast) { - const astTypeMap = AST_TYPE_MAPS.get(langId); - if (astTypeMap) { - const stringConfig = AST_STRING_CONFIGS.get(langId); - astVisitor = createAstStoreVisitor( - astTypeMap, - defs, - relPath, - new Map(), - stringConfig, - astStopRecurseKinds(langId), - ); - visitors.push(astVisitor); - } - } + const astVisitor = buildAstVisitor(langId, defs, relPath, !!opts.ast); + const complexityVisitor = buildComplexityVisitor(langId, defs, !!opts.complexity, walkerOpts); + const cfgVisitor = buildCfgVisitor(langId, defs, !!opts.cfg); + const dataflowVisitor = buildDataflowVisitor(langId, symbols, !!opts.dataflow); - // Complexity - let complexityVisitor: Visitor | null = null; - if (opts.complexity) { - const cRules = COMPLEXITY_RULES.get(langId); - if (cRules && defs.some((d) => hasFuncBody(d) && !d.complexity)) { - const hRules = HALSTEAD_RULES.get(langId); - complexityVisitor = createComplexityVisitor(cRules, hRules, { - fileLevelWalk: true, - langId, - }); - for (const t of cRules.nestingNodes) walkerOpts.nestingNodeTypes?.add(t); - const dfRules = DATAFLOW_RULES.get(langId); - walkerOpts.getFunctionName = (node: TreeSitterNode): string | null => { - const nameNode = node.childForFieldName('name'); - if (nameNode) return nameNode.text; - // dfRules shape varies per language; visitor-utils accepts any shape - if (dfRules) return getFuncName(node, dfRules as any); - return null; - }; - visitors.push(complexityVisitor); - } - } + const visitors: Visitor[] = []; + if (astVisitor) visitors.push(astVisitor); + if (complexityVisitor) visitors.push(complexityVisitor); + if (cfgVisitor) visitors.push(cfgVisitor); + if (dataflowVisitor) visitors.push(dataflowVisitor); - // CFG - let cfgVisitor: Visitor | null = null; - if (opts.cfg) { - const cfgRulesForLang = CFG_RULES.get(langId); - if ( - cfgRulesForLang && - defs.some((d) => hasFuncBody(d) && d.cfg !== null && !Array.isArray(d.cfg?.blocks)) - ) { - cfgVisitor = 
createCfgVisitor(cfgRulesForLang); - visitors.push(cfgVisitor); - } + return { visitors, walkerOpts, astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor }; +} + +// ── Main parse handler ────────────────────────────────────────────────────── + +/** + * Run tree-sitter parse + extractor on `code`. Returns `null` when either + * step yields no usable output. Throws (for the caller to report back to the + * pool) only on a hard tree-sitter parse error. + */ +function parseAndExtract( + parser: Parser, + entry: LanguageRegistryEntry, + filePath: string, + code: string, +): { tree: Tree; symbols: ExtractorOutput } | null { + let tree: Tree | null; + try { + tree = parser.parse(code); + } catch (e: unknown) { + // Parse error — report back but keep worker alive. + throw new Error(`parse failed: ${(e as Error).message}`); } + if (!tree) return null; - // Dataflow - let dataflowVisitor: Visitor | null = null; - if (opts.dataflow) { - const dfRules = DATAFLOW_RULES.get(langId); - if (dfRules && !symbols.dataflow) { - dataflowVisitor = createDataflowVisitor(dfRules); - visitors.push(dataflowVisitor); - } + // Extractor — on failure, skip file (ok:true, null) to match parser.ts + // behavior where extractor issues don't crash the build. + let symbols: ExtractorOutput | null; + try { + const query = _queries.get(entry.id); + // tree-sitter's Tree/Query are structurally compatible with + // TreeSitterTree/TreeSitterQuery at runtime — same cast style as + // parser.ts::wasmExtractSymbols (parser.ts:789). + symbols = entry.extractor(tree as any, filePath, query as any) ?? null; + } catch { + return null; } + if (!symbols) { + return null; + } + return { tree, symbols }; +} - return { visitors, walkerOpts, astVisitor, complexityVisitor, cfgVisitor, dataflowVisitor }; +/** + * Project the visitor `ast-store` rows into the wire-safe shape returned to + * the main thread. Strips `file` and `parentNodeId` — both are re-resolved in + * `features/ast.ts::collectFileAstRows`. 
Always returns an array (even empty) + * so `engine.ts::fileNeedsWasmTree` doesn't treat the file as un-walked and + * trigger a full ensureWasmTrees re-parse (#1036). + */ +function projectAstNodes(results: WalkResults): SerializedExtractorOutput['astNodes'] { + const astRows = (results['ast-store'] || []) as Array<{ + line: number; + kind: string; + name: string | null | undefined; + text: string | null; + receiver: string | null; + file?: string; + parentNodeId?: number | null; + }>; + return astRows.map((n) => ({ + line: n.line, + kind: n.kind, + name: n.name ?? '', + text: n.text ?? undefined, + receiver: n.receiver ?? undefined, + })); } -// ── Main parse handler ────────────────────────────────────────────────────── +/** + * Run the configured visitor walk over `tree.rootNode` and apply each + * visitor's results back onto `symbols`. Returns the serialized astNodes + * (or `undefined` when AST is disabled / no rows produced). + * + * Mirrors engine.ts:791-829. Runs BEFORE `tree.delete()` because + * storeComplexityResults / storeCfgResults read `funcNode` off live nodes. + */ +function runVisitorWalk( + tree: Tree, + symbols: ExtractorOutput, + langId: string, + setup: SetupResult, +): SerializedExtractorOutput['astNodes'] { + if (setup.visitors.length === 0) return undefined; + // rootNode shape matches TreeSitterNode at runtime — same cast as parser.ts:789. 
+ const results = walkWithVisitors(tree.rootNode as any, setup.visitors, langId, setup.walkerOpts); + const defs = symbols.definitions || []; + let serializedAstNodes: SerializedExtractorOutput['astNodes']; + if (setup.astVisitor) serializedAstNodes = projectAstNodes(results); + if (setup.complexityVisitor) storeComplexityResults(results, defs, langId); + if (setup.cfgVisitor) storeCfgResults(results, defs); + if (setup.dataflowVisitor) symbols.dataflow = results.dataflow as DataflowResult; + return serializedAstNodes; +} + +/** + * Pack the in-memory ExtractorOutput into the structured-clone-safe shape + * sent back across the worker boundary. Converts the typeMap into a tuple + * array and intentionally omits `_tree` (cannot cross the boundary). + */ +function serializeExtractorOutput( + symbols: ExtractorOutput, + langId: LanguageId, + code: string, + astNodes: SerializedExtractorOutput['astNodes'], +): SerializedExtractorOutput { + return { + definitions: symbols.definitions, + calls: symbols.calls, + imports: symbols.imports, + classes: symbols.classes, + exports: symbols.exports, + typeMap: Array.from(symbols.typeMap.entries()), + _langId: langId, + _lineCount: code.split('\n').length, + dataflow: symbols.dataflow, + astNodes, + }; +} + +/** + * Release WASM linear memory backing a tree. Best-effort — swallows errors so + * the worker keeps serving requests. Deferring this would let trees accumulate + * in the worker's WASM heap and defeat the point of isolating parse calls. + */ +function disposeTree(tree: Tree | null): void { + if (!tree) return; + const deletable = tree as unknown as { delete?: () => void }; + if (typeof deletable.delete !== 'function') return; + try { + deletable.delete(); + } catch { + // best-effort cleanup — swallow; worker continues. 
+ } +} async function handleParse(msg: WorkerParseRequest): Promise { const ext = path.extname(msg.filePath).toLowerCase(); @@ -666,100 +826,20 @@ async function handleParse(msg: WorkerParseRequest): Promise 0) { - // rootNode shape matches TreeSitterNode at runtime — same cast as parser.ts:789. - const results = walkWithVisitors(tree.rootNode as any, visitors, entry.id, walkerOpts); - - const defs = symbols.definitions || []; - if (astVisitor) { - const astRows = (results['ast-store'] || []) as Array<{ - line: number; - kind: string; - name: string | null | undefined; - text: string | null; - receiver: string | null; - file?: string; - parentNodeId?: number | null; - }>; - // Always set an array (even empty) — leaving astNodes undefined makes - // engine.ts::fileNeedsWasmTree treat the file as un-walked and trigger - // a full ensureWasmTrees re-parse of every WASM-parseable file (#1036). - // Strip `file` and `parentNodeId` — main thread re-resolves both in - // features/ast.ts::collectFileAstRows. - serializedAstNodes = astRows.map((n) => ({ - line: n.line, - kind: n.kind, - name: n.name ?? '', - text: n.text ?? undefined, - receiver: n.receiver ?? undefined, - })); - } - - if (complexityVisitor) storeComplexityResults(results, defs, entry.id); - if (cfgVisitor) storeCfgResults(results, defs); - if (dataflowVisitor) symbols.dataflow = results.dataflow as DataflowResult; - } - - // Serialize — convert Map to tuple array for the wire. - const serialized: SerializedExtractorOutput = { - definitions: symbols.definitions, - calls: symbols.calls, - imports: symbols.imports, - classes: symbols.classes, - exports: symbols.exports, - typeMap: Array.from(symbols.typeMap.entries()), - _langId: entry.id as LanguageId, - _lineCount: msg.code.split('\n').length, - dataflow: symbols.dataflow, - astNodes: serializedAstNodes, - }; - // _tree is deliberately not serialized — it cannot cross the worker boundary. 
- return serialized; + const serializedAstNodes = runVisitorWalk(tree, symbols, entry.id, setup); + return serializeExtractorOutput(symbols, entry.id as LanguageId, msg.code, serializedAstNodes); } finally { - // ALWAYS release WASM memory before responding. Deferring this would let - // trees accumulate in the worker's WASM heap across requests and defeat - // the point of isolating parse calls. - if (tree && typeof (tree as unknown as { delete?: () => void }).delete === 'function') { - try { - (tree as unknown as { delete: () => void }).delete(); - } catch { - // best-effort cleanup — swallow; worker continues. - } - } + // ALWAYS release WASM memory before responding (see disposeTree note). + disposeTree(tree); } } From 6819cd6a3c3491d724d570ce2ff5fa1ddbfbf12b Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:31:52 -0600 Subject: [PATCH 16/27] refactor(analysis): decompose module-map and reduce complexity in fn-impact and dependencies Split high-cognitive-complexity functions in the analysis domain into focused helpers. Worst functions per gauntlet (cog/cyc/maxNesting/halstead) are now below thresholds. 
module-map.ts (statsData cog=31 -> below threshold): - Extract buildStatsFromNative and buildStatsFromJs branches - Share false-positive query and quality-score helpers between paths - aggregateRolesFromNative pulls duplicated role-aggregation code out fn-impact.ts (bfsTransitiveCallers cog=37 -> below threshold, impactAnalysisData cog=27 -> below threshold): - Extract recordCaller, processFrontierNode, seedInterfaceImplementors - Extract bfsImportDependents and groupDependentsByLevel dependencies.ts (bfsShortestPath cog=29, bfsFilePath cog=30, buildTransitiveCallers cog=24 -> all below threshold): - Extract buildNextCallerFrontier from buildTransitiveCallers - Extract buildNeighborStmt + visitNeighbor; state collected in struct - Extract visitFileNeighbor + reconstructFilePath docs check acknowledged - internal helper extraction, no user-facing changes --- src/domain/analysis/dependencies.ts | 251 ++++++++++++++-------- src/domain/analysis/fn-impact.ts | 170 ++++++++++----- src/domain/analysis/module-map.ts | 315 +++++++++++++++------------- 3 files changed, 461 insertions(+), 275 deletions(-) diff --git a/src/domain/analysis/dependencies.ts b/src/domain/analysis/dependencies.ts index 1a619c861..4a759e6e1 100644 --- a/src/domain/analysis/dependencies.ts +++ b/src/domain/analysis/dependencies.ts @@ -58,9 +58,32 @@ export function fileDepsData( * * Uses Repository.findCallers() so it works with both native and WASM engines. */ +type CallerRow = { id: number; name: string; kind: string; file: string; line: number }; + +/** Compute the next BFS frontier from a batched upstream-callers lookup. 
*/ +function buildNextCallerFrontier( + unvisited: CallerRow[], + batchCallers: Map, + visited: Set, + noTests: boolean, +): CallerRow[] { + const nextFrontier: CallerRow[] = []; + const nextFrontierIds = new Set(); + for (const f of unvisited) { + const upstream = batchCallers.get(f.id) || []; + for (const u of upstream) { + if (noTests && isTestFile(u.file)) continue; + if (visited.has(u.id) || nextFrontierIds.has(u.id)) continue; + nextFrontierIds.add(u.id); + nextFrontier.push(u); + } + } + return nextFrontier; +} + function buildTransitiveCallers( repo: InstanceType, - callers: Array<{ id: number; name: string; kind: string; file: string; line: number }>, + callers: CallerRow[], nodeId: number, depth: number, noTests: boolean, @@ -81,18 +104,8 @@ function buildTransitiveCallers( if (unvisited.length === 0) break; const batchCallers = repo.findCallersBatch(unvisited.map((f) => f.id)); - const nextFrontier: typeof frontier = []; - const nextFrontierIds = new Set(); - for (const f of unvisited) { - const upstream = batchCallers.get(f.id) || []; - for (const u of upstream) { - if (noTests && isTestFile(u.file)) continue; - if (!visited.has(u.id) && !nextFrontierIds.has(u.id)) { - nextFrontierIds.add(u.id); - nextFrontier.push(u); - } - } - } + const nextFrontier = buildNextCallerFrontier(unvisited, batchCallers, visited, noTests); + if (nextFrontier.length > 0) { transitiveCallers[d] = nextFrontier.map((n) => ({ name: n.name, @@ -258,22 +271,30 @@ function resolveEndpoints( }; } -/** - * BFS from sourceId toward targetId. - * Returns { found, parent, alternateCount, foundDepth }. - * `parent` maps nodeId -> { parentId, edgeKind }. 
- */ -function bfsShortestPath( +type NeighborRow = { + id: number; + name: string; + kind: string; + file: string; + line: number; + edge_kind: string; +}; + +type BfsShortestState = { + visited: Set; + parent: Map; + found: boolean; + foundDepth: number; + alternateCount: number; +}; + +/** Build the SQL statement that yields neighbors of a node id in the requested direction. */ +function buildNeighborStmt( db: BetterSqlite3Database, - sourceId: number, - targetId: number, edgeKinds: string[], reverse: boolean, - maxDepth: number, - noTests: boolean, -) { +): ReturnType { const kindPlaceholders = edgeKinds.map(() => '?').join(', '); - // Forward: source_id -> target_id (A calls... calls B) // Reverse: target_id -> source_id (B is called by... called by A) const neighborQuery = reverse @@ -283,50 +304,78 @@ function bfsShortestPath( : `SELECT n.id, n.name, n.kind, n.file, n.line, e.kind AS edge_kind FROM edges e JOIN nodes n ON e.target_id = n.id WHERE e.source_id = ? AND e.kind IN (${kindPlaceholders})`; - const neighborStmt = db.prepare(neighborQuery); + return db.prepare(neighborQuery); +} + +/** Process a single neighbor row during BFS; returns true once the target has been reached. */ +function visitNeighbor( + n: NeighborRow, + currentId: number, + depth: number, + targetId: number, + state: BfsShortestState, + nextQueue: number[], + noTests: boolean, +): void { + if (noTests && isTestFile(n.file)) return; + if (n.id === targetId) { + if (!state.found) { + state.found = true; + state.foundDepth = depth; + state.parent.set(n.id, { parentId: currentId, edgeKind: n.edge_kind }); + } + state.alternateCount++; + return; + } + if (state.visited.has(n.id)) return; + state.visited.add(n.id); + state.parent.set(n.id, { parentId: currentId, edgeKind: n.edge_kind }); + nextQueue.push(n.id); +} - const visited = new Set([sourceId]); - const parent = new Map(); +/** + * BFS from sourceId toward targetId. + * Returns { found, parent, alternateCount, foundDepth }. 
+ * `parent` maps nodeId -> { parentId, edgeKind }. + */ +function bfsShortestPath( + db: BetterSqlite3Database, + sourceId: number, + targetId: number, + edgeKinds: string[], + reverse: boolean, + maxDepth: number, + noTests: boolean, +) { + const neighborStmt = buildNeighborStmt(db, edgeKinds, reverse); + const state: BfsShortestState = { + visited: new Set([sourceId]), + parent: new Map(), + found: false, + foundDepth: -1, + alternateCount: 0, + }; let queue = [sourceId]; - let found = false; - let alternateCount = 0; - let foundDepth = -1; for (let depth = 1; depth <= maxDepth; depth++) { const nextQueue: number[] = []; for (const currentId of queue) { - const neighbors = neighborStmt.all(currentId, ...edgeKinds) as Array<{ - id: number; - name: string; - kind: string; - file: string; - line: number; - edge_kind: string; - }>; + const neighbors = neighborStmt.all(currentId, ...edgeKinds) as NeighborRow[]; for (const n of neighbors) { - if (noTests && isTestFile(n.file)) continue; - if (n.id === targetId) { - if (!found) { - found = true; - foundDepth = depth; - parent.set(n.id, { parentId: currentId, edgeKind: n.edge_kind }); - } - alternateCount++; - continue; - } - if (!visited.has(n.id)) { - visited.add(n.id); - parent.set(n.id, { parentId: currentId, edgeKind: n.edge_kind }); - nextQueue.push(n.id); - } + visitNeighbor(n, currentId, depth, targetId, state, nextQueue, noTests); } } - if (found) break; + if (state.found) break; queue = nextQueue; if (queue.length === 0) break; } - return { found, parent, alternateCount, foundDepth }; + return { + found: state.found, + parent: state.parent, + alternateCount: state.alternateCount, + foundDepth: state.foundDepth, + }; } /** @@ -474,6 +523,53 @@ export function pathData( // ── File-level shortest path ──────────────────────────────────────────── +type FileBfsState = { + visited: Set; + parentMap: Map; + found: boolean; + alternateCount: number; +}; + +/** Process a neighbor file during file-level BFS; updates 
state in place. */ +function visitFileNeighbor( + neighborFile: string, + currentFile: string, + targetFile: string, + state: FileBfsState, + nextQueue: string[], + noTests: boolean, +): void { + if (noTests && isTestFile(neighborFile)) return; + if (neighborFile === targetFile) { + if (!state.found) { + state.found = true; + state.parentMap.set(neighborFile, currentFile); + } + state.alternateCount++; + return; + } + if (state.visited.has(neighborFile)) return; + state.visited.add(neighborFile); + state.parentMap.set(neighborFile, currentFile); + nextQueue.push(neighborFile); +} + +/** Reconstruct file path from target back to source using parent links. */ +function reconstructFilePath( + parentMap: Map, + sourceFile: string, + targetFile: string, +): string[] { + const filePath: string[] = [targetFile]; + let cur = targetFile; + while (cur !== sourceFile) { + cur = parentMap.get(cur)!; + filePath.push(cur); + } + filePath.reverse(); + return filePath; +} + /** BFS over file adjacency graph to find shortest path. 
*/ function bfsFilePath( neighborStmt: ReturnType, @@ -483,11 +579,13 @@ function bfsFilePath( maxDepth: number, noTests: boolean, ): { found: boolean; path: string[]; alternateCount: number } { - const visited = new Set([sourceFile]); - const parentMap = new Map(); + const state: FileBfsState = { + visited: new Set([sourceFile]), + parentMap: new Map(), + found: false, + alternateCount: 0, + }; let queue = [sourceFile]; - let found = false; - let alternateCount = 0; for (let depth = 1; depth <= maxDepth; depth++) { const nextQueue: string[] = []; @@ -496,38 +594,21 @@ function bfsFilePath( neighbor_file: string; }>; for (const n of neighbors) { - if (noTests && isTestFile(n.neighbor_file)) continue; - if (n.neighbor_file === targetFile) { - if (!found) { - found = true; - parentMap.set(n.neighbor_file, currentFile); - } - alternateCount++; - continue; - } - if (!visited.has(n.neighbor_file)) { - visited.add(n.neighbor_file); - parentMap.set(n.neighbor_file, currentFile); - nextQueue.push(n.neighbor_file); - } + visitFileNeighbor(n.neighbor_file, currentFile, targetFile, state, nextQueue, noTests); } } - if (found) break; + if (state.found) break; queue = nextQueue; if (queue.length === 0) break; } - if (!found) return { found: false, path: [], alternateCount: 0 }; + if (!state.found) return { found: false, path: [], alternateCount: 0 }; - // Reconstruct path - const filePath: string[] = [targetFile]; - let cur = targetFile; - while (cur !== sourceFile) { - cur = parentMap.get(cur)!; - filePath.push(cur); - } - filePath.reverse(); - return { found: true, path: filePath, alternateCount: Math.max(0, alternateCount - 1) }; + return { + found: true, + path: reconstructFilePath(state.parentMap, sourceFile, targetFile), + alternateCount: Math.max(0, state.alternateCount - 1), + }; } /** diff --git a/src/domain/analysis/fn-impact.ts b/src/domain/analysis/fn-impact.ts index f33ab26ff..e795c2092 100644 --- a/src/domain/analysis/fn-impact.ts +++ 
b/src/domain/analysis/fn-impact.ts @@ -83,6 +83,63 @@ function expandImplementors( } } +/** Record a caller node at `depth` unless it was already visited or (with `noTests`) lives in a test file; adds it to the frontier and levels. */ +function recordCaller( + caller: RelatedNodeRow, + parentId: number, + depth: number, + visited: Set, + nextFrontier: number[], + levels: BfsLevels, + noTests: boolean, + onVisit?: BfsOnVisit, +): void { + if (visited.has(caller.id) || (noTests && isTestFile(caller.file))) return; + visited.add(caller.id); + nextFrontier.push(caller.id); + if (!levels[depth]) levels[depth] = []; + levels[depth]!.push(toSymbolRef(caller)); + if (onVisit) onVisit(caller, parentId, depth); +} + +/** Process all callers of one frontier node, recording new nodes and expanding implementors. */ +function processFrontierNode( + repo: InstanceType, + fid: number, + depth: number, + visited: Set, + nextFrontier: number[], + levels: BfsLevels, + noTests: boolean, + resolveImplementors: boolean, + onVisit?: BfsOnVisit, +): void { + const callers = repo.findDistinctCallers(fid) as RelatedNodeRow[]; + for (const c of callers) { + recordCaller(c, fid, depth, visited, nextFrontier, levels, noTests, onVisit); + if (resolveImplementors && INTERFACE_LIKE_KINDS.has(c.kind)) { + expandImplementors(repo, c.id, depth + 1, visited, nextFrontier, levels, noTests, onVisit); + } + } +} + +/** Seed BFS with implementors of the start node when it is an interface/trait. 
*/ +function seedInterfaceImplementors( + repo: InstanceType, + startId: number, + visited: Set, + levels: BfsLevels, + noTests: boolean, + onVisit?: BfsOnVisit, +): number[] { + const implNextFrontier: number[] = []; + const startNode = repo.findNodeById(startId) as NodeRow | undefined; + if (startNode && INTERFACE_LIKE_KINDS.has(startNode.kind)) { + expandImplementors(repo, startId, 1, visited, implNextFrontier, levels, noTests, onVisit); + } + return implNextFrontier; +} + export function bfsTransitiveCallers( dbOrRepo: BetterSqlite3Database | InstanceType, startId: number, @@ -105,13 +162,9 @@ export function bfsTransitiveCallers( let frontier = [startId]; // Seed: if start node is an interface/trait, include its implementors at depth 1 - const implNextFrontier: number[] = []; - if (resolveImplementors) { - const startNode = repo.findNodeById(startId) as NodeRow | undefined; - if (startNode && INTERFACE_LIKE_KINDS.has(startNode.kind)) { - expandImplementors(repo, startId, 1, visited, implNextFrontier, levels, noTests, onVisit); - } - } + const implNextFrontier = resolveImplementors + ? 
seedInterfaceImplementors(repo, startId, visited, levels, noTests, onVisit) + : []; for (let d = 1; d <= maxDepth; d++) { if (d === 1 && implNextFrontier.length > 0) { @@ -119,19 +172,17 @@ export function bfsTransitiveCallers( } const nextFrontier: number[] = []; for (const fid of frontier) { - const callers = repo.findDistinctCallers(fid) as RelatedNodeRow[]; - for (const c of callers) { - if (!visited.has(c.id) && (!noTests || !isTestFile(c.file))) { - visited.add(c.id); - nextFrontier.push(c.id); - if (!levels[d]) levels[d] = []; - levels[d]!.push(toSymbolRef(c)); - if (onVisit) onVisit(c, fid, d); - } - if (resolveImplementors && INTERFACE_LIKE_KINDS.has(c.kind)) { - expandImplementors(repo, c.id, d + 1, visited, nextFrontier, levels, noTests, onVisit); - } - } + processFrontierNode( + repo, + fid, + d, + visited, + nextFrontier, + levels, + noTests, + resolveImplementors, + onVisit, + ); } frontier = nextFrontier; if (frontier.length === 0) break; @@ -140,6 +191,53 @@ export function bfsTransitiveCallers( return { totalDependents: visited.size - 1, levels }; } +/** BFS over import dependents, returning visited node IDs and depth-per-id map. */ +function bfsImportDependents( + repo: InstanceType, + seedNodes: NodeRow[], + noTests: boolean, +): { visited: Set; levels: Map } { + const visited = new Set(); + const queue: number[] = []; + const levels = new Map(); + + for (const fn of seedNodes) { + visited.add(fn.id); + queue.push(fn.id); + levels.set(fn.id, 0); + } + + while (queue.length > 0) { + const current = queue.shift()!; + const level = levels.get(current)!; + const dependents = repo.findImportDependents(current) as RelatedNodeRow[]; + for (const dep of dependents) { + if (visited.has(dep.id)) continue; + if (noTests && isTestFile(dep.file)) continue; + visited.add(dep.id); + queue.push(dep.id); + levels.set(dep.id, level + 1); + } + } + + return { visited, levels }; +} + +/** Group visited dependents by depth (excluding seed depth 0). 
*/ +function groupDependentsByLevel( + repo: InstanceType, + levels: Map, +): Record> { + const byLevel: Record> = {}; + for (const [id, level] of levels) { + if (level === 0) continue; + if (!byLevel[level]) byLevel[level] = []; + const node = repo.findNodeById(id) as NodeRow | undefined; + if (node) byLevel[level].push({ file: node.file }); + } + return byLevel; +} + export function impactAnalysisData( file: string, customDbPath: string, @@ -152,36 +250,8 @@ export function impactAnalysisData( return { file, sources: [], levels: {}, totalDependents: 0 }; } - const visited = new Set(); - const queue: number[] = []; - const levels = new Map(); - - for (const fn of fileNodes) { - visited.add(fn.id); - queue.push(fn.id); - levels.set(fn.id, 0); - } - - while (queue.length > 0) { - const current = queue.shift()!; - const level = levels.get(current)!; - const dependents = repo.findImportDependents(current) as RelatedNodeRow[]; - for (const dep of dependents) { - if (!visited.has(dep.id) && (!noTests || !isTestFile(dep.file))) { - visited.add(dep.id); - queue.push(dep.id); - levels.set(dep.id, level + 1); - } - } - } - - const byLevel: Record> = {}; - for (const [id, level] of levels) { - if (level === 0) continue; - if (!byLevel[level]) byLevel[level] = []; - const node = repo.findNodeById(id) as NodeRow | undefined; - if (node) byLevel[level].push({ file: node.file }); - } + const { visited, levels } = bfsImportDependents(repo, fileNodes, noTests); + const byLevel = groupDependentsByLevel(repo, levels); return { file, diff --git a/src/domain/analysis/module-map.ts b/src/domain/analysis/module-map.ts index 887c644a9..71383c213 100644 --- a/src/domain/analysis/module-map.ts +++ b/src/domain/analysis/module-map.ts @@ -4,7 +4,7 @@ import { loadConfig } from '../../infrastructure/config.js'; import { debug } from '../../infrastructure/logger.js'; import { isTestFile } from '../../infrastructure/test-filter.js'; import { DEAD_ROLE_PREFIX } from '../../shared/kinds.js'; 
-import type { BetterSqlite3Database } from '../../types.js'; +import type { BetterSqlite3Database, NativeDatabase } from '../../types.js'; import { findCycles } from '../graph/cycles.js'; import { LANGUAGE_REGISTRY } from '../parser.js'; @@ -198,30 +198,13 @@ function computeQualityMetrics( ).c; const callConfidence = totalCallEdges > 0 ? highConfCallEdges / totalCallEdges : 0; - const fpRows = db - .prepare(` - SELECT n.name, n.file, n.line, COUNT(e.source_id) as caller_count - FROM nodes n - LEFT JOIN edges e ON n.id = e.target_id AND e.kind = 'calls' - WHERE n.kind IN ('function', 'method') - GROUP BY n.id - HAVING caller_count > ? - ORDER BY caller_count DESC - `) - .all(fpThreshold) as Array<{ name: string; file: string; line: number; caller_count: number }>; - const falsePositiveWarnings = fpRows - .filter((r) => - FALSE_POSITIVE_NAMES.has(r.name.includes('.') ? r.name.split('.').pop()! : r.name), - ) - .map((r) => ({ name: r.name, file: r.file, line: r.line, callerCount: r.caller_count })); + const falsePositiveWarnings = buildFalsePositiveWarnings(queryFalsePositiveRows(db, fpThreshold)); let fpEdgeCount = 0; for (const fp of falsePositiveWarnings) fpEdgeCount += fp.callerCount; const falsePositiveRatio = totalCallEdges > 0 ? fpEdgeCount / totalCallEdges : 0; - const score = Math.round( - callerCoverage * 40 + callConfidence * 40 + (1 - falsePositiveRatio) * 20, - ); + const score = computeQualityScore(callerCoverage, callConfidence, falsePositiveRatio); return { score, @@ -347,6 +330,169 @@ export function moduleMapData(customDbPath: string, limit = 20, opts: { noTests? } } +type FalsePositiveRow = { name: string; file: string; line: number; caller_count: number }; + +/** SQL query for false-positive caller counts above a threshold (shared by native and JS paths). 
*/ +function queryFalsePositiveRows( + db: BetterSqlite3Database, + fpThreshold: number, +): FalsePositiveRow[] { + return db + .prepare(` + SELECT n.name, n.file, n.line, COUNT(e.source_id) as caller_count + FROM nodes n + LEFT JOIN edges e ON n.id = e.target_id AND e.kind = 'calls' + WHERE n.kind IN ('function', 'method') + GROUP BY n.id + HAVING caller_count > ? + ORDER BY caller_count DESC + `) + .all(fpThreshold) as FalsePositiveRow[]; +} + +/** Filter false-positive rows by the configured name set and shape them for the report. */ +function buildFalsePositiveWarnings(rows: FalsePositiveRow[]) { + return rows + .filter((r) => + FALSE_POSITIVE_NAMES.has(r.name.includes('.') ? r.name.split('.').pop()! : r.name), + ) + .map((r) => ({ name: r.name, file: r.file, line: r.line, callerCount: r.caller_count })); +} + +/** Compute the composite quality score (0-100) from coverage, confidence, and FP ratio. */ +function computeQualityScore( + callerCoverage: number, + callConfidence: number, + falsePositiveRatio: number, +): number { + return Math.round(callerCoverage * 40 + callConfidence * 40 + (1 - falsePositiveRatio) * 20); +} + +/** Aggregate role counts and derive the `dead` total. */ +function aggregateRolesFromNative(roleCounts: Array<{ role: string; count: number }>) { + const roles: Record & { dead?: number } = {}; + let deadTotal = 0; + for (const r of roleCounts) { + roles[r.role] = r.count; + if (r.role.startsWith(DEAD_ROLE_PREFIX)) deadTotal += r.count; + } + if (deadTotal > 0) roles.dead = deadTotal; + return roles; +} + +type NativeGraphStatsFn = NonNullable; +type NativeGraphStats = ReturnType; + +/** Build the native fast-path stats result by combining native aggregations with JS-only sections. 
*/ +function buildStatsFromNative( + db: BetterSqlite3Database, + nativeStats: NativeGraphStats, + config: any, + jsSections: { + files: ReturnType; + fileCycles: unknown[]; + fnCycles: unknown[]; + }, +) { + const s = nativeStats; + const nodesByKind: Record = {}; + for (const k of s.nodesByKind) nodesByKind[k.kind] = k.count; + const edgesByKind: Record = {}; + for (const k of s.edgesByKind) edgesByKind[k.kind] = k.count; + const roles = aggregateRolesFromNative(s.roleCounts); + + const callerCoverage = + s.quality.callableTotal > 0 ? s.quality.callableWithCallers / s.quality.callableTotal : 0; + const callConfidence = + s.quality.callEdges > 0 ? s.quality.highConfCallEdges / s.quality.callEdges : 0; + + // False-positive analysis still uses JS (needs FALSE_POSITIVE_NAMES set) + const fpThreshold = config.analysis?.falsePositiveCallers ?? FALSE_POSITIVE_CALLER_THRESHOLD; + const falsePositiveWarnings = buildFalsePositiveWarnings(queryFalsePositiveRows(db, fpThreshold)); + let fpEdgeCount = 0; + for (const fp of falsePositiveWarnings) fpEdgeCount += fp.callerCount; + const falsePositiveRatio = s.quality.callEdges > 0 ? fpEdgeCount / s.quality.callEdges : 0; + const score = computeQualityScore(callerCoverage, callConfidence, falsePositiveRatio); + + return { + nodes: { total: s.totalNodes, byKind: nodesByKind }, + edges: { total: s.totalEdges, byKind: edgesByKind }, + files: jsSections.files, + cycles: { fileLevel: jsSections.fileCycles.length, functionLevel: jsSections.fnCycles.length }, + hotspots: s.hotspots.map((h) => ({ file: h.file, fanIn: h.fanIn, fanOut: h.fanOut })), + embeddings: s.embeddings + ? 
{ + count: s.embeddings.count, + model: s.embeddings.model, + dim: s.embeddings.dim, + builtAt: s.embeddings.builtAt, + } + : null, + quality: { + score, + callerCoverage: { + ratio: callerCoverage, + covered: s.quality.callableWithCallers, + total: s.quality.callableTotal, + }, + callConfidence: { + ratio: callConfidence, + highConf: s.quality.highConfCallEdges, + total: s.quality.callEdges, + }, + falsePositiveWarnings, + }, + roles, + complexity: s.complexity + ? { + analyzed: s.complexity.analyzed, + avgCognitive: s.complexity.avgCognitive, + avgCyclomatic: s.complexity.avgCyclomatic, + maxCognitive: s.complexity.maxCognitive, + maxCyclomatic: s.complexity.maxCyclomatic, + avgMI: s.complexity.avgMi, + minMI: s.complexity.minMi, + } + : null, + }; +} + +/** Build the JS-fallback stats result using SQL aggregations from the helpers above. */ +function buildStatsFromJs( + db: BetterSqlite3Database, + noTests: boolean, + config: any, + jsSections: { + files: ReturnType; + fileCycles: unknown[]; + fnCycles: unknown[]; + }, +) { + const testFilter = testFilterSQL('n.file', noTests); + + const { total: totalNodes, byKind: nodesByKind } = countNodesByKind(db, noTests); + const { total: totalEdges, byKind: edgesByKind } = countEdgesByKind(db, noTests); + + const hotspots = findHotspots(db, noTests, 5); + const embeddings = getEmbeddingsInfo(db); + const fpThreshold = config.analysis?.falsePositiveCallers ?? 
FALSE_POSITIVE_CALLER_THRESHOLD; + const quality = computeQualityMetrics(db, testFilter, fpThreshold); + const roles = countRoles(db, noTests); + const complexity = getComplexitySummary(db, testFilter); + + return { + nodes: { total: totalNodes, byKind: nodesByKind }, + edges: { total: totalEdges, byKind: edgesByKind }, + files: jsSections.files, + cycles: { fileLevel: jsSections.fileCycles.length, functionLevel: jsSections.fnCycles.length }, + hotspots, + embeddings, + quality, + roles, + complexity, + }; +} + export function statsData(customDbPath: string, opts: { noTests?: boolean; config?: any } = {}) { const { db, nativeDb, close } = openReadonlyWithNative(customDbPath); try { @@ -354,127 +500,16 @@ export function statsData(customDbPath: string, opts: { noTests?: boolean; confi const config = opts.config || loadConfig(); // These always need JS (non-SQL logic) - const files = countFilesByLanguage(db, noTests); - const fileCycles = findCycles(db, { fileLevel: true, noTests }); - const fnCycles = findCycles(db, { fileLevel: false, noTests }); - - // ── Native fast path: batch all SQL aggregations in one napi call ── - if (nativeDb?.getGraphStats) { - const s = nativeDb.getGraphStats(noTests); - const nodesByKind: Record = {}; - for (const k of s.nodesByKind) nodesByKind[k.kind] = k.count; - const edgesByKind: Record = {}; - for (const k of s.edgesByKind) edgesByKind[k.kind] = k.count; - const roles: Record & { dead?: number } = {}; - let deadTotal = 0; - for (const r of s.roleCounts) { - roles[r.role] = r.count; - if (r.role.startsWith(DEAD_ROLE_PREFIX)) deadTotal += r.count; - } - if (deadTotal > 0) roles.dead = deadTotal; - - const callerCoverage = - s.quality.callableTotal > 0 ? s.quality.callableWithCallers / s.quality.callableTotal : 0; - const callConfidence = - s.quality.callEdges > 0 ? 
s.quality.highConfCallEdges / s.quality.callEdges : 0; - - // False-positive analysis still uses JS (needs FALSE_POSITIVE_NAMES set) - const fpThreshold = config.analysis?.falsePositiveCallers ?? FALSE_POSITIVE_CALLER_THRESHOLD; - const fpRows = db - .prepare(` - SELECT n.name, n.file, n.line, COUNT(e.source_id) as caller_count - FROM nodes n - LEFT JOIN edges e ON n.id = e.target_id AND e.kind = 'calls' - WHERE n.kind IN ('function', 'method') - GROUP BY n.id - HAVING caller_count > ? - ORDER BY caller_count DESC - `) - .all(fpThreshold) as Array<{ - name: string; - file: string; - line: number; - caller_count: number; - }>; - const falsePositiveWarnings = fpRows - .filter((r) => - FALSE_POSITIVE_NAMES.has(r.name.includes('.') ? r.name.split('.').pop()! : r.name), - ) - .map((r) => ({ name: r.name, file: r.file, line: r.line, callerCount: r.caller_count })); - let fpEdgeCount = 0; - for (const fp of falsePositiveWarnings) fpEdgeCount += fp.callerCount; - const falsePositiveRatio = s.quality.callEdges > 0 ? fpEdgeCount / s.quality.callEdges : 0; - const score = Math.round( - callerCoverage * 40 + callConfidence * 40 + (1 - falsePositiveRatio) * 20, - ); - - return { - nodes: { total: s.totalNodes, byKind: nodesByKind }, - edges: { total: s.totalEdges, byKind: edgesByKind }, - files, - cycles: { fileLevel: fileCycles.length, functionLevel: fnCycles.length }, - hotspots: s.hotspots.map((h) => ({ file: h.file, fanIn: h.fanIn, fanOut: h.fanOut })), - embeddings: s.embeddings - ? { - count: s.embeddings.count, - model: s.embeddings.model, - dim: s.embeddings.dim, - builtAt: s.embeddings.builtAt, - } - : null, - quality: { - score, - callerCoverage: { - ratio: callerCoverage, - covered: s.quality.callableWithCallers, - total: s.quality.callableTotal, - }, - callConfidence: { - ratio: callConfidence, - highConf: s.quality.highConfCallEdges, - total: s.quality.callEdges, - }, - falsePositiveWarnings, - }, - roles, - complexity: s.complexity - ? 
{ - analyzed: s.complexity.analyzed, - avgCognitive: s.complexity.avgCognitive, - avgCyclomatic: s.complexity.avgCyclomatic, - maxCognitive: s.complexity.maxCognitive, - maxCyclomatic: s.complexity.maxCyclomatic, - avgMI: s.complexity.avgMi, - minMI: s.complexity.minMi, - } - : null, - }; - } - - // ── JS fallback ─────────────────────────────────────────────────── - const testFilter = testFilterSQL('n.file', noTests); - - const { total: totalNodes, byKind: nodesByKind } = countNodesByKind(db, noTests); - const { total: totalEdges, byKind: edgesByKind } = countEdgesByKind(db, noTests); - - const hotspots = findHotspots(db, noTests, 5); - const embeddings = getEmbeddingsInfo(db); - const fpThreshold = config.analysis?.falsePositiveCallers ?? FALSE_POSITIVE_CALLER_THRESHOLD; - const quality = computeQualityMetrics(db, testFilter, fpThreshold); - const roles = countRoles(db, noTests); - const complexity = getComplexitySummary(db, testFilter); - - return { - nodes: { total: totalNodes, byKind: nodesByKind }, - edges: { total: totalEdges, byKind: edgesByKind }, - files, - cycles: { fileLevel: fileCycles.length, functionLevel: fnCycles.length }, - hotspots, - embeddings, - quality, - roles, - complexity, + const jsSections = { + files: countFilesByLanguage(db, noTests), + fileCycles: findCycles(db, { fileLevel: true, noTests }), + fnCycles: findCycles(db, { fileLevel: false, noTests }), }; + + const nativeStats = nativeDb?.getGraphStats?.(noTests); + return nativeStats + ? 
buildStatsFromNative(db, nativeStats, config, jsSections) + : buildStatsFromJs(db, noTests, config, jsSections); } finally { close(); } From 4f344044829c684ffa40e4be6b214920c5ea5f36 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:36:57 -0600 Subject: [PATCH 17/27] refactor(search): decompose generator and reduce complexity in semantic and hybrid search --- src/domain/search/generator.ts | 206 +++++++++++++++++---------- src/domain/search/search/hybrid.ts | 95 ++++++------ src/domain/search/search/semantic.ts | 170 +++++++++++++--------- 3 files changed, 290 insertions(+), 181 deletions(-) diff --git a/src/domain/search/generator.ts b/src/domain/search/generator.ts index ef0ddf353..02e43f1ca 100644 --- a/src/domain/search/generator.ts +++ b/src/domain/search/generator.ts @@ -8,6 +8,19 @@ import { embed, getModelConfig } from './models.js'; import { buildSourceText } from './strategies/source.js'; import { buildStructuredText } from './strategies/structured.js'; +type EmbeddingNode = NodeRow & { id: number }; +type EmbeddingStrategy = 'structured' | 'source'; + +interface PreparedEmbeddings { + texts: string[]; + nodeIds: number[]; + nodeNames: string[]; + previews: string[]; + overflowCount: number; + filesRead: number; + filesSkipped: number; +} + /** * Rough token estimate (~4 chars per token for code/English). * Conservative — avoids adding a tokenizer dependency. @@ -47,47 +60,22 @@ function initEmbeddingsSchema(db: BetterSqlite3Database): void { `); } -export interface BuildEmbeddingsOptions { - strategy?: 'structured' | 'source'; -} - /** - * Build embeddings for all functions/methods/classes in the graph. + * Resolve the repo root for embedding. Prefer the root recorded at build time; + * fall back to the directory containing `.codegraph/` only when the DB lives at the conventional + * `.codegraph/graph.db` layout — otherwise trust the caller's rootDir. 
*/ -export async function buildEmbeddings( - rootDir: string, - modelKey: string, - customDbPath?: string, - options: BuildEmbeddingsOptions = {}, -): Promise { - const strategy = options.strategy || 'structured'; - const dbPath = customDbPath || findDbPath(undefined); - - if (!fs.existsSync(dbPath)) { - throw new DbError( - `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, - { file: dbPath }, - ); - } - - const db = openDb(dbPath) as BetterSqlite3Database; - initEmbeddingsSchema(db); - - // Prefer the repo root recorded at build time — embed may be invoked from a - // different cwd (e.g. `codegraph embed --db /abs/path/graph.db`) and the - // positional rootDir will be wrong in that case. For legacy DBs without - // root_dir metadata, fall back to `` only when the DB lives at - // the conventional `/.codegraph/graph.db` layout — otherwise trust - // the caller-provided rootDir (which may be an explicit positional arg). - // `path.dirname(...)` is always non-empty (`'.'` at minimum), so the - // conventional-layout check is required to keep the rootDir path reachable. +function resolveRoot(db: BetterSqlite3Database, dbPath: string, rootDir: string): string { const metaRoot = getBuildMeta(db, 'root_dir'); const resolvedDbPath = path.resolve(dbPath); const dbDirName = path.basename(path.dirname(resolvedDbPath)); const dbParent = dbDirName === '.codegraph' ? path.dirname(path.dirname(resolvedDbPath)) : undefined; - const resolvedRoot = metaRoot || dbParent || rootDir; + return metaRoot || dbParent || rootDir; +} +/** Reset embedding tables and load eligible symbols grouped by file. 
*/ +function loadNodesByFile(db: BetterSqlite3Database): Map { db.exec('DELETE FROM embeddings'); db.exec('DELETE FROM embedding_meta'); db.exec('DELETE FROM fts_index'); @@ -96,22 +84,52 @@ export async function buildEmbeddings( .prepare( `SELECT * FROM nodes WHERE kind IN ('function', 'method', 'class') ORDER BY file, line`, ) - .all() as Array; + .all() as EmbeddingNode[]; - console.log(`Building embeddings for ${nodes.length} symbols (strategy: ${strategy})...`); - - const byFile = new Map(); + const byFile = new Map(); for (const node of nodes) { if (!byFile.has(node.file)) byFile.set(node.file, []); byFile.get(node.file)?.push(node); } + return byFile; +} + +/** Build embedding text for a single node, truncating if it would overflow. */ +function buildNodeText( + node: EmbeddingNode, + file: string, + lines: string[], + db: BetterSqlite3Database, + strategy: EmbeddingStrategy, + contextWindow: number, +): { text: string; overflowed: boolean } { + let text = + strategy === 'structured' + ? buildStructuredText(node, file, lines, db) + : buildSourceText(node, file, lines); + const tokens = estimateTokens(text); + if (tokens > contextWindow) { + text = text.slice(0, contextWindow * 4); + return { text, overflowed: true }; + } + return { text, overflowed: false }; +} +/** + * Walk files in the graph, read source, and produce parallel arrays of + * texts / nodeIds / nodeNames / previews ready for embedding. 
+ */ +function prepareEmbeddingTexts( + byFile: Map, + db: BetterSqlite3Database, + resolvedRoot: string, + strategy: EmbeddingStrategy, + contextWindow: number, +): PreparedEmbeddings { const texts: string[] = []; const nodeIds: number[] = []; const nodeNames: string[] = []; const previews: string[] = []; - const config = getModelConfig(modelKey); - const contextWindow = config.contextWindow; let overflowCount = 0; let filesRead = 0; let filesSkipped = 0; @@ -129,19 +147,8 @@ export async function buildEmbeddings( } for (const node of fileNodes) { - let text = - strategy === 'structured' - ? buildStructuredText(node, file, lines, db) - : buildSourceText(node, file, lines); - - // Detect and handle context window overflow - const tokens = estimateTokens(text); - if (tokens > contextWindow) { - overflowCount++; - const maxChars = contextWindow * 4; - text = text.slice(0, maxChars); - } - + const { text, overflowed } = buildNodeText(node, file, lines, db, strategy, contextWindow); + if (overflowed) overflowCount++; texts.push(text); nodeIds.push(node.id); nodeNames.push(node.name); @@ -149,28 +156,19 @@ export async function buildEmbeddings( } } - if (overflowCount > 0) { - warn( - `${overflowCount} symbol(s) exceeded model context window (${contextWindow} tokens) and were truncated`, - ); - } - - // If there were symbols to embed but every file failed to read, the DB was - // almost certainly built from a different location than the current cwd. - // Surface this clearly instead of emitting a silent "Stored 0 embeddings". 
- if (byFile.size > 0 && filesRead === 0) { - closeDb(db); - throw new DbError( - `embed: could not read any of the ${filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` + - `Tried resolving against: ${resolvedRoot}\n` + - 'Pass a positional argument pointing at the original repo root, or re-run "codegraph build" from that directory.', - { file: dbPath }, - ); - } - - console.log(`Embedding ${texts.length} symbols...`); - const { vectors, dim } = await embed(texts, modelKey); + return { texts, nodeIds, nodeNames, previews, overflowCount, filesRead, filesSkipped }; +} +/** Persist vectors, FTS rows, and embedding metadata in a single transaction. */ +function persistEmbeddings( + db: BetterSqlite3Database, + prepared: PreparedEmbeddings, + vectors: Float32Array[], + dim: number, + modelName: string, + strategy: EmbeddingStrategy, +): void { + const { nodeIds, nodeNames, previews, texts, overflowCount } = prepared; const insert = db.prepare( 'INSERT OR REPLACE INTO embeddings (node_id, vector, text_preview, full_text) VALUES (?, ?, ?, ?)', ); @@ -182,7 +180,7 @@ export async function buildEmbeddings( insert.run(nodeIds[i], Buffer.from(vec.buffer), previews[i], texts[i]); insertFts.run(nodeIds[i], nodeNames[i], texts[i]); } - insertMeta.run('model', config.name); + insertMeta.run('model', modelName); insertMeta.run('dim', String(dim)); insertMeta.run('count', String(vectors.length)); insertMeta.run('fts_count', String(vectors.length)); @@ -193,6 +191,66 @@ export async function buildEmbeddings( } }); insertAll(); +} + +export interface BuildEmbeddingsOptions { + strategy?: EmbeddingStrategy; +} + +/** + * Build embeddings for all functions/methods/classes in the graph. 
+ */ +export async function buildEmbeddings( + rootDir: string, + modelKey: string, + customDbPath?: string, + options: BuildEmbeddingsOptions = {}, +): Promise { + const strategy = options.strategy || 'structured'; + const dbPath = customDbPath || findDbPath(undefined); + + if (!fs.existsSync(dbPath)) { + throw new DbError( + `No codegraph database found at ${dbPath}.\nRun "codegraph build" first to analyze your codebase.`, + { file: dbPath }, + ); + } + + const db = openDb(dbPath) as BetterSqlite3Database; + initEmbeddingsSchema(db); + + const resolvedRoot = resolveRoot(db, dbPath, rootDir); + const byFile = loadNodesByFile(db); + + const nodeCount = [...byFile.values()].reduce((acc, list) => acc + list.length, 0); + console.log(`Building embeddings for ${nodeCount} symbols (strategy: ${strategy})...`); + + const config = getModelConfig(modelKey); + const prepared = prepareEmbeddingTexts(byFile, db, resolvedRoot, strategy, config.contextWindow); + + if (prepared.overflowCount > 0) { + warn( + `${prepared.overflowCount} symbol(s) exceeded model context window (${config.contextWindow} tokens) and were truncated`, + ); + } + + // If there were symbols to embed but every file failed to read, the DB was + // almost certainly built from a different location than the current cwd. + // Surface this clearly instead of emitting a silent "Stored 0 embeddings". 
+ if (byFile.size > 0 && prepared.filesRead === 0) { + closeDb(db); + throw new DbError( + `embed: could not read any of the ${prepared.filesSkipped} source files recorded in the graph — the DB may have been built from a different location than the current working directory.\n` + + `Tried resolving against: ${resolvedRoot}\n` + + 'Pass a positional argument pointing at the original repo root, or re-run "codegraph build" from that directory.', + { file: dbPath }, + ); + } + + console.log(`Embedding ${prepared.texts.length} symbols...`); + const { vectors, dim } = await embed(prepared.texts, modelKey); + + persistEmbeddings(db, prepared, vectors as Float32Array[], dim, config.name, strategy); console.log( `\nStored ${vectors.length} embeddings (${dim}d, ${config.name}, strategy: ${strategy}) in graph.db`, diff --git a/src/domain/search/search/hybrid.ts b/src/domain/search/search/hybrid.ts index ef7c2fc4c..bf6406c6c 100644 --- a/src/domain/search/search/hybrid.ts +++ b/src/domain/search/search/hybrid.ts @@ -105,61 +105,72 @@ async function collectRankedLists( return rankedLists; } +/** Initialise a fusion entry seeded from the first ranked item we see for a key. */ +function createFusionEntry(item: RankedItem): FusionEntry { + return { + name: item.name, + kind: item.kind, + file: item.file, + line: item.line, + endLine: (item.endLine as number | null) ?? null, + role: (item.role as string | null) ?? null, + fileHash: (item.fileHash as string | null) ?? null, + rrfScore: 0, + bm25Score: null, + bm25Rank: null, + similarity: null, + semanticRank: null, + }; +} + +/** Merge a single ranked item into its fusion entry: update RRF and best per-source rank. */ +function mergeRankedItem(entry: FusionEntry, item: RankedItem, k: number): void { + entry.rrfScore += 1 / (k + item.rank); + if (item.source === 'bm25') { + if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { + entry.bm25Score = item.bm25Score ?? 
null; + entry.bm25Rank = item.rank; + } + } else if (entry.semanticRank === null || item.rank < entry.semanticRank) { + entry.similarity = item.similarity ?? null; + entry.semanticRank = item.rank; + } +} + +/** Flatten a fusion entry into the public-facing hybrid result shape. */ +function toHybridResult(e: FusionEntry): HybridResult { + return { + name: e.name, + kind: e.kind, + file: e.file, + line: e.line, + endLine: e.endLine, + role: e.role, + fileHash: e.fileHash, + rrf: e.rrfScore, + bm25Score: e.bm25Score, + bm25Rank: e.bm25Rank, + similarity: e.similarity, + semanticRank: e.semanticRank, + }; +} + /** Reciprocal Rank Fusion: merge ranked lists into a single scored result set. */ function fuseResults(rankedLists: RankedItem[][], k: number, limit: number): HybridResult[] { const fusionMap = new Map(); - for (const list of rankedLists) { for (const item of list) { if (!fusionMap.has(item.key)) { - fusionMap.set(item.key, { - name: item.name, - kind: item.kind, - file: item.file, - line: item.line, - endLine: (item.endLine as number | null) ?? null, - role: (item.role as string | null) ?? null, - fileHash: (item.fileHash as string | null) ?? null, - rrfScore: 0, - bm25Score: null, - bm25Rank: null, - similarity: null, - semanticRank: null, - }); - } - const entry = fusionMap.get(item.key)!; - entry.rrfScore += 1 / (k + item.rank); - if (item.source === 'bm25') { - if (entry.bm25Rank === null || item.rank < entry.bm25Rank) { - entry.bm25Score = (item as RankedItem & { bm25Score?: number }).bm25Score ?? null; - entry.bm25Rank = item.rank; - } - } else { - if (entry.semanticRank === null || item.rank < entry.semanticRank) { - entry.similarity = (item as RankedItem & { similarity?: number }).similarity ?? 
null; - entry.semanticRank = item.rank; - } + fusionMap.set(item.key, createFusionEntry(item)); } + mergeRankedItem(fusionMap.get(item.key)!, item, k); } } return [...fusionMap.values()] .sort((a, b) => b.rrfScore - a.rrfScore) .slice(0, limit) - .map((e) => ({ - name: e.name, - kind: e.kind, - file: e.file, - line: e.line, - endLine: e.endLine, - role: e.role, - fileHash: e.fileHash, - rrf: e.rrfScore, - bm25Score: e.bm25Score, - bm25Rank: e.bm25Rank, - similarity: e.similarity, - semanticRank: e.semanticRank, - })); + .map(toHybridResult); } export async function hybridSearchData( diff --git a/src/domain/search/search/semantic.ts b/src/domain/search/search/semantic.ts index 40e2f8870..2c0b82616 100644 --- a/src/domain/search/search/semantic.ts +++ b/src/domain/search/search/semantic.ts @@ -4,7 +4,7 @@ import type { BetterSqlite3Database, CodegraphConfig } from '../../../types.js'; import { normalizeSymbol } from '../../queries.js'; import { embed } from '../models.js'; import { cosineSim } from '../stores/sqlite-blob.js'; -import { prepareSearch } from './prepare.js'; +import { type PreparedSearch, prepareSearch } from './prepare.js'; export interface SemanticSearchOpts { config?: CodegraphConfig; @@ -30,6 +30,25 @@ export interface SearchDataResult { results: SemanticResult[]; } +type StoredRow = PreparedSearch['rows'][number]; + +/** Reconstitute a stored embedding row's vector blob into a Float32Array. */ +function rowVector(row: StoredRow): Float32Array { + return new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer); +} + +/** Warn when stored embeddings and the query model use different dimensions. 
*/ +function checkDimensionMismatch(storedDim: number | null, dim: number): boolean { + if (storedDim && dim !== storedDim) { + console.log( + `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, + ); + console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); + return true; + } + return false; +} + export async function searchData( query: string, customDbPath: string | undefined, @@ -50,20 +69,12 @@ export async function searchData( dim, } = await embed([query], modelKey ?? undefined); - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } + if (checkDimensionMismatch(storedDim, dim)) return null; const hc = new Map(); const results: SemanticResult[] = []; for (const row of rows) { - const vec = new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer); - const sim = cosineSim(queryVec!, vec); - + const sim = cosineSim(queryVec!, rowVector(row)); if (sim >= minScore) { results.push({ ...normalizeSymbol(row, db as BetterSqlite3Database, hc), @@ -91,6 +102,82 @@ export interface MultiSearchResult { }>; } +interface RankedHit { + rowIndex: number; + similarity: number; + rank: number; +} + +interface FusionEntry { + rrfScore: number; + queryScores: Array<{ query: string; similarity: number; rank: number }>; +} + +/** + * Emit a warning for any query pair whose embeddings are nearly identical, + * since RRF would over-weight matches shared between them. 
+ */ +function warnOnSimilarQueries( + queries: string[], + queryVecs: Float32Array[], + threshold: number, +): void { + for (let i = 0; i < queryVecs.length; i++) { + for (let j = i + 1; j < queryVecs.length; j++) { + const sim = cosineSim(queryVecs[i]!, queryVecs[j]!); + if (sim >= threshold) { + warn( + `Queries "${queries[i]}" and "${queries[j]}" are very similar ` + + `(${(sim * 100).toFixed(0)}% cosine similarity). ` + + `This may bias RRF results toward their shared matches. ` + + `Consider using more distinct queries.`, + ); + } + } + } +} + +/** Rank stored rows for a single query, keeping only those above minScore. */ +function rankRowsForQuery( + queryVec: Float32Array, + rowVecs: Float32Array[], + minScore: number, +): RankedHit[] { + const scored: Array<{ rowIndex: number; similarity: number }> = []; + for (let ri = 0; ri < rowVecs.length; ri++) { + const sim = cosineSim(queryVec, rowVecs[ri]!); + if (sim >= minScore) { + scored.push({ rowIndex: ri, similarity: sim }); + } + } + scored.sort((a, b) => b.similarity - a.similarity); + return scored.map((item, rank) => ({ ...item, rank: rank + 1 })); +} + +/** Reciprocal Rank Fusion across each query's ranked hits. */ +function fuseRankedHits( + queries: string[], + perQueryRanked: RankedHit[][], + k: number, +): Map { + const fusionMap = new Map(); + for (let qi = 0; qi < queries.length; qi++) { + for (const item of perQueryRanked[qi]!) { + if (!fusionMap.has(item.rowIndex)) { + fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] }); + } + const entry = fusionMap.get(item.rowIndex)!; + entry.rrfScore += 1 / (k + item.rank); + entry.queryScores.push({ + query: queries[qi]!, + similarity: item.similarity, + rank: item.rank, + }); + } + } + return fusionMap; +} + export async function multiSearchData( queries: string[], customDbPath: string | undefined, @@ -101,6 +188,7 @@ export async function multiSearchData( const limit = opts.limit ?? searchCfg.topK ?? 15; const minScore = opts.minScore ?? 
searchCfg.defaultMinScore ?? 0.2; const k = opts.rrfK ?? searchCfg.rrfK ?? 60; + const similarityWarnThreshold = searchCfg.similarityWarnThreshold ?? 0.85; const prepared = prepareSearch(customDbPath, opts); if (!prepared) return null; @@ -109,63 +197,15 @@ export async function multiSearchData( try { const { vectors: queryVecs, dim } = await embed(queries, modelKey ?? undefined); - const SIMILARITY_WARN_THRESHOLD = searchCfg.similarityWarnThreshold ?? 0.85; - for (let i = 0; i < queryVecs.length; i++) { - for (let j = i + 1; j < queryVecs.length; j++) { - const sim = cosineSim(queryVecs[i]!, queryVecs[j]!); - if (sim >= SIMILARITY_WARN_THRESHOLD) { - warn( - `Queries "${queries[i]}" and "${queries[j]}" are very similar ` + - `(${(sim * 100).toFixed(0)}% cosine similarity). ` + - `This may bias RRF results toward their shared matches. ` + - `Consider using more distinct queries.`, - ); - } - } - } + warnOnSimilarQueries(queries, queryVecs as Float32Array[], similarityWarnThreshold); - if (storedDim && dim !== storedDim) { - console.log( - `Warning: query model dimension (${dim}) doesn't match stored embeddings (${storedDim}).`, - ); - console.log(` Re-run \`codegraph embed\` with the same model, or use --model to match.`); - return null; - } + if (checkDimensionMismatch(storedDim, dim)) return null; - const rowVecs = rows.map( - (row) => new Float32Array(new Uint8Array(row.vector as unknown as ArrayBuffer).buffer), + const rowVecs = rows.map(rowVector); + const perQueryRanked = queries.map((_q, qi) => + rankRowsForQuery(queryVecs[qi]!, rowVecs, minScore), ); - - const perQueryRanked = queries.map((_query, qi) => { - const scored: Array<{ rowIndex: number; similarity: number }> = []; - for (let ri = 0; ri < rows.length; ri++) { - const sim = cosineSim(queryVecs[qi]!, rowVecs[ri]!); - if (sim >= minScore) { - scored.push({ rowIndex: ri, similarity: sim }); - } - } - scored.sort((a, b) => b.similarity - a.similarity); - return scored.map((item, rank) => ({ ...item, 
rank: rank + 1 })); - }); - - const fusionMap = new Map< - number, - { rrfScore: number; queryScores: Array<{ query: string; similarity: number; rank: number }> } - >(); - for (let qi = 0; qi < queries.length; qi++) { - for (const item of perQueryRanked[qi]!) { - if (!fusionMap.has(item.rowIndex)) { - fusionMap.set(item.rowIndex, { rrfScore: 0, queryScores: [] }); - } - const entry = fusionMap.get(item.rowIndex)!; - entry.rrfScore += 1 / (k + item.rank); - entry.queryScores.push({ - query: queries[qi]!, - similarity: item.similarity, - rank: item.rank, - }); - } - } + const fusionMap = fuseRankedHits(queries, perQueryRanked, k); const hc = new Map(); const results: MultiSearchResult['results'] = []; From 0a12e8c8b730f06a53fad9318285efa30104c1a8 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:46:19 -0600 Subject: [PATCH 18/27] refactor(features): decompose complexity, structure, graph-enrichment, structure-query, and owners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internal refactor — no public API or behaviour change, so docs check acknowledged. - complexity.ts: split collectNativeBulkRows (cog=70) into classify/build/collect-file helpers; extract classifyHalsteadToken + summarizeHalsteadCounts from computeHalsteadMetrics. - structure.ts: merge classifyNodeRolesFull/Incremental DRY via shared buildActiveFilesSet + buildClassifierInput helpers. - graph-enrichment.ts: decompose prepareFileLevelData (cog=32, cyc=26) into loadFileLevelEdges, computeFileFanCounts, detectFileCommunities, buildFileVisNode, selectFileSeedNodes. - structure-query.ts: split hotspotsData (cog=34, sloc=102) using a strategy pattern (HOTSPOT_ORDER_BY) and mapNative/JsHotspotRow helpers. - owners.ts: split ownersData (sloc=158, bugs=1.55) into loadFilteredFiles, buildOwnerIndex, loadSymbolsForFiles, computeOwnerBoundaries, buildOwnersSummary. 
--- src/features/complexity.ts | 213 ++++++++++++-------- src/features/graph-enrichment.ts | 175 +++++++++------- src/features/owners.ts | 330 +++++++++++++++++-------------- src/features/structure-query.ts | 173 ++++++++-------- src/features/structure.ts | 112 +++++------ 5 files changed, 573 insertions(+), 430 deletions(-) diff --git a/src/features/complexity.ts b/src/features/complexity.ts index 509d03478..5627b6e36 100644 --- a/src/features/complexity.ts +++ b/src/features/complexity.ts @@ -31,44 +31,36 @@ const COMPLEXITY_EXTENSIONS = buildExtensionSet(COMPLEXITY_RULES); // ─── Halstead Metrics Computation ───────────────────────────────────────── -export function computeHalsteadMetrics( - functionNode: TreeSitterNode, - language: string, -): HalsteadDerivedMetrics | null { - const rules = HALSTEAD_RULES.get(language) as HalsteadRules | undefined; - if (!rules) return null; - - const operators = new Map(); // type -> count - const operands = new Map(); // text -> count - - function walk(node: TreeSitterNode | null): void { - if (!node) return; - - // Skip type annotation subtrees - if (rules?.skipTypes.has(node.type)) return; +/** Classify a tree-sitter node as a Halstead operator or operand, + * updating the running counts. Pure helper extracted from computeHalsteadMetrics + * to keep the dispatcher thin. 
*/ +function classifyHalsteadToken( + node: TreeSitterNode, + rules: HalsteadRules, + operators: Map, + operands: Map, +): void { + // Compound operators (non-leaf): count the node type as an operator + if (rules.compoundOperators.has(node.type)) { + operators.set(node.type, (operators.get(node.type) || 0) + 1); + } - // Compound operators (non-leaf): count the node type as an operator - if (rules?.compoundOperators.has(node.type)) { + // Leaf nodes: classify as operator or operand + if (node.childCount === 0) { + if (rules.operatorLeafTypes.has(node.type)) { operators.set(node.type, (operators.get(node.type) || 0) + 1); - } - - // Leaf nodes: classify as operator or operand - if (node.childCount === 0) { - if (rules?.operatorLeafTypes.has(node.type)) { - operators.set(node.type, (operators.get(node.type) || 0) + 1); - } else if (rules?.operandLeafTypes.has(node.type)) { - const text = node.text; - operands.set(text, (operands.get(text) || 0) + 1); - } - } - - for (let i = 0; i < node.childCount; i++) { - walk(node.child(i)); + } else if (rules.operandLeafTypes.has(node.type)) { + const text = node.text; + operands.set(text, (operands.get(text) || 0) + 1); } } +} - walk(functionNode); - +/** Build a HalsteadDerivedMetrics summary from the raw operator/operand counts. */ +function summarizeHalsteadCounts( + operators: Map, + operands: Map, +): HalsteadDerivedMetrics { const n1 = operators.size; // distinct operators const n2 = operands.size; // distinct operands let bigN1 = 0; // total operators @@ -79,7 +71,6 @@ export function computeHalsteadMetrics( const vocabulary = n1 + n2; const length = bigN1 + bigN2; - // Guard against zero const volume = vocabulary > 0 ? length * Math.log2(vocabulary) : 0; const difficulty = n2 > 0 ? 
(n1 / 2) * (bigN2 / n2) : 0; const effort = difficulty * volume; @@ -99,6 +90,31 @@ export function computeHalsteadMetrics( }; } +export function computeHalsteadMetrics( + functionNode: TreeSitterNode, + language: string, +): HalsteadDerivedMetrics | null { + const rules = HALSTEAD_RULES.get(language) as HalsteadRules | undefined; + if (!rules) return null; + + const operators = new Map(); // type -> count + const operands = new Map(); // text -> count + + function walk(node: TreeSitterNode | null): void { + if (!node) return; + // Skip type annotation subtrees + if (rules?.skipTypes.has(node.type)) return; + classifyHalsteadToken(node, rules as HalsteadRules, operators, operands); + for (let i = 0; i < node.childCount; i++) { + walk(node.child(i)); + } + } + + walk(functionNode); + + return summarizeHalsteadCounts(operators, operands); +} + // ─── LOC Metrics Computation ────────────────────────────────────────────── // Delegated to ast-analysis/metrics.js; re-exported for backward compatibility. export const computeLOCMetrics = _computeLOCMetrics; @@ -535,6 +551,89 @@ function upsertAstComplexity( return 1; } +/** Decision outcome for a single definition during native bulk-row collection. + * - 'skip': the definition is legitimately ignorable (non-function, missing line, + * interface stub, unsupported language). + * - 'fallback': a genuine function body is missing precomputed complexity — + * the whole native fast path must abort to JS. + * - 'emit': the definition has complexity data; the row was appended. */ +type NativeRowDecision = 'skip' | 'fallback' | 'emit'; + +/** Classify a definition relative to the native bulk path. Returns + * 'skip' to ignore it, 'fallback' to bail out, or 'emit' if the row was added. 
*/ +function classifyDefinitionForNativeBulk( + def: FileSymbols['definitions'][0], + langSupported: boolean, +): 'skip' | 'fallback' | 'has-data' { + if (def.kind !== 'function' && def.kind !== 'method') return 'skip'; + if (!def.line) return 'skip'; + if (!def.complexity) { + // Interface/type property signatures and single-line stubs are extracted + // as methods but the native engine correctly never assigns complexity. + // Mirror the leniency in initWasmParsersIfNeeded to avoid bailing out + // of the native bulk-insert path for every TypeScript codebase (#846). + if (def.name.includes('.') || !def.endLine || def.endLine <= def.line) return 'skip'; + // Languages without complexity rules will never have data — skip them + // rather than bailing out of the entire native bulk path. + if (!langSupported) return 'skip'; + return 'fallback'; // genuine function body missing complexity — needs JS fallback + } + return 'has-data'; +} + +/** Build a single native-bulk row from a definition with complexity data. */ +function buildNativeBulkRow( + nodeId: number, + def: FileSymbols['definitions'][0], +): Record { + const ch = def.complexity?.halstead; + const cl = def.complexity?.loc; + return { + nodeId, + cognitive: def.complexity?.cognitive ?? 0, + cyclomatic: def.complexity?.cyclomatic ?? 0, + maxNesting: def.complexity?.maxNesting ?? 0, + loc: cl ? cl.loc : 0, + sloc: cl ? cl.sloc : 0, + commentLines: cl ? cl.commentLines : 0, + halsteadN1: ch ? ch.n1 : 0, + halsteadN2: ch ? ch.n2 : 0, + halsteadBigN1: ch ? ch.bigN1 : 0, + halsteadBigN2: ch ? ch.bigN2 : 0, + halsteadVocabulary: ch ? ch.vocabulary : 0, + halsteadLength: ch ? ch.length : 0, + halsteadVolume: ch ? ch.volume : 0, + halsteadDifficulty: ch ? ch.difficulty : 0, + halsteadEffort: ch ? ch.effort : 0, + halsteadBugs: ch ? ch.bugs : 0, + maintainabilityIndex: def.complexity?.maintainabilityIndex ?? 0, + }; +} + +/** Try to collect a single file's definitions into native-bulk rows. 
+ * Returns 'fallback' if any definition forces a JS fallback. */ +function collectFileBulkRows( + db: BetterSqlite3Database, + relPath: string, + symbols: FileSymbols, + rows: Array>, +): NativeRowDecision { + const ext = path.extname(relPath).toLowerCase(); + const langId = symbols._langId || ''; + const langSupported = COMPLEXITY_EXTENSIONS.has(ext) || COMPLEXITY_RULES.has(langId); + + for (const def of symbols.definitions) { + const decision = classifyDefinitionForNativeBulk(def, langSupported); + if (decision === 'skip') continue; + if (decision === 'fallback') return 'fallback'; + + const nodeId = getFunctionNodeId(db, def.name, relPath, def.line); + if (!nodeId) continue; + rows.push(buildNativeBulkRow(nodeId, def)); + } + return 'emit'; +} + /** Collect native bulk-insert rows from precomputed complexity data. * Returns the rows array, or null if any definition is missing complexity * (signalling that JS fallback is needed). */ @@ -543,53 +642,9 @@ function collectNativeBulkRows( fileSymbols: Map, ): Array> | null { const rows: Array> = []; - for (const [relPath, symbols] of fileSymbols) { - const ext = path.extname(relPath).toLowerCase(); - const langId = symbols._langId || ''; - const langSupported = COMPLEXITY_EXTENSIONS.has(ext) || COMPLEXITY_RULES.has(langId); - - for (const def of symbols.definitions) { - if (def.kind !== 'function' && def.kind !== 'method') continue; - if (!def.line) continue; - // Interface/type property signatures and single-line stubs are extracted - // as methods but the native engine correctly never assigns complexity. - // Mirror the leniency in initWasmParsersIfNeeded to avoid bailing out - // of the native bulk-insert path for every TypeScript codebase (#846). - if (!def.complexity) { - if (def.name.includes('.') || !def.endLine || def.endLine <= def.line) continue; - // Languages without complexity rules will never have data — skip them - // rather than bailing out of the entire native bulk path. 
- if (!langSupported) continue; - return null; // genuine function body missing complexity — needs JS fallback - } - const nodeId = getFunctionNodeId(db, def.name, relPath, def.line); - if (!nodeId) continue; - const ch = def.complexity.halstead; - const cl = def.complexity.loc; - rows.push({ - nodeId, - cognitive: def.complexity.cognitive ?? 0, - cyclomatic: def.complexity.cyclomatic ?? 0, - maxNesting: def.complexity.maxNesting ?? 0, - loc: cl ? cl.loc : 0, - sloc: cl ? cl.sloc : 0, - commentLines: cl ? cl.commentLines : 0, - halsteadN1: ch ? ch.n1 : 0, - halsteadN2: ch ? ch.n2 : 0, - halsteadBigN1: ch ? ch.bigN1 : 0, - halsteadBigN2: ch ? ch.bigN2 : 0, - halsteadVocabulary: ch ? ch.vocabulary : 0, - halsteadLength: ch ? ch.length : 0, - halsteadVolume: ch ? ch.volume : 0, - halsteadDifficulty: ch ? ch.difficulty : 0, - halsteadEffort: ch ? ch.effort : 0, - halsteadBugs: ch ? ch.bugs : 0, - maintainabilityIndex: def.complexity.maintainabilityIndex ?? 0, - }); - } + if (collectFileBulkRows(db, relPath, symbols, rows) === 'fallback') return null; } - return rows; } diff --git a/src/features/graph-enrichment.ts b/src/features/graph-enrichment.ts index 564cc5004..886d09bf8 100644 --- a/src/features/graph-enrichment.ts +++ b/src/features/graph-enrichment.ts @@ -336,13 +336,13 @@ interface FileLevelEdge { target: string; } -function prepareFileLevelData( +/** Load file-level import/call edges from the DB and optionally exclude test files. 
*/ +function loadFileLevelEdges( db: BetterSqlite3Database, noTests: boolean, minConf: number, - cfg: PlotConfig, -): GraphData { - let edges = db +): FileLevelEdge[] { + const edges = db .prepare( ` SELECT DISTINCT n1.file AS source, n2.file AS target @@ -354,73 +354,118 @@ function prepareFileLevelData( `, ) .all(minConf); - if (noTests) edges = edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target)); - - const files = new Set(); - for (const { source, target } of edges) { - files.add(source); - files.add(target); - } - - const fileIds = new Map(); - let idx = 0; - for (const f of files) fileIds.set(f, idx++); + return noTests ? edges.filter((e) => !isTestFile(e.source) && !isTestFile(e.target)) : edges; +} - // Fan-in/fan-out +/** Compute fan-in and fan-out for each file from a list of edges. */ +function computeFileFanCounts(edges: FileLevelEdge[]): { + fanInCount: Map; + fanOutCount: Map; +} { const fanInCount = new Map(); const fanOutCount = new Map(); for (const { source, target } of edges) { fanOutCount.set(source, (fanOutCount.get(source) || 0) + 1); fanInCount.set(target, (fanInCount.get(target) || 0) + 1); } + return { fanInCount, fanOutCount }; +} - // Communities via graph subsystem +/** Run Louvain community detection on the file-level graph. Returns empty map on failure. 
*/ +function detectFileCommunities(files: Set, edges: FileLevelEdge[]): Map { const communityMap = new Map(); - if (files.size > 0) { - try { - const fileGraph = new CodeGraph(); - for (const f of files) fileGraph.addNode(f); - for (const { source, target } of edges) { - if (source !== target && !fileGraph.hasEdge(source, target)) - fileGraph.addEdge(source, target); - } - const { assignments } = louvainCommunities(fileGraph); - for (const [file, cid] of assignments) communityMap.set(file, cid); - } catch { - // ignore + if (files.size === 0) return communityMap; + try { + const fileGraph = new CodeGraph(); + for (const f of files) fileGraph.addNode(f); + for (const { source, target } of edges) { + if (source !== target && !fileGraph.hasEdge(source, target)) + fileGraph.addEdge(source, target); } + const { assignments } = louvainCommunities(fileGraph); + for (const [file, cid] of assignments) communityMap.set(file, cid); + } catch { + // louvain can fail on disconnected graphs } + return communityMap; +} - const visNodes: VisNode[] = [...files].map((f) => { - const id = fileIds.get(f)!; - const community = communityMap.get(f) ?? null; - const fanIn = fanInCount.get(f) || 0; - const fanOut = fanOutCount.get(f) || 0; - const directory = path.dirname(f); - const color: string = - cfg.colorBy === 'community' && community !== null - ? COMMUNITY_COLORS[community % COMMUNITY_COLORS.length] || '#ccc' - : cfg.nodeColors?.file || (DEFAULT_NODE_COLORS as Record).file || '#ccc'; - - return { - id, - label: path.basename(f), - title: f, - color, - kind: 'file', - role: '', - file: f, - line: 0, - community, - cognitive: null, - cyclomatic: null, - maintainabilityIndex: null, - fanIn, - fanOut, - directory, - risk: [], - }; - }); +/** Build a VisNode for a single file, applying color based on cfg.colorBy. 
*/ +function buildFileVisNode( + file: string, + id: number, + community: number | null, + fanIn: number, + fanOut: number, + cfg: PlotConfig, +): VisNode { + const color: string = + cfg.colorBy === 'community' && community !== null + ? COMMUNITY_COLORS[community % COMMUNITY_COLORS.length] || '#ccc' + : cfg.nodeColors?.file || (DEFAULT_NODE_COLORS as Record).file || '#ccc'; + + return { + id, + label: path.basename(file), + title: file, + color, + kind: 'file', + role: '', + file, + line: 0, + community, + cognitive: null, + cyclomatic: null, + maintainabilityIndex: null, + fanIn, + fanOut, + directory: path.dirname(file), + risk: [], + }; +} + +/** Select seed node IDs for the file-level graph based on configured strategy. */ +function selectFileSeedNodes(visNodes: VisNode[], cfg: PlotConfig): (number | string)[] { + if (cfg.seedStrategy === 'top-fanin') { + const sorted = [...visNodes].sort((a, b) => b.fanIn - a.fanIn); + return sorted.slice(0, cfg.seedCount || 30).map((n) => n.id); + } + // Both 'entry' and the default fallback include every node — file-level graphs + // don't track per-file roles, so 'entry' has no meaningful filter. + return visNodes.map((n) => n.id); +} + +function prepareFileLevelData( + db: BetterSqlite3Database, + noTests: boolean, + minConf: number, + cfg: PlotConfig, +): GraphData { + const edges = loadFileLevelEdges(db, noTests, minConf); + + const files = new Set(); + for (const { source, target } of edges) { + files.add(source); + files.add(target); + } + + const fileIds = new Map(); + let idx = 0; + for (const f of files) fileIds.set(f, idx++); + + const { fanInCount, fanOutCount } = computeFileFanCounts(edges); + const communityMap = detectFileCommunities(files, edges); + + const visNodes: VisNode[] = [...files].map((f) => + buildFileVisNode( + f, + fileIds.get(f)!, + communityMap.get(f) ?? 
null, + fanInCount.get(f) || 0, + fanOutCount.get(f) || 0, + cfg, + ), + ); const visEdges: VisEdge[] = edges.map(({ source, target }, i) => ({ id: `e${i}`, @@ -428,17 +473,7 @@ function prepareFileLevelData( to: fileIds.get(target)!, })); - let seedNodeIds: (number | string)[]; - if (cfg.seedStrategy === 'top-fanin') { - const sorted = [...visNodes].sort((a, b) => b.fanIn - a.fanIn); - seedNodeIds = sorted.slice(0, cfg.seedCount || 30).map((n) => n.id); - } else if (cfg.seedStrategy === 'entry') { - seedNodeIds = visNodes.map((n) => n.id); - } else { - seedNodeIds = visNodes.map((n) => n.id); - } - - return { nodes: visNodes, edges: visEdges, seedNodeIds }; + return { nodes: visNodes, edges: visEdges, seedNodeIds: selectFileSeedNodes(visNodes, cfg) }; } // ─── HTML Generation (thin wrapper) ────────────────────────────────── diff --git a/src/features/owners.ts b/src/features/owners.ts index 5c278ce64..c0e81ca30 100644 --- a/src/features/owners.ts +++ b/src/features/owners.ts @@ -139,18 +139,25 @@ interface OwnersDataOpts { boundary?: boolean; } -export function ownersData( - customDbPath?: string, - opts: OwnersDataOpts = {}, -): { +interface OwnedSymbol { + name: string; + kind: string; + file: string; + line: number; + owners: string[]; +} + +interface OwnerBoundary { + from: OwnedSymbol; + to: OwnedSymbol; + edgeKind: string; +} + +interface OwnersDataResult { codeownersFile: string | null; files: { file: string; owners: string[] }[]; - symbols: { name: string; kind: string; file: string; line: number; owners: string[] }[]; - boundaries: { - from: { name: string; kind: string; file: string; line: number; owners: string[] }; - to: { name: string; kind: string; file: string; line: number; owners: string[] }; - edgeKind: string; - }[]; + symbols: OwnedSymbol[]; + boundaries: OwnerBoundary[]; summary: { totalFiles: number; ownedFiles: number; @@ -159,160 +166,191 @@ export function ownersData( ownerCount: number; byOwner: { owner: string; fileCount: number }[]; }; 
+} + +interface BetterSqlite3DatabaseLike { + prepare(sql: string): { all(...params: unknown[]): unknown[] }; + close(): void; +} + +function emptyOwnersResult(codeownersFile: string | null): OwnersDataResult { + return { + codeownersFile, + files: [], + symbols: [], + boundaries: [], + summary: { + totalFiles: 0, + ownedFiles: 0, + unownedFiles: 0, + coveragePercent: 0, + ownerCount: 0, + byOwner: [], + }, + }; +} + +/** Load all distinct files from the DB and apply test/file filters. */ +function loadFilteredFiles(db: BetterSqlite3DatabaseLike, opts: OwnersDataOpts): string[] { + let allFiles = (db.prepare('SELECT DISTINCT file FROM nodes').all() as { file: string }[]).map( + (r) => r.file, + ); + if (opts.noTests) allFiles = allFiles.filter((f) => !isTestFile(f)); + const fileFilters = normalizeFileFilter(opts.file); + if (fileFilters.length > 0) { + allFiles = allFiles.filter((f) => fileFilters.some((filter) => f.includes(filter))); + } + return allFiles; +} + +/** Build owner index (owner -> list of files) and count owned files. */ +function buildOwnerIndex(fileOwners: { file: string; owners: string[] }[]): { + ownerIndex: Map; + ownedCount: number; } { + const ownerIndex = new Map(); + let ownedCount = 0; + for (const fo of fileOwners) { + if (fo.owners.length > 0) ownedCount++; + for (const o of fo.owners) { + if (!ownerIndex.has(o)) ownerIndex.set(o, []); + ownerIndex.get(o)!.push(fo.file); + } + } + return { ownerIndex, ownedCount }; +} + +/** Load symbols restricted to the given file set, applying noTests and kind filters. 
*/ +function loadSymbolsForFiles( + db: BetterSqlite3DatabaseLike, + fileSet: Set, + opts: OwnersDataOpts, + rules: CodeownersRule[], +): OwnedSymbol[] { + let symbols = ( + db.prepare('SELECT name, kind, file, line FROM nodes').all() as { + name: string; + kind: string; + file: string; + line: number; + }[] + ).filter((n) => fileSet.has(n.file)); + + if (opts.noTests) symbols = symbols.filter((s) => !isTestFile(s.file)); + if (opts.kind) symbols = symbols.filter((s) => s.kind === opts.kind); + + return symbols.map((s) => ({ ...s, owners: matchOwners(s.file, rules) })); +} + +interface CallEdgeRow { + id: number; + edgeKind: string; + srcName: string; + srcKind: string; + srcFile: string; + srcLine: number; + tgtName: string; + tgtKind: string; + tgtFile: string; + tgtLine: number; +} + +/** Compute cross-owner call boundaries. Returns empty array when boundary mode is off. */ +function computeOwnerBoundaries( + db: BetterSqlite3DatabaseLike, + rules: CodeownersRule[], + noTests: boolean, +): OwnerBoundary[] { + const edges = db + .prepare( + `SELECT e.id, e.kind AS edgeKind, + s.name AS srcName, s.kind AS srcKind, s.file AS srcFile, s.line AS srcLine, + t.name AS tgtName, t.kind AS tgtKind, t.file AS tgtFile, t.line AS tgtLine + FROM edges e + JOIN nodes s ON e.source_id = s.id + JOIN nodes t ON e.target_id = t.id + WHERE e.kind = 'calls'`, + ) + .all() as CallEdgeRow[]; + + const boundaries: OwnerBoundary[] = []; + for (const e of edges) { + if (noTests && (isTestFile(e.srcFile) || isTestFile(e.tgtFile))) continue; + const srcOwners = matchOwners(e.srcFile, rules); + const tgtOwners = matchOwners(e.tgtFile, rules); + // Cross-boundary: different owner sets (sort for deterministic comparison) + const srcKey = [...srcOwners].sort().join(','); + const tgtKey = [...tgtOwners].sort().join(','); + if (srcKey === tgtKey) continue; + boundaries.push({ + from: { + name: e.srcName, + kind: e.srcKind, + file: e.srcFile, + line: e.srcLine, + owners: srcOwners, + }, + to: { 
name: e.tgtName, kind: e.tgtKind, file: e.tgtFile, line: e.tgtLine, owners: tgtOwners }, + edgeKind: e.edgeKind, + }); + } + return boundaries; +} + +/** Build summary stats (totals, coverage, by-owner counts). */ +function buildOwnersSummary( + totalFiles: number, + ownedCount: number, + ownerIndex: Map, +): OwnersDataResult['summary'] { + const byOwner = [...ownerIndex.entries()] + .map(([owner, files]) => ({ owner, fileCount: files.length })) + .sort((a, b) => b.fileCount - a.fileCount); + + return { + totalFiles, + ownedFiles: ownedCount, + unownedFiles: totalFiles - ownedCount, + coveragePercent: totalFiles > 0 ? Math.round((ownedCount / totalFiles) * 100) : 0, + ownerCount: ownerIndex.size, + byOwner, + }; +} + +export function ownersData(customDbPath?: string, opts: OwnersDataOpts = {}): OwnersDataResult { const db = openReadonlyOrFail(customDbPath); try { const dbPath = findDbPath(customDbPath); const repoRoot = path.resolve(path.dirname(dbPath), '..'); const parsed = parseCodeowners(repoRoot); - if (!parsed) { - return { - codeownersFile: null, - files: [], - symbols: [], - boundaries: [], - summary: { - totalFiles: 0, - ownedFiles: 0, - unownedFiles: 0, - coveragePercent: 0, - ownerCount: 0, - byOwner: [], - }, - }; - } - - // Get all distinct files from nodes - let allFiles = (db.prepare('SELECT DISTINCT file FROM nodes').all() as { file: string }[]).map( - (r) => r.file, - ); + if (!parsed) return emptyOwnersResult(null); - if (opts.noTests) allFiles = allFiles.filter((f) => !isTestFile(f)); - const fileFilters = normalizeFileFilter(opts.file); - if (fileFilters.length > 0) { - allFiles = allFiles.filter((f) => fileFilters.some((filter) => f.includes(filter))); - } - - // Map files to owners - const fileOwners = allFiles.map((file) => ({ - file, - owners: matchOwners(file, parsed.rules), - })); - - // Build owner-to-files index - const ownerIndex = new Map(); - let ownedCount = 0; - for (const fo of fileOwners) { - if (fo.owners.length > 0) 
ownedCount++; - for (const o of fo.owners) { - if (!ownerIndex.has(o)) ownerIndex.set(o, []); - ownerIndex.get(o)!.push(fo.file); - } - } + // Stage 1: load files and bucket them by owner + const allFiles = loadFilteredFiles(db, opts); + const fileOwners = allFiles.map((file) => ({ file, owners: matchOwners(file, parsed.rules) })); + const { ownerIndex, ownedCount } = buildOwnerIndex(fileOwners); - // Filter files if --owner specified - let filteredFiles = fileOwners; - if (opts.owner) { - filteredFiles = fileOwners.filter((fo) => fo.owners.includes(opts.owner!)); - } + // Stage 2: apply optional --owner filter + const filteredFiles = opts.owner + ? fileOwners.filter((fo) => fo.owners.includes(opts.owner!)) + : fileOwners; - // Get symbols for filtered files + // Stage 3: load symbols for filtered files const fileSet = new Set(filteredFiles.map((fo) => fo.file)); - let symbols = ( - db.prepare('SELECT name, kind, file, line FROM nodes').all() as { - name: string; - kind: string; - file: string; - line: number; - }[] - ).filter((n) => fileSet.has(n.file)); - - if (opts.noTests) symbols = symbols.filter((s) => !isTestFile(s.file)); - if (opts.kind) symbols = symbols.filter((s) => s.kind === opts.kind); - - const symbolsWithOwners = symbols.map((s) => ({ - ...s, - owners: matchOwners(s.file, parsed.rules), - })); - - // Boundary analysis — cross-owner call edges - const boundaries: { - from: { name: string; kind: string; file: string; line: number; owners: string[] }; - to: { name: string; kind: string; file: string; line: number; owners: string[] }; - edgeKind: string; - }[] = []; - if (opts.boundary) { - const edges = db - .prepare( - `SELECT e.id, e.kind AS edgeKind, - s.name AS srcName, s.kind AS srcKind, s.file AS srcFile, s.line AS srcLine, - t.name AS tgtName, t.kind AS tgtKind, t.file AS tgtFile, t.line AS tgtLine - FROM edges e - JOIN nodes s ON e.source_id = s.id - JOIN nodes t ON e.target_id = t.id - WHERE e.kind = 'calls'`, - ) - .all() as { - id: number; 
- edgeKind: string; - srcName: string; - srcKind: string; - srcFile: string; - srcLine: number; - tgtName: string; - tgtKind: string; - tgtFile: string; - tgtLine: number; - }[]; - - for (const e of edges) { - if (opts.noTests && (isTestFile(e.srcFile) || isTestFile(e.tgtFile))) continue; - const srcOwners = matchOwners(e.srcFile, parsed.rules); - const tgtOwners = matchOwners(e.tgtFile, parsed.rules); - // Cross-boundary: different owner sets - const srcKey = srcOwners.sort().join(','); - const tgtKey = tgtOwners.sort().join(','); - if (srcKey !== tgtKey) { - boundaries.push({ - from: { - name: e.srcName, - kind: e.srcKind, - file: e.srcFile, - line: e.srcLine, - owners: srcOwners, - }, - to: { - name: e.tgtName, - kind: e.tgtKind, - file: e.tgtFile, - line: e.tgtLine, - owners: tgtOwners, - }, - edgeKind: e.edgeKind, - }); - } - } - } + const symbolsWithOwners = loadSymbolsForFiles(db, fileSet, opts, parsed.rules); - // Summary - const byOwner = [...ownerIndex.entries()] - .map(([owner, files]) => ({ owner, fileCount: files.length })) - .sort((a, b) => b.fileCount - a.fileCount); + // Stage 4: optional boundary analysis (cross-owner call edges) + const boundaries = opts.boundary + ? computeOwnerBoundaries(db, parsed.rules, opts.noTests ?? false) + : []; return { codeownersFile: parsed.path, files: filteredFiles, symbols: symbolsWithOwners, boundaries, - summary: { - totalFiles: allFiles.length, - ownedFiles: ownedCount, - unownedFiles: allFiles.length - ownedCount, - coveragePercent: allFiles.length > 0 ? 
Math.round((ownedCount / allFiles.length) * 100) : 0, - ownerCount: ownerIndex.size, - byOwner, - }, + summary: buildOwnersSummary(allFiles.length, ownedCount, ownerIndex), }; } finally { db.close(); diff --git a/src/features/structure-query.ts b/src/features/structure-query.ts index 21a9f6710..952bcf1b8 100644 --- a/src/features/structure-query.ts +++ b/src/features/structure-query.ts @@ -227,6 +227,96 @@ interface HotspotsDataOpts { noTests?: boolean; } +type HotspotEntry = { + name: string; + kind: string; + lineCount: number | null; + symbolCount: number | null; + importCount: number | null; + exportCount: number | null; + fanIn: number | null; + fanOut: number | null; + cohesion: number | null; + fileCount: number | null; + density: number; + coupling: number; +}; + +/** Compute density from either fileCount/symbolCount or lineCount/symbolCount. */ +function computeHotspotDensity( + symbolCount: number | null, + fileCount: number | null, + lineCount: number | null, +): number { + if ((fileCount ?? 0) > 0) return (symbolCount || 0) / (fileCount ?? 1); + if ((lineCount ?? 0) > 0) return (symbolCount || 0) / (lineCount ?? 1); + return 0; +} + +/** Map a native-engine hotspot row (camelCase keys) to the public HotspotEntry shape. 
*/ +function mapNativeHotspotRow(r: { + name: string; + kind: string; + lineCount: number | null; + symbolCount: number | null; + importCount: number | null; + exportCount: number | null; + fanIn: number | null; + fanOut: number | null; + cohesion: number | null; + fileCount: number | null; +}): HotspotEntry { + return { + name: r.name, + kind: r.kind, + lineCount: r.lineCount, + symbolCount: r.symbolCount, + importCount: r.importCount, + exportCount: r.exportCount, + fanIn: r.fanIn, + fanOut: r.fanOut, + cohesion: r.cohesion, + fileCount: r.fileCount, + density: computeHotspotDensity(r.symbolCount, r.fileCount, r.lineCount), + coupling: (r.fanIn || 0) + (r.fanOut || 0), + }; +} + +/** Map a JS-path hotspot row (snake_case keys from SQLite) to the public HotspotEntry shape. */ +function mapJsHotspotRow(r: HotspotRow): HotspotEntry { + return { + name: r.name, + kind: r.kind, + lineCount: r.line_count, + symbolCount: r.symbol_count, + importCount: r.import_count, + exportCount: r.export_count, + fanIn: r.fan_in, + fanOut: r.fan_out, + cohesion: r.cohesion, + fileCount: r.file_count, + density: computeHotspotDensity(r.symbol_count, r.file_count, r.line_count), + coupling: (r.fan_in || 0) + (r.fan_out || 0), + }; +} + +/** ORDER BY clause for each ranking dimension (strategy pattern). */ +const HOTSPOT_ORDER_BY: Record = { + 'fan-in': 'nm.fan_in DESC NULLS LAST', + 'fan-out': 'nm.fan_out DESC NULLS LAST', + density: 'nm.symbol_count DESC NULLS LAST', + coupling: '(COALESCE(nm.fan_in, 0) + COALESCE(nm.fan_out, 0)) DESC NULLS LAST', +}; + +/** Build the JS-path SQL query for a given metric and test filter. */ +function buildHotspotQuery(metric: string, testFilter: string): string { + const orderBy = HOTSPOT_ORDER_BY[metric] ?? 
HOTSPOT_ORDER_BY['fan-in']; + return `SELECT n.name, n.kind, nm.line_count, nm.symbol_count, nm.import_count, nm.export_count, + nm.fan_in, nm.fan_out, nm.cohesion, nm.file_count + FROM nodes n JOIN node_metrics nm ON n.id = nm.node_id + WHERE n.kind = ? ${testFilter} ORDER BY ${orderBy} LIMIT ?`; +} + export function hotspotsData( customDbPath?: string, opts: HotspotsDataOpts = {}, @@ -242,96 +332,21 @@ export function hotspotsData( const level = opts.level || 'file'; const limit = opts.limit || 10; const noTests = opts.noTests || false; - const kind = level === 'directory' ? 'directory' : 'file'; - const mapRow = (r: { - name: string; - kind: string; - lineCount: number | null; - symbolCount: number | null; - importCount: number | null; - exportCount: number | null; - fanIn: number | null; - fanOut: number | null; - cohesion: number | null; - fileCount: number | null; - }) => ({ - name: r.name, - kind: r.kind, - lineCount: r.lineCount, - symbolCount: r.symbolCount, - importCount: r.importCount, - exportCount: r.exportCount, - fanIn: r.fanIn, - fanOut: r.fanOut, - cohesion: r.cohesion, - fileCount: r.fileCount, - density: - (r.fileCount ?? 0) > 0 - ? (r.symbolCount || 0) / (r.fileCount ?? 1) - : (r.lineCount ?? 0) > 0 - ? (r.symbolCount || 0) / (r.lineCount ?? 
1) - : 0, - coupling: (r.fanIn || 0) + (r.fanOut || 0), - }); - // ── Native fast path: single query instead of 4 eagerly prepared ── if (nativeDb?.getHotspots) { const rows = nativeDb.getHotspots(kind, metric, noTests, limit); - const hotspots = rows.map(mapRow); + const hotspots = rows.map(mapNativeHotspotRow); const base = { metric, level, limit, hotspots }; return paginateResult(base, 'hotspots', { limit: opts.limit, offset: opts.offset }); } // ── JS fallback ─────────────────────────────────────────────────── const testFilter = testFilterSQL('n.name', noTests && kind === 'file'); - - const HOTSPOT_QUERIES: Record = { - 'fan-in': db.prepare(` - SELECT n.name, n.kind, nm.line_count, nm.symbol_count, nm.import_count, nm.export_count, - nm.fan_in, nm.fan_out, nm.cohesion, nm.file_count - FROM nodes n JOIN node_metrics nm ON n.id = nm.node_id - WHERE n.kind = ? ${testFilter} ORDER BY nm.fan_in DESC NULLS LAST LIMIT ?`), - 'fan-out': db.prepare(` - SELECT n.name, n.kind, nm.line_count, nm.symbol_count, nm.import_count, nm.export_count, - nm.fan_in, nm.fan_out, nm.cohesion, nm.file_count - FROM nodes n JOIN node_metrics nm ON n.id = nm.node_id - WHERE n.kind = ? ${testFilter} ORDER BY nm.fan_out DESC NULLS LAST LIMIT ?`), - density: db.prepare(` - SELECT n.name, n.kind, nm.line_count, nm.symbol_count, nm.import_count, nm.export_count, - nm.fan_in, nm.fan_out, nm.cohesion, nm.file_count - FROM nodes n JOIN node_metrics nm ON n.id = nm.node_id - WHERE n.kind = ? ${testFilter} ORDER BY nm.symbol_count DESC NULLS LAST LIMIT ?`), - coupling: db.prepare(` - SELECT n.name, n.kind, nm.line_count, nm.symbol_count, nm.import_count, nm.export_count, - nm.fan_in, nm.fan_out, nm.cohesion, nm.file_count - FROM nodes n JOIN node_metrics nm ON n.id = nm.node_id - WHERE n.kind = ? ${testFilter} ORDER BY (COALESCE(nm.fan_in, 0) + COALESCE(nm.fan_out, 0)) DESC NULLS LAST LIMIT ?`), - }; - - const stmt = HOTSPOT_QUERIES[metric] ?? 
HOTSPOT_QUERIES['fan-in']; - const rows = stmt!.all(kind, limit); - - const hotspots = rows.map((r) => ({ - name: r.name, - kind: r.kind, - lineCount: r.line_count, - symbolCount: r.symbol_count, - importCount: r.import_count, - exportCount: r.export_count, - fanIn: r.fan_in, - fanOut: r.fan_out, - cohesion: r.cohesion, - fileCount: r.file_count, - density: - (r.file_count ?? 0) > 0 - ? (r.symbol_count || 0) / (r.file_count ?? 1) - : (r.line_count ?? 0) > 0 - ? (r.symbol_count || 0) / (r.line_count ?? 1) - : 0, - coupling: (r.fan_in || 0) + (r.fan_out || 0), - })); + const stmt = db.prepare(buildHotspotQuery(metric, testFilter)); + const rows = stmt.all(kind, limit) as HotspotRow[]; + const hotspots = rows.map(mapJsHotspotRow); const base = { metric, level, limit, hotspots }; return paginateResult(base, 'hotspots', { limit: opts.limit, offset: opts.offset }); diff --git a/src/features/structure.ts b/src/features/structure.ts index 8fe6b5a9b..3e531cbad 100644 --- a/src/features/structure.ts +++ b/src/features/structure.ts @@ -532,6 +532,56 @@ function batchUpdateRoles( })(); } +interface CallableNodeRow { + id: number; + name: string; + kind: string; + file: string; + fan_in: number; + fan_out: number; +} + +/** Build the activeFiles set: files with at least one callable connected to the graph. */ +function buildActiveFilesSet(rows: CallableNodeRow[]): Set { + const activeFiles = new Set(); + for (const r of rows) { + if ((r.fan_in > 0 || r.fan_out > 0) && r.kind !== 'constant') { + activeFiles.add(r.file); + } + } + return activeFiles; +} + +/** Map callable rows to classifier input objects, attaching exported/prod-fan-in/active-file metadata. 
*/ +function buildClassifierInput( + rows: CallableNodeRow[], + exportedIds: Set, + prodFanInMap: Map, + activeFiles: Set, +): Array<{ + id: string; + name: string; + kind: string; + file: string; + fanIn: number; + fanOut: number; + isExported: boolean; + productionFanIn: number; + hasActiveFileSiblings: boolean | undefined; +}> { + return rows.map((r) => ({ + id: String(r.id), + name: r.name, + kind: r.kind, + file: r.file, + fanIn: r.fan_in, + fanOut: r.fan_out, + isExported: exportedIds.has(r.id), + productionFanIn: prodFanInMap.get(r.id) || 0, + hasActiveFileSiblings: r.kind === 'constant' ? activeFiles.has(r.file) : undefined, + })); +} + function classifyNodeRolesFull(db: BetterSqlite3Database, emptySummary: RoleSummary): RoleSummary { // Leaf kinds (parameter, property) can never have callers/callees. // Classify them directly as dead-leaf without the expensive fan-in/fan-out JOINs. @@ -558,14 +608,7 @@ function classifyNodeRolesFull(db: BetterSqlite3Database, emptySummary: RoleSumm ) fo ON n.id = fo.source_id WHERE n.kind NOT IN ('file', 'directory', 'parameter', 'property')`, ) - .all() as { - id: number; - name: string; - kind: string; - file: string; - fan_in: number; - fan_out: number; - }[]; + .all() as CallableNodeRow[]; if (rows.length === 0 && leafRows.length === 0) return emptySummary; @@ -629,28 +672,9 @@ function classifyNodeRolesFull(db: BetterSqlite3Database, emptySummary: RoleSumm prodFanInMap.set(r.target_id, r.cnt); } - // Files with at least one callable (non-constant) connected to the graph. - // Constants in these files are likely consumed locally via identifier reference. 
- const activeFiles = new Set(); - for (const r of rows) { - if ((r.fan_in > 0 || r.fan_out > 0) && r.kind !== 'constant') { - activeFiles.add(r.file); - } - } - // Delegate classification to the pure-logic classifier - const classifierInput = rows.map((r) => ({ - id: String(r.id), - name: r.name, - kind: r.kind, - file: r.file, - fanIn: r.fan_in, - fanOut: r.fan_out, - isExported: exportedIds.has(r.id), - productionFanIn: prodFanInMap.get(r.id) || 0, - hasActiveFileSiblings: r.kind === 'constant' ? activeFiles.has(r.file) : undefined, - })); - + const activeFiles = buildActiveFilesSet(rows); + const classifierInput = buildClassifierInput(rows, exportedIds, prodFanInMap, activeFiles); const roleMap = classifyRoles(classifierInput); const { summary, idsByRole } = buildRoleSummary(rows, leafRows, roleMap, emptySummary); @@ -733,14 +757,7 @@ function classifyNodeRolesIncremental( WHERE n.kind NOT IN ('file', 'directory', 'parameter', 'property') AND n.file IN (${placeholders})`, ) - .all(...allAffectedFiles) as { - id: number; - name: string; - kind: string; - file: string; - fan_in: number; - fan_out: number; - }[]; + .all(...allAffectedFiles) as CallableNodeRow[]; if (rows.length === 0 && leafRows.length === 0) return emptySummary; @@ -810,25 +827,8 @@ function classifyNodeRolesIncremental( } // 5. Classify affected nodes using global medians - const activeFiles = new Set(); - for (const r of rows) { - if ((r.fan_in > 0 || r.fan_out > 0) && r.kind !== 'constant') { - activeFiles.add(r.file); - } - } - - const classifierInput = rows.map((r) => ({ - id: String(r.id), - name: r.name, - kind: r.kind, - file: r.file, - fanIn: r.fan_in, - fanOut: r.fan_out, - isExported: exportedIds.has(r.id), - productionFanIn: prodFanInMap.get(r.id) || 0, - hasActiveFileSiblings: r.kind === 'constant' ? 
activeFiles.has(r.file) : undefined, - })); - + const activeFiles = buildActiveFilesSet(rows); + const classifierInput = buildClassifierInput(rows, exportedIds, prodFanInMap, activeFiles); const roleMap = classifyRoles(classifierInput, globalMedians); // 6. Build summary (only for affected nodes) and update only those nodes From 76bd476826524bac52b17c730d62fdb9a1d1047c Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:50:59 -0600 Subject: [PATCH 19/27] refactor(features): reduce complexity in cfg and cochange --- src/features/cfg.ts | 203 ++++++++++++++++++++++++++------------- src/features/cochange.ts | 167 ++++++++++++++++++-------------- 2 files changed, 230 insertions(+), 140 deletions(-) diff --git a/src/features/cfg.ts b/src/features/cfg.ts index 7736c741b..7d07f27a8 100644 --- a/src/features/cfg.ts +++ b/src/features/cfg.ts @@ -365,79 +365,91 @@ function persistVisitorFileCfg( return count; } -export async function buildCFGData( +/** + * Build a single native bulk-insert entry for one definition. + * Returns null when the def has no CFG blocks or no associated node row. + */ +function buildNativeCfgEntry( db: BetterSqlite3Database, - fileSymbols: Map, - rootDir: string, - engineOpts?: { - nativeDb?: { bulkInsertCfg?(entries: Array>): number }; - suspendJsDb?: () => void; - resumeJsDb?: () => void; - }, -): Promise { - // Fast path: when all function/method defs already have native CFG data, - // skip WASM parser init, tree parsing, and JS visitor entirely — just persist. 
- const allNative = allCfgNative(fileSymbols); + def: Definition, + relPath: string, +): Record | null { + if (def.kind !== 'function' && def.kind !== 'method') return null; + if (!def.line) return null; + + const nodeId = getFunctionNodeId(db, def.name, relPath, def.line); + if (!nodeId) return null; + + const cfg = def.cfg as { blocks?: CfgBuildBlock[]; edges?: CfgBuildEdge[] } | undefined; + if (!cfg?.blocks?.length) return null; + + return { + nodeId, + blocks: cfg.blocks.map((b) => ({ + index: b.index, + blockType: b.type, + startLine: b.startLine ?? undefined, + endLine: b.endLine ?? undefined, + label: b.label ?? undefined, + })), + edges: (cfg.edges || []).map((e) => ({ + sourceIndex: e.sourceIndex, + targetIndex: e.targetIndex, + kind: e.kind, + })), + }; +} - // ── Native bulk-insert fast path ────────────────────────────────────── - // The Rust bulkInsertCfg handles delete-before-insert atomically on a - // single rusqlite connection, so there is no dual-connection WAL conflict. +/** + * Native bulk-insert fast path. The Rust bulkInsertCfg handles + * delete-before-insert atomically on a single rusqlite connection, so there + * is no dual-connection WAL conflict. Returns true if this path handled the + * request (caller should return early); false to fall through to WASM/JS. 
+ */ +function tryNativeBulkInsertCfg( + db: BetterSqlite3Database, + fileSymbols: Map, + engineOpts: + | { + nativeDb?: { bulkInsertCfg?(entries: Array>): number }; + suspendJsDb?: () => void; + resumeJsDb?: () => void; + } + | undefined, +): boolean { const nativeDb = engineOpts?.nativeDb; - if (allNative && nativeDb?.bulkInsertCfg) { - const entries: Array> = []; - for (const [relPath, symbols] of fileSymbols) { - const ext = path.extname(relPath).toLowerCase(); - if (!CFG_EXTENSIONS.has(ext)) continue; + if (!nativeDb?.bulkInsertCfg) return false; - for (const def of symbols.definitions) { - if (def.kind !== 'function' && def.kind !== 'method') continue; - if (!def.line) continue; - - const nodeId = getFunctionNodeId(db, def.name, relPath, def.line); - if (!nodeId) continue; - - const cfg = def.cfg as { blocks?: CfgBuildBlock[]; edges?: CfgBuildEdge[] } | undefined; - if (!cfg?.blocks?.length) continue; - - entries.push({ - nodeId, - blocks: cfg.blocks.map((b) => ({ - index: b.index, - blockType: b.type, - startLine: b.startLine ?? undefined, - endLine: b.endLine ?? undefined, - label: b.label ?? 
undefined, - })), - edges: (cfg.edges || []).map((e) => ({ - sourceIndex: e.sourceIndex, - targetIndex: e.targetIndex, - kind: e.kind, - })), - }); - } - } + const entries: Array> = []; + for (const [relPath, symbols] of fileSymbols) { + const ext = path.extname(relPath).toLowerCase(); + if (!CFG_EXTENSIONS.has(ext)) continue; - if (entries.length > 0) { - let inserted = 0; - try { - engineOpts?.suspendJsDb?.(); - inserted = nativeDb.bulkInsertCfg(entries); - } finally { - engineOpts?.resumeJsDb?.(); - } - info(`CFG (native bulk): ${inserted} functions analyzed`); + for (const def of symbols.definitions) { + const entry = buildNativeCfgEntry(db, def, relPath); + if (entry) entries.push(entry); } - return; } - const extToLang = buildExtToLangMap(); - let parsers: unknown = null; - let getParserFn: unknown = null; - - if (!allNative) { - ({ parsers, getParserFn } = await initCfgParsers(fileSymbols)); + if (entries.length > 0) { + let inserted = 0; + try { + engineOpts?.suspendJsDb?.(); + inserted = nativeDb.bulkInsertCfg(entries); + } finally { + engineOpts?.resumeJsDb?.(); + } + info(`CFG (native bulk): ${inserted} functions analyzed`); } + return true; +} +interface CfgInsertStatements { + insertBlock: ReturnType; + insertEdge: ReturnType; +} + +function prepareCfgInsertStatements(db: BetterSqlite3Database): CfgInsertStatements { const insertBlock = db.prepare( `INSERT INTO cfg_blocks (function_node_id, block_index, block_type, start_line, end_line, label) VALUES (?, ?, ?, ?, ?, ?)`, @@ -446,15 +458,31 @@ export async function buildCFGData( `INSERT INTO cfg_edges (function_node_id, source_block_id, target_block_id, kind) VALUES (?, ?, ?, ?)`, ); - let analyzed = 0; + return { insertBlock, insertEdge }; +} +/** + * Persist CFG for every CFG-eligible file inside a single transaction. + * Dispatches to native fast path or visitor path per file. 
+ */ +function persistAllFileCfgs( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + allNative: boolean, + extToLang: Map, + parsers: unknown, + getParserFn: unknown, + stmts: CfgInsertStatements, +): number { + let analyzed = 0; const tx = db.transaction(() => { for (const [relPath, symbols] of fileSymbols) { const ext = path.extname(relPath).toLowerCase(); if (!CFG_EXTENSIONS.has(ext)) continue; if (allNative && !symbols._tree) { - analyzed += persistNativeFileCfg(db, symbols, relPath, insertBlock, insertEdge); + analyzed += persistNativeFileCfg(db, symbols, relPath, stmts.insertBlock, stmts.insertEdge); continue; } @@ -466,13 +494,52 @@ export async function buildCFGData( extToLang, parsers, getParserFn, - insertBlock, - insertEdge, + stmts.insertBlock, + stmts.insertEdge, ); } }); - tx(); + return analyzed; +} + +export async function buildCFGData( + db: BetterSqlite3Database, + fileSymbols: Map, + rootDir: string, + engineOpts?: { + nativeDb?: { bulkInsertCfg?(entries: Array>): number }; + suspendJsDb?: () => void; + resumeJsDb?: () => void; + }, +): Promise { + // Fast path: when all function/method defs already have native CFG data, + // skip WASM parser init, tree parsing, and JS visitor entirely — just persist. 
+ const allNative = allCfgNative(fileSymbols); + + if (allNative && tryNativeBulkInsertCfg(db, fileSymbols, engineOpts)) { + return; + } + + const extToLang = buildExtToLangMap(); + let parsers: unknown = null; + let getParserFn: unknown = null; + + if (!allNative) { + ({ parsers, getParserFn } = await initCfgParsers(fileSymbols)); + } + + const stmts = prepareCfgInsertStatements(db); + const analyzed = persistAllFileCfgs( + db, + fileSymbols, + rootDir, + allNative, + extToLang, + parsers, + getParserFn, + stmts, + ); if (analyzed > 0) { info(`CFG: ${analyzed} functions analyzed`); diff --git a/src/features/cochange.ts b/src/features/cochange.ts index ffda28d29..2c4b9c379 100644 --- a/src/features/cochange.ts +++ b/src/features/cochange.ts @@ -137,77 +137,50 @@ export function computeCoChanges( return { pairs: results, fileCommitCounts }; } -export function analyzeCoChanges( - customDbPath?: string, - opts: { - since?: string; - minSupport?: number; - maxFilesPerCommit?: number; - full?: boolean; - } = {}, -): - | { pairsFound: number; commitsScanned: number; since: string; minSupport: number } - | { error: string } { - const dbPath = findDbPath(customDbPath); - const db = openDb(dbPath); - initSchema(db); - - const repoRoot = path.resolve(path.dirname(dbPath), '..'); - - if (!fs.existsSync(path.join(repoRoot, '.git'))) { - closeDb(db); - return { error: `Not a git repository: ${repoRoot}` }; - } - - const since = opts.since || '1 year ago'; - const minSupport = opts.minSupport ?? 3; - const maxFilesPerCommit = opts.maxFilesPerCommit ?? 50; - - // Check for incremental state - let afterSha: string | null = null; - if (!opts.full) { - try { - const row = db - .prepare<{ value: string }>( - "SELECT value FROM co_change_meta WHERE key = 'last_analyzed_commit'", - ) - .get(); - if (row) afterSha = row.value; - } catch { - /* table may not exist yet */ - } +/** Read the SHA of the most recently analyzed commit (incremental state). 
*/ +function loadLastAnalyzedSha(db: BetterSqlite3Database): string | null { + try { + const row = db + .prepare<{ value: string }>( + "SELECT value FROM co_change_meta WHERE key = 'last_analyzed_commit'", + ) + .get(); + return row ? row.value : null; + } catch { + /* table may not exist yet */ + return null; } +} - // If full re-scan, clear existing data - if (opts.full) { - db.exec('DELETE FROM co_changes'); - db.exec('DELETE FROM co_change_meta'); - db.exec('DELETE FROM file_commit_counts'); - } +/** Wipe all co-change tables for a full re-scan. */ +function clearCoChangeTables(db: BetterSqlite3Database): void { + db.exec('DELETE FROM co_changes'); + db.exec('DELETE FROM co_change_meta'); + db.exec('DELETE FROM file_commit_counts'); +} - // Collect known files from the graph for filtering - let knownFiles: Set | null = null; +/** Collect the set of files currently tracked by the graph for filtering. */ +function loadKnownFiles(db: BetterSqlite3Database): Set | null { try { const rows = db.prepare<{ file: string }>('SELECT DISTINCT file FROM nodes').all(); - knownFiles = new Set(rows.map((r) => r.file)); + return new Set(rows.map((r) => r.file)); } catch { /* nodes table may not exist */ + return null; } +} - const { commits } = scanGitHistory(repoRoot, { since, afterSha }); - const { pairs: coChanges, fileCommitCounts } = computeCoChanges(commits, { - minSupport, - maxFilesPerCommit, - knownFiles, - }); - - // Upsert per-file commit counts so Jaccard can be recomputed from totals +/** Upsert per-file commit counts and pair counts (Jaccard recomputed later). */ +function persistCoChangeResults( + db: BetterSqlite3Database, + fileCommitCounts: Map, + coChanges: Map, +): void { const fileCountUpsert = db.prepare(` INSERT INTO file_commit_counts (file, commit_count) VALUES (?, ?) 
ON CONFLICT(file) DO UPDATE SET commit_count = commit_count + excluded.commit_count `); - // Upsert pair counts (accumulate commit_count, jaccard placeholder — recomputed below) const pairUpsert = db.prepare(` INSERT INTO co_changes (file_a, file_b, commit_count, jaccard, last_commit_epoch) VALUES (?, ?, ?, 0, ?) @@ -226,24 +199,31 @@ export function analyzeCoChanges( } }); insertMany(); +} - // Recompute Jaccard for all affected pairs from total file commit counts - const affectedFiles = [...fileCommitCounts.keys()]; - if (affectedFiles.length > 0) { - const ph = affectedFiles.map(() => '?').join(','); - db.prepare(` - UPDATE co_changes SET jaccard = ( - SELECT CAST(co_changes.commit_count AS REAL) / ( - COALESCE(fa.commit_count, 0) + COALESCE(fb.commit_count, 0) - co_changes.commit_count - ) - FROM file_commit_counts fa, file_commit_counts fb - WHERE fa.file = co_changes.file_a AND fb.file = co_changes.file_b +/** Recompute Jaccard for every pair touching any file in `affectedFiles`. */ +function recomputeJaccardForAffected(db: BetterSqlite3Database, affectedFiles: string[]): void { + if (affectedFiles.length === 0) return; + const ph = affectedFiles.map(() => '?').join(','); + db.prepare(` + UPDATE co_changes SET jaccard = ( + SELECT CAST(co_changes.commit_count AS REAL) / ( + COALESCE(fa.commit_count, 0) + COALESCE(fb.commit_count, 0) - co_changes.commit_count ) - WHERE file_a IN (${ph}) OR file_b IN (${ph}) - `).run(...affectedFiles, ...affectedFiles); - } + FROM file_commit_counts fa, file_commit_counts fb + WHERE fa.file = co_changes.file_a AND fb.file = co_changes.file_b + ) + WHERE file_a IN (${ph}) OR file_b IN (${ph}) + `).run(...affectedFiles, ...affectedFiles); +} - // Update metadata +/** Update co_change_meta with the latest analyzer run parameters. 
*/ +function updateCoChangeMeta( + db: BetterSqlite3Database, + commits: CommitEntry[], + since: string, + minSupport: number, +): void { const metaUpsert = db.prepare(` INSERT INTO co_change_meta (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value @@ -254,6 +234,49 @@ export function analyzeCoChanges( metaUpsert.run('analyzed_at', new Date().toISOString()); metaUpsert.run('since', since); metaUpsert.run('min_support', String(minSupport)); +} + +export function analyzeCoChanges( + customDbPath?: string, + opts: { + since?: string; + minSupport?: number; + maxFilesPerCommit?: number; + full?: boolean; + } = {}, +): + | { pairsFound: number; commitsScanned: number; since: string; minSupport: number } + | { error: string } { + const dbPath = findDbPath(customDbPath); + const db = openDb(dbPath); + initSchema(db); + + const repoRoot = path.resolve(path.dirname(dbPath), '..'); + + if (!fs.existsSync(path.join(repoRoot, '.git'))) { + closeDb(db); + return { error: `Not a git repository: ${repoRoot}` }; + } + + const since = opts.since || '1 year ago'; + const minSupport = opts.minSupport ?? 3; + const maxFilesPerCommit = opts.maxFilesPerCommit ?? 50; + + const afterSha = opts.full ? 
null : loadLastAnalyzedSha(db); + if (opts.full) clearCoChangeTables(db); + + const knownFiles = loadKnownFiles(db); + + const { commits } = scanGitHistory(repoRoot, { since, afterSha }); + const { pairs: coChanges, fileCommitCounts } = computeCoChanges(commits, { + minSupport, + maxFilesPerCommit, + knownFiles, + }); + + persistCoChangeResults(db, fileCommitCounts, coChanges); + recomputeJaccardForAffected(db, [...fileCommitCounts.keys()]); + updateCoChangeMeta(db, commits, since, minSupport); const totalPairs = db .prepare<{ cnt: number }>('SELECT COUNT(*) as cnt FROM co_changes') From 4240dfea791fce0b5357b4e22a178e239a52c380 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 13:56:46 -0600 Subject: [PATCH 20/27] refactor(graph): decompose Leiden optimiser and roles classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internal refactor — public APIs unchanged. docs check acknowledged. --- src/graph/algorithms/leiden/optimiser.ts | 229 ++++++++++++++--------- src/graph/classifiers/roles.ts | 118 ++++++------ 2 files changed, 206 insertions(+), 141 deletions(-) diff --git a/src/graph/algorithms/leiden/optimiser.ts b/src/graph/algorithms/leiden/optimiser.ts index 35c5b7b5e..5d6753c70 100644 --- a/src/graph/algorithms/leiden/optimiser.ts +++ b/src/graph/algorithms/leiden/optimiser.ts @@ -88,12 +88,10 @@ export function runLouvainUndirectedModularity( optionsInput: LeidenOptions = {}, ): LouvainResult { const options: NormalizedOptions = normalizeOptions(optionsInput); - let currentGraph: CodeGraph = graph; - const levels: LevelEntry[] = []; const rngSource = createRng(options.randomSeed); const random: () => number = () => rngSource.nextDouble(); - const baseGraphAdapter: GraphAdapter = makeGraphAdapter(currentGraph, { + const baseGraphAdapter: GraphAdapter = makeGraphAdapter(graph, { directed: options.directed, ...optionsInput, }); @@ -101,98 +99,27 @@ export function runLouvainUndirectedModularity( const 
originalToCurrent = new Int32Array(origN); for (let i = 0; i < origN; i++) originalToCurrent[i] = i; - let fixedNodeMask: Uint8Array | null = null; - if (options.fixedNodes) { - const fixed = new Uint8Array(origN); - const asSet: Set = - options.fixedNodes instanceof Set ? options.fixedNodes : new Set(options.fixedNodes); - for (const id of asSet) { - const idx = baseGraphAdapter.idToIndex.get(String(id)); - if (idx != null) fixed[idx] = 1; - } - fixedNodeMask = fixed; - } + const fixedNodeMask: Uint8Array | null = buildFixedNodeMask(baseGraphAdapter, options.fixedNodes); + const levels: LevelEntry[] = []; + let currentGraph: CodeGraph = graph; for (let level = 0; level < options.maxLevels; level++) { const graphAdapter: GraphAdapter = level === 0 ? baseGraphAdapter : makeGraphAdapter(currentGraph, { directed: options.directed, ...optionsInput }); - const partition: Partition = makePartition(graphAdapter); - partition.graph = graphAdapter; - partition.initializeAggregates(); - - const order = new Int32Array(graphAdapter.n); - for (let i = 0; i < graphAdapter.n; i++) order[i] = i; - - let improved: boolean = true; - let localPasses: number = 0; - const strategyCode: CandidateStrategyCode = options.candidateStrategyCode; - while (improved) { - improved = false; - localPasses++; - shuffleArrayInPlace(order, random); - for (let idx = 0; idx < order.length; idx++) { - const nodeIndex: number = order[idx]!; - if (level === 0 && fixedNodeMask && fixedNodeMask[nodeIndex]) continue; - const candidateCount: number = partition.accumulateNeighborCommunityEdgeWeights(nodeIndex); - const { bestCommunityId, bestGain } = findBestCommunityMove( - partition, - graphAdapter, - nodeIndex, - candidateCount, - strategyCode, - options, - random, - ); - if (bestCommunityId !== partition.nodeCommunity[nodeIndex]! 
&& bestGain > GAIN_EPSILON) { - partition.moveNodeToCommunity(nodeIndex, bestCommunityId); - improved = true; - } - } - if (localPasses >= options.maxLocalPasses) break; - } - - renumberCommunities(partition, options.preserveLabels); - - let effectivePartition: Partition = partition; - if (options.refine) { - const refined: Partition = refineWithinCoarseCommunities( - graphAdapter, - partition, - random, - options, - level === 0 ? fixedNodeMask : null, - ); - // Post-refinement: split any disconnected communities into their - // connected components. This is the cheap O(V+E) alternative to - // checking gamma-connectedness on every candidate during refinement. - // A disconnected community violates even basic connectivity, so - // splitting is always correct. - splitDisconnectedCommunities(graphAdapter, refined); - renumberCommunities(refined, options.preserveLabels); - effectivePartition = refined; - } + const levelOutcome = runLevel( + graphAdapter, + options, + random, + level === 0 ? fixedNodeMask : null, + ); - levels.push({ graph: graphAdapter, partition: effectivePartition }); - const fineToCoarse: Int32Array = effectivePartition.nodeCommunity; - for (let i = 0; i < originalToCurrent.length; i++) { - originalToCurrent[i] = fineToCoarse[originalToCurrent[i]!]!; - } + levels.push({ graph: graphAdapter, partition: levelOutcome.effectivePartition }); + applyFineToCoarseMapping(originalToCurrent, levelOutcome.effectivePartition.nodeCommunity); - // Terminate when no further coarsening is possible. Check both the - // move-phase partition (did the greedy phase find merges?) and the - // effective partition that feeds buildCoarseGraph (would coarsening - // actually reduce the graph?). When refine is enabled the refined - // partition starts from singletons and may have more communities than - // the move phase found, so checking only effectivePartition would - // cause premature termination. 
- if ( - partition.communityCount === graphAdapter.n && - effectivePartition.communityCount === graphAdapter.n - ) - break; - currentGraph = buildCoarseGraph(graphAdapter, effectivePartition); + if (levelOutcome.terminate) break; + currentGraph = buildCoarseGraph(graphAdapter, levelOutcome.effectivePartition); } const last: LevelEntry = levels[levels.length - 1]!; @@ -206,6 +133,134 @@ export function runLouvainUndirectedModularity( }; } +/** + * Build a fixed-node mask aligned with the base graph adapter's node indices. + * Returns null when no fixed nodes are configured. + */ +function buildFixedNodeMask( + baseGraphAdapter: GraphAdapter, + fixedNodes: Set | string[] | undefined, +): Uint8Array | null { + if (!fixedNodes) return null; + const mask = new Uint8Array(baseGraphAdapter.n); + const asSet: Set = fixedNodes instanceof Set ? fixedNodes : new Set(fixedNodes); + for (const id of asSet) { + const idx = baseGraphAdapter.idToIndex.get(String(id)); + if (idx != null) mask[idx] = 1; + } + return mask; +} + +interface LevelOutcome { + effectivePartition: Partition; + terminate: boolean; +} + +/** + * Run one level of the Louvain/Leiden pipeline: greedy local-move phase, + * optional Leiden refinement, and a termination check. Returns the + * partition that feeds the next coarse graph plus a `terminate` flag set + * when no further coarsening is possible. 
+ */ +function runLevel( + graphAdapter: GraphAdapter, + options: NormalizedOptions, + random: () => number, + fixedNodeMask: Uint8Array | null, +): LevelOutcome { + const partition: Partition = makePartition(graphAdapter); + partition.graph = graphAdapter; + partition.initializeAggregates(); + + runLocalMovePhase(graphAdapter, partition, options, random, fixedNodeMask); + renumberCommunities(partition, options.preserveLabels); + + let effectivePartition: Partition = partition; + if (options.refine) { + const refined: Partition = refineWithinCoarseCommunities( + graphAdapter, + partition, + random, + options, + fixedNodeMask, + ); + // Post-refinement: split any disconnected communities into their + // connected components. This is the cheap O(V+E) alternative to + // checking gamma-connectedness on every candidate during refinement. + // A disconnected community violates even basic connectivity, so + // splitting is always correct. + splitDisconnectedCommunities(graphAdapter, refined); + renumberCommunities(refined, options.preserveLabels); + effectivePartition = refined; + } + + // Terminate when no further coarsening is possible. Check both the + // move-phase partition (did the greedy phase find merges?) and the + // effective partition that feeds buildCoarseGraph (would coarsening + // actually reduce the graph?). When refine is enabled the refined + // partition starts from singletons and may have more communities than + // the move phase found, so checking only effectivePartition would + // cause premature termination. + const terminate = + partition.communityCount === graphAdapter.n && + effectivePartition.communityCount === graphAdapter.n; + return { effectivePartition, terminate }; +} + +/** + * Greedy local-move phase: iterate randomly over nodes, moving each to the + * best community among the candidate set. Loops until no improvement or + * `maxLocalPasses` is reached. 
+ */ +function runLocalMovePhase( + graphAdapter: GraphAdapter, + partition: Partition, + options: NormalizedOptions, + random: () => number, + fixedNodeMask: Uint8Array | null, +): void { + const order = new Int32Array(graphAdapter.n); + for (let i = 0; i < graphAdapter.n; i++) order[i] = i; + + const strategyCode: CandidateStrategyCode = options.candidateStrategyCode; + let improved: boolean = true; + let localPasses: number = 0; + while (improved) { + improved = false; + localPasses++; + shuffleArrayInPlace(order, random); + for (let idx = 0; idx < order.length; idx++) { + const nodeIndex: number = order[idx]!; + if (fixedNodeMask?.[nodeIndex]) continue; + const candidateCount: number = partition.accumulateNeighborCommunityEdgeWeights(nodeIndex); + const { bestCommunityId, bestGain } = findBestCommunityMove( + partition, + graphAdapter, + nodeIndex, + candidateCount, + strategyCode, + options, + random, + ); + if (bestCommunityId !== partition.nodeCommunity[nodeIndex]! && bestGain > GAIN_EPSILON) { + partition.moveNodeToCommunity(nodeIndex, bestCommunityId); + improved = true; + } + } + if (localPasses >= options.maxLocalPasses) break; + } +} + +/** + * Compose the running `originalToCurrent` mapping with this level's + * fine→coarse community labels, in place. + */ +function applyFineToCoarseMapping(originalToCurrent: Int32Array, fineToCoarse: Int32Array): void { + for (let i = 0; i < originalToCurrent.length; i++) { + originalToCurrent[i] = fineToCoarse[originalToCurrent[i]!]!; + } +} + /** * Evaluate all candidate communities for a node and return the best move. 
* Encapsulates the four candidate-selection strategies (All, RandomAny, diff --git a/src/graph/classifiers/roles.ts b/src/graph/classifiers/roles.ts index 59fd3dcfa..834e3d5b8 100644 --- a/src/graph/classifiers/roles.ts +++ b/src/graph/classifiers/roles.ts @@ -78,6 +78,68 @@ export interface RoleClassificationNode { hasActiveFileSiblings?: boolean; } +/** + * Compute median fan-in and fan-out across nodes with non-zero values. + * Used as thresholds for "high" fan-in/out classification. + */ +function computeFanMedians(nodes: RoleClassificationNode[]): { fanIn: number; fanOut: number } { + const nonZeroFanIn = nodes + .filter((n) => n.fanIn > 0) + .map((n) => n.fanIn) + .sort((a, b) => a - b); + const nonZeroFanOut = nodes + .filter((n) => n.fanOut > 0) + .map((n) => n.fanOut) + .sort((a, b) => a - b); + return { fanIn: median(nonZeroFanIn), fanOut: median(nonZeroFanOut) }; +} + +/** + * Classify a node with `fanIn === 0` that is not exported. + * Covers framework-active constants, test-only callables, and the dead-* family. + */ +function classifyUnreferencedNode(node: RoleClassificationNode): Role { + if (node.kind === 'constant' && node.hasActiveFileSiblings) { + // Constants consumed via identifier reference (not calls) have no + // inbound call edges. If the same file has active callables, the + // constant is almost certainly used locally — classify as leaf. + return 'leaf'; + } + if (node.testOnlyFanIn != null && node.testOnlyFanIn > 0) return 'test-only'; + return classifyDeadSubRole(node); +} + +/** + * Pick a role from fan-in/fan-out shape: core/utility/adapter/leaf. + * Called after entry/test-only/dead cases have been ruled out. + */ +function classifyByFanShape(highIn: boolean, highOut: boolean): Role { + if (highIn && !highOut) return 'core'; + if (highIn && highOut) return 'utility'; + if (!highIn && highOut) return 'adapter'; + return 'leaf'; +} + +/** + * Apply role-classification rules to a single node. 
+ * Order matters — framework entries are tagged first, then dead/test cases, + * then the fan-in/fan-out shape decides among the structural roles. + */ +function classifyNodeRole(node: RoleClassificationNode, medFanIn: number, medFanOut: number): Role { + if (FRAMEWORK_ENTRY_PREFIXES.some((p) => node.name.startsWith(p))) return 'entry'; + + if (node.fanIn === 0) { + return node.isExported ? 'entry' : classifyUnreferencedNode(node); + } + + const hasProdFanIn = typeof node.productionFanIn === 'number'; + if (hasProdFanIn && node.productionFanIn === 0 && !node.isExported) return 'test-only'; + + const highIn = node.fanIn >= medFanIn; + const highOut = node.fanOut >= medFanOut && node.fanOut > 0; + return classifyByFanShape(highIn, highOut); +} + /** * Classify nodes into architectural roles based on fan-in/fan-out metrics. */ @@ -87,63 +149,11 @@ export function classifyRoles( ): Map { if (nodes.length === 0) return new Map(); - let medFanIn: number; - let medFanOut: number; - if (medianOverrides) { - medFanIn = medianOverrides.fanIn; - medFanOut = medianOverrides.fanOut; - } else { - const nonZeroFanIn = nodes - .filter((n) => n.fanIn > 0) - .map((n) => n.fanIn) - .sort((a, b) => a - b); - const nonZeroFanOut = nodes - .filter((n) => n.fanOut > 0) - .map((n) => n.fanOut) - .sort((a, b) => a - b); - medFanIn = median(nonZeroFanIn); - medFanOut = median(nonZeroFanOut); - } + const { fanIn: medFanIn, fanOut: medFanOut } = medianOverrides ?? 
computeFanMedians(nodes); const result = new Map(); - for (const node of nodes) { - const highIn = node.fanIn >= medFanIn && node.fanIn > 0; - const highOut = node.fanOut >= medFanOut && node.fanOut > 0; - const hasProdFanIn = typeof node.productionFanIn === 'number'; - - let role: Role; - const isFrameworkEntry = FRAMEWORK_ENTRY_PREFIXES.some((p) => node.name.startsWith(p)); - if (isFrameworkEntry) { - role = 'entry'; - } else if (node.fanIn === 0 && !node.isExported) { - if (node.kind === 'constant' && node.hasActiveFileSiblings) { - // Constants consumed via identifier reference (not calls) have no - // inbound call edges. If the same file has active callables, the - // constant is almost certainly used locally — classify as leaf. - role = 'leaf'; - } else { - role = - node.testOnlyFanIn != null && node.testOnlyFanIn > 0 - ? 'test-only' - : classifyDeadSubRole(node); - } - } else if (node.fanIn === 0 && node.isExported) { - role = 'entry'; - } else if (hasProdFanIn && node.fanIn > 0 && node.productionFanIn === 0 && !node.isExported) { - role = 'test-only'; - } else if (highIn && !highOut) { - role = 'core'; - } else if (highIn && highOut) { - role = 'utility'; - } else if (!highIn && highOut) { - role = 'adapter'; - } else { - role = 'leaf'; - } - - result.set(node.id, role); + result.set(node.id, classifyNodeRole(node, medFanIn, medFanOut)); } - return result; } From 900af101b0a4f02a4ca951f39aa0f1313991039d Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 14:03:11 -0600 Subject: [PATCH 21/27] refactor(presentation): extract shared rendering helpers in cfg and flow --- src/presentation/cfg.ts | 80 ++++++++++++--------- src/presentation/flow.ts | 146 +++++++++++++++++++++++++-------------- 2 files changed, 143 insertions(+), 83 deletions(-) diff --git a/src/presentation/cfg.ts b/src/presentation/cfg.ts index 7970a413d..5e9762308 100644 --- a/src/presentation/cfg.ts +++ b/src/presentation/cfg.ts @@ -1,6 +1,8 @@ import { cfgData, cfgToDOT, 
cfgToMermaid } from '../features/cfg.js'; import { outputResult } from '../infrastructure/result-formatter.js'; +type CfgData = ReturnType; + interface CfgCliOpts { json?: boolean; ndjson?: boolean; @@ -36,13 +38,56 @@ interface CfgResultEntry { edges: CfgEdge[]; } +function renderBlockLocation(b: CfgBlock): string { + if (!b.startLine) return ''; + const endSuffix = b.endLine && b.endLine !== b.startLine ? `-${b.endLine}` : ''; + return ` L${b.startLine}${endSuffix}`; +} + +function printCfgBlocks(blocks: CfgBlock[]): void { + if (blocks.length === 0) return; + console.log('\n Blocks:'); + for (const b of blocks) { + const label = b.label ? ` (${b.label})` : ''; + console.log(` [${b.index}] ${b.type}${label}${renderBlockLocation(b)}`); + } +} + +function printCfgEdges(edges: CfgEdge[]): void { + if (edges.length === 0) return; + console.log('\n Edges:'); + for (const e of edges) { + console.log(` B${e.source} → B${e.target} [${e.kind}]`); + } +} + +function printCfgEntry(r: CfgResultEntry): void { + console.log(`\n${r.kind} ${r.name} (${r.file}:${r.line})`); + console.log('─'.repeat(60)); + console.log(` Blocks: ${r.summary.blockCount} Edges: ${r.summary.edgeCount}`); + printCfgBlocks(r.blocks); + printCfgEdges(r.edges); +} + +function tryRenderGraphFormat(format: string, data: CfgData): boolean { + if (format === 'dot') { + console.log(cfgToDOT(data)); + return true; + } + if (format === 'mermaid') { + console.log(cfgToMermaid(data)); + return true; + } + return false; +} + export function cfg(name: string, customDbPath: string | undefined, opts: CfgCliOpts = {}): void { const data = cfgData(name, customDbPath, opts); if (outputResult(data, 'results', opts)) return; if (data.warning) { - console.log(`\u26A0 ${data.warning}`); + console.log(`⚠ ${data.warning}`); return; } if (data.results.length === 0) { @@ -50,38 +95,9 @@ export function cfg(name: string, customDbPath: string | undefined, opts: CfgCli return; } - const format = opts.format || 'text'; - if (format 
=== 'dot') { - console.log(cfgToDOT(data)); - return; - } - if (format === 'mermaid') { - console.log(cfgToMermaid(data)); - return; - } + if (tryRenderGraphFormat(opts.format || 'text', data)) return; - // Text format for (const r of data.results as CfgResultEntry[]) { - console.log(`\n${r.kind} ${r.name} (${r.file}:${r.line})`); - console.log('\u2500'.repeat(60)); - console.log(` Blocks: ${r.summary.blockCount} Edges: ${r.summary.edgeCount}`); - - if (r.blocks.length > 0) { - console.log('\n Blocks:'); - for (const b of r.blocks) { - const loc = b.startLine - ? ` L${b.startLine}${b.endLine && b.endLine !== b.startLine ? `-${b.endLine}` : ''}` - : ''; - const label = b.label ? ` (${b.label})` : ''; - console.log(` [${b.index}] ${b.type}${label}${loc}`); - } - } - - if (r.edges.length > 0) { - console.log('\n Edges:'); - for (const e of r.edges) { - console.log(` B${e.source} \u2192 B${e.target} [${e.kind}]`); - } - } + printCfgEntry(r); } } diff --git a/src/presentation/flow.ts b/src/presentation/flow.ts index 77cd2c512..e72210a23 100644 --- a/src/presentation/flow.ts +++ b/src/presentation/flow.ts @@ -16,54 +16,65 @@ interface FlowOpts { csv?: boolean; } -export function flow( - name: string | undefined, - dbPath: string | undefined, - opts: FlowOpts = {}, -): void { - if (opts.list) { - const data = listEntryPointsData(dbPath, { - noTests: opts.noTests, - limit: opts.limit, - offset: opts.offset, - }) as any; - if (outputResult(data, 'entries', opts)) return; - if (data.count === 0) { - console.log('No entry points found. 
Run "codegraph build" first.'); - return; - } - console.log(`\nEntry points (${data.count} total):\n`); - for (const [type, entries] of Object.entries( - data.byType as Record< - string, - Array<{ kind: string; name: string; file: string; line: number }> - >, - )) { - console.log(` ${type} (${entries.length}):`); - for (const e of entries) { - console.log(` [${kindIcon(e.kind)}] ${e.name} ${e.file}:${e.line}`); - } - console.log(); - } - return; - } +interface EntryPoint { + kind: string; + name: string; + file: string; + line: number; +} - if (!name) { - console.log( - 'Please provide a function or entry-point name. Use --list to see available entry points.', - ); - return; - } +interface FlowNode { + kind: string; + name: string; + file: string; + line: number; +} - const data = flowData(name, dbPath, opts) as any; - if (outputResult(data, 'steps', opts)) return; +interface FlowStep { + depth: number; + nodes: FlowNode[]; +} - if (!data.entry) { - console.log(`No matching entry point or function found for "${name}".`); +interface FlowCycle { + from: string; + to: string; + depth: number; +} + +interface FlowResult { + entry?: { kind: string; name: string; type: string; file: string; line: number }; + depth: number; + totalReached: number; + leaves: Array<{ name: string; file: string }>; + steps: FlowStep[]; + cycles: FlowCycle[]; + truncated?: boolean; +} + +function runListEntryPoints(dbPath: string | undefined, opts: FlowOpts): void { + const data = listEntryPointsData(dbPath, { + noTests: opts.noTests, + limit: opts.limit, + offset: opts.offset, + }) as { count: number; byType: Record }; + if (outputResult(data, 'entries', opts)) return; + if (data.count === 0) { + console.log('No entry points found. 
Run "codegraph build" first.'); return; } + console.log(`\nEntry points (${data.count} total):\n`); + for (const [type, entries] of Object.entries(data.byType)) { + console.log(` ${type} (${entries.length}):`); + for (const e of entries) { + console.log(` [${kindIcon(e.kind)}] ${e.name} ${e.file}:${e.line}`); + } + console.log(); + } +} +function printFlowHeader(data: FlowResult): void { const e = data.entry; + if (!e) return; const typeTag = e.type !== 'exported' ? ` (${e.type})` : ''; console.log(`\nFlow from: [${kindIcon(e.kind)}] ${e.name}${typeTag} ${e.file}:${e.line}`); console.log( @@ -73,27 +84,60 @@ export function flow( console.log(` (truncated at depth ${data.depth})`); } console.log(); +} +function isLeafNode(n: FlowNode, leaves: Array<{ name: string; file: string }>): boolean { + return leaves.some((l) => l.name === n.name && l.file === n.file); +} + +function printFlowSteps(data: FlowResult): void { if (data.steps.length === 0) { console.log(' (leaf node — no callees)'); return; } - for (const step of data.steps) { console.log(` depth ${step.depth}:`); for (const n of step.nodes) { - const isLeaf = data.leaves.some( - (l: { name: string; file: string }) => l.name === n.name && l.file === n.file, - ); - const leafTag = isLeaf ? ' [leaf]' : ''; + const leafTag = isLeafNode(n, data.leaves) ? 
' [leaf]' : ''; console.log(` [${kindIcon(n.kind)}] ${n.name} ${n.file}:${n.line}${leafTag}`); } } +} + +function printFlowCycles(cycles: FlowCycle[]): void { + if (cycles.length === 0) return; + console.log('\n Cycles detected:'); + for (const c of cycles) { + console.log(` ${c.from} -> ${c.to} (at depth ${c.depth})`); + } +} - if (data.cycles.length > 0) { - console.log('\n Cycles detected:'); - for (const c of data.cycles) { - console.log(` ${c.from} -> ${c.to} (at depth ${c.depth})`); - } +export function flow( + name: string | undefined, + dbPath: string | undefined, + opts: FlowOpts = {}, +): void { + if (opts.list) { + runListEntryPoints(dbPath, opts); + return; } + + if (!name) { + console.log( + 'Please provide a function or entry-point name. Use --list to see available entry points.', + ); + return; + } + + const data = flowData(name, dbPath, opts) as unknown as FlowResult; + if (outputResult(data, 'steps', opts)) return; + + if (!data.entry) { + console.log(`No matching entry point or function found for "${name}".`); + return; + } + + printFlowHeader(data); + printFlowSteps(data); + printFlowCycles(data.cycles); } From 88bb7119a7517d8d4d8b4b422520050e2774bab1 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 14:09:03 -0600 Subject: [PATCH 22/27] refactor(scripts): separate config from execution in benchmarking scripts --- scripts/lib/bench-config.ts | 257 +++++++++++++++++-------------- scripts/token-benchmark.ts | 291 +++++++++++++++--------------------- 2 files changed, 263 insertions(+), 285 deletions(-) diff --git a/scripts/lib/bench-config.ts b/scripts/lib/bench-config.ts index 44000d0a7..bc58473af 100644 --- a/scripts/lib/bench-config.ts +++ b/scripts/lib/bench-config.ts @@ -125,153 +125,179 @@ export function parseArgs() { return { version, npm, dist }; } -/** - * Resolve where to import codegraph source from. 
- * - * @returns {{ version: string, srcDir: string, cleanup: () => void }} - * - version: "dev" (local) or the semver string (npm) - * - srcDir: absolute path to the codegraph src/ directory to import from - * - cleanup: call when done — removes the temp dir in npm mode, no-op otherwise - */ -export async function resolveBenchmarkSource() { - const { version: cliVersion, npm, dist } = parseArgs(); - - if (dist && npm) { - console.error('Warning: --dist is ignored in --npm mode (the installed package already uses dist/ automatically).'); - } +/** Resolve repo root from this module's URL (handles Windows drive prefix). */ +function repoRoot(): string { + return path.resolve( + path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, '$1')), + '..', + '..', + ); +} - if (!npm) { - // Local mode — use repo src/ (or dist/ when --dist), version from git state - const root = path.resolve(path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, '$1')), '..', '..'); - const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8')); - let srcDir = path.join(root, 'src'); - if (dist) { - const distDir = path.join(root, 'dist'); - if (!fs.existsSync(distDir)) { - throw new Error(`--dist requested but ${distDir} does not exist. Run "npm run build" first.`); - } - srcDir = distDir; +/** Local-mode resolution: use repo src/ (or dist/ when --dist). */ +function resolveBenchmarkSourceLocal(cliVersion: string | null, dist: boolean) { + const root = repoRoot(); + const pkg = JSON.parse(fs.readFileSync(path.join(root, 'package.json'), 'utf8')); + let srcDir = path.join(root, 'src'); + if (dist) { + const distDir = path.join(root, 'dist'); + if (!fs.existsSync(distDir)) { + throw new Error(`--dist requested but ${distDir} does not exist. 
Run "npm run build" first.`); } - return { - version: cliVersion || getBenchmarkVersion(pkg.version, root), - srcDir, - cleanup() {}, - }; + srcDir = distDir; } + return { + version: cliVersion || getBenchmarkVersion(pkg.version, root), + srcDir, + cleanup() {}, + }; +} - // npm mode — install @optave/codegraph@ into a temp dir. - // Validate the version up-front so we never log or interpolate an - // unvalidated string (with `shell: true` on Windows, bad input would be a - // shell-injection surface). - const safeVersion = assertSafePkgVersion(cliVersion || 'latest'); - const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-bench-')); - - console.error(`Installing @optave/codegraph@${safeVersion} into ${tmpDir}...`); - - // Write a minimal package.json so npm install works - fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ private: true })); - - // Retry with backoff for npm propagation delays - const maxRetries = 5; +/** + * Run `npm install ` in `cwd` with exponential-backoff retries. + * `label` is used only for diagnostic logging. 
+ */ +async function npmInstallWithRetries( + spec: string, + cwd: string, + maxRetries: number, + label: string, + extraFlags: readonly string[] = [], +): Promise { for (let attempt = 1; attempt <= maxRetries; attempt++) { try { - execFileSync('npm', ['install', `@optave/codegraph@${safeVersion}`, '--no-audit', '--no-fund'], { - cwd: tmpDir, + execFileSync('npm', ['install', spec, '--no-audit', '--no-fund', ...extraFlags], { + cwd, stdio: 'pipe', timeout: 120_000, shell: NPM_SHELL, }); - break; + return; } catch (err) { - if (attempt === maxRetries) { - // Clean up before throwing - fs.rmSync(tmpDir, { recursive: true, force: true }); - throw new Error(`Failed to install @optave/codegraph@${safeVersion} after ${maxRetries} attempts: ${err.message}`); - } + if (attempt === maxRetries) throw err; const delay = attempt * 15_000; // 15s, 30s, 45s, 60s - console.error(` Attempt ${attempt} failed, retrying in ${delay / 1000}s...`); + console.error(` ${label} attempt ${attempt} failed, retrying in ${delay / 1000}s...`); await new Promise((resolve) => setTimeout(resolve, delay)); } } +} - const pkgDir = path.join(tmpDir, 'node_modules', '@optave', 'codegraph'); +/** Install @optave/codegraph@ into a fresh tmp dir; returns paths. 
*/ +async function installCodegraphPackage(cliVersion: string | null): Promise<{ tmpDir: string; pkgDir: string; installedPkg: any; safeVersion: string }> { + const safeVersion = assertSafePkgVersion(cliVersion || 'latest'); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-bench-')); + + console.error(`Installing @optave/codegraph@${safeVersion} into ${tmpDir}...`); + fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ private: true })); + try { + await npmInstallWithRetries(`@optave/codegraph@${safeVersion}`, tmpDir, 5, 'Attempt'); + } catch (err) { + fs.rmSync(tmpDir, { recursive: true, force: true }); + throw new Error(`Failed to install @optave/codegraph@${safeVersion} after 5 attempts: ${(err as Error).message}`); + } + + const pkgDir = path.join(tmpDir, 'node_modules', '@optave', 'codegraph'); const installedPkg = JSON.parse(fs.readFileSync(path.join(pkgDir, 'package.json'), 'utf8')); + return { tmpDir, pkgDir, installedPkg, safeVersion }; +} - // npm does not transitively install optionalDependencies of a dependency, - // so the platform-specific native binary is missing. Install it explicitly. +/** Detect platform-specific native package key (e.g. `codegraph-linux-x64-gnu`). */ +function detectNativePlatformKey(): string { + const platform = os.platform(); + const arch = os.arch(); + let libcSuffix = ''; + if (platform === 'linux') { + try { + const files = fs.readdirSync('/lib'); + libcSuffix = files.some((f) => f.startsWith('ld-musl-') && f.endsWith('.so.1')) ? '-musl' : '-gnu'; + } catch { + libcSuffix = '-gnu'; + } + } + return `codegraph-${platform}-${arch}${libcSuffix}`; +} + +/** + * npm does not transitively install optionalDependencies of a dependency, + * so the platform-specific native binary is missing. Install it explicitly. + * Failures are logged and swallowed — benchmark can still run on WASM. 
+ */ +async function installNativePackage(tmpDir: string, installedPkg: any): Promise { try { const optDeps = installedPkg.optionalDependencies || {}; - const platform = os.platform(); - const arch = os.arch(); - let libcSuffix = ''; - if (platform === 'linux') { - try { - const files = fs.readdirSync('/lib'); - libcSuffix = files.some((f) => f.startsWith('ld-musl-') && f.endsWith('.so.1')) ? '-musl' : '-gnu'; - } catch { - libcSuffix = '-gnu'; - } - } - const platformKey = `codegraph-${platform}-${arch}${libcSuffix}`; + const platformKey = detectNativePlatformKey(); const nativePkg = Object.keys(optDeps).find((name) => name.includes(platformKey)); - if (nativePkg) { - // Even though these originate from the installed package's - // optionalDependencies (i.e. the npm registry), validate before - // logging or interpolating into a `shell: true` command line. - const safeNativePkg = assertSafePkgName(nativePkg); - const safeNativeVersion = assertSafePkgVersion(optDeps[nativePkg]); - console.error(`Installing native package ${safeNativePkg}@${safeNativeVersion}...`); - for (let attempt = 1; attempt <= maxRetries; attempt++) { - try { - execFileSync('npm', ['install', `${safeNativePkg}@${safeNativeVersion}`, '--no-audit', '--no-fund', '--no-save'], { - cwd: tmpDir, - stdio: 'pipe', - timeout: 120_000, - shell: NPM_SHELL, - }); - break; - } catch (innerErr) { - if (attempt === maxRetries) throw innerErr; - const delay = attempt * 15_000; - console.error(` Native install attempt ${attempt} failed, retrying in ${delay / 1000}s...`); - await new Promise((resolve) => setTimeout(resolve, delay)); - } - } - console.error(`Installed ${safeNativePkg}@${safeNativeVersion}`); - } else { - console.error(`No native package found for platform ${platform}-${arch}${libcSuffix}, skipping`); + if (!nativePkg) { + console.error(`No native package found for platform ${platformKey}, skipping`); + return; } + // Even though these originate from the installed package's + // 
optionalDependencies (i.e. the npm registry), validate before + // logging or interpolating into a `shell: true` command line. + const safeNativePkg = assertSafePkgName(nativePkg); + const safeNativeVersion = assertSafePkgVersion(optDeps[nativePkg]); + console.error(`Installing native package ${safeNativePkg}@${safeNativeVersion}...`); + await npmInstallWithRetries( + `${safeNativePkg}@${safeNativeVersion}`, + tmpDir, + 5, + 'Native install', + ['--no-save'], + ); + console.error(`Installed ${safeNativePkg}@${safeNativeVersion}`); } catch (err) { - console.error(`Warning: failed to install native package: ${err.message}`); + console.error(`Warning: failed to install native package: ${(err as Error).message}`); } +} - // @huggingface/transformers is a devDependency (lazy-loaded for embeddings). - // It is not installed as a transitive dep in npm mode, so install it - // explicitly so the embedding benchmark workers can import it. +/** + * @huggingface/transformers is a devDependency (lazy-loaded for embeddings). + * Not installed as a transitive dep in npm mode, so install it explicitly so + * the embedding benchmark workers can import it. Failures are logged + swallowed. 
+ */ +async function installTransformers(tmpDir: string): Promise { try { const localPkg = JSON.parse( - fs.readFileSync(path.resolve(path.dirname(new URL(import.meta.url).pathname.replace(/^\/([A-Z]:)/, '$1')), '..', '..', 'package.json'), 'utf8'), + fs.readFileSync(path.join(repoRoot(), 'package.json'), 'utf8'), ); const hfVersion = localPkg.devDependencies?.['@huggingface/transformers']; - if (hfVersion) { - const safeHfVersion = assertSafePkgVersion(hfVersion); - console.error(`Installing @huggingface/transformers@${safeHfVersion} for embedding benchmarks...`); - execFileSync( - 'npm', - ['install', `@huggingface/transformers@${safeHfVersion}`, '--no-audit', '--no-fund', '--no-save'], - { - cwd: tmpDir, - stdio: 'pipe', - timeout: 120_000, - shell: NPM_SHELL, - }, - ); - console.error('Installed @huggingface/transformers'); - } + if (!hfVersion) return; + const safeHfVersion = assertSafePkgVersion(hfVersion); + console.error(`Installing @huggingface/transformers@${safeHfVersion} for embedding benchmarks...`); + execFileSync( + 'npm', + ['install', `@huggingface/transformers@${safeHfVersion}`, '--no-audit', '--no-fund', '--no-save'], + { cwd: tmpDir, stdio: 'pipe', timeout: 120_000, shell: NPM_SHELL }, + ); + console.error('Installed @huggingface/transformers'); } catch (err) { - console.error(`Warning: failed to install @huggingface/transformers: ${err.message}`); + console.error(`Warning: failed to install @huggingface/transformers: ${(err as Error).message}`); } +} + +/** + * Resolve where to import codegraph source from. 
+ * + * @returns {{ version: string, srcDir: string, cleanup: () => void }} + * - version: "dev" (local) or the semver string (npm) + * - srcDir: absolute path to the codegraph src/ directory to import from + * - cleanup: call when done — removes the temp dir in npm mode, no-op otherwise + */ +export async function resolveBenchmarkSource() { + const { version: cliVersion, npm, dist } = parseArgs(); + + if (dist && npm) { + console.error('Warning: --dist is ignored in --npm mode (the installed package already uses dist/ automatically).'); + } + + if (!npm) { + return resolveBenchmarkSourceLocal(cliVersion, dist); + } + + const { tmpDir, pkgDir, installedPkg } = await installCodegraphPackage(cliVersion); + await installNativePackage(tmpDir, installedPkg); + await installTransformers(tmpDir); // v3.4.0+ publishes compiled JS in dist/ alongside raw TS in src/. // Node cannot strip types from node_modules, so prefer dist/ when available. @@ -284,7 +310,6 @@ export async function resolveBenchmarkSource() { } const resolvedVersion = cliVersion || installedPkg.version; - console.error(`Installed @optave/codegraph@${installedPkg.version}`); return { diff --git a/scripts/token-benchmark.ts b/scripts/token-benchmark.ts index 02e053bc4..1aa4d3951 100644 --- a/scripts/token-benchmark.ts +++ b/scripts/token-benchmark.ts @@ -405,6 +405,126 @@ async function runPerfBenchmarks(nextjsDir) { }; } +// ── Issue experiment ────────────────────────────────────────────────────── + +/** Run RUNS sessions for one mode, logging per-run metrics. */ +async function runSessionsForMode(mode, issue, nextjsDir) { + const runs = []; + const label = mode === 'baseline' ? 
'Baseline' : 'Codegraph'; + for (let r = 0; r < RUNS; r++) { + console.error(` ${label} run ${r + 1}/${RUNS}...`); + try { + const metrics = await runSession(mode, issue, nextjsDir); + runs.push(metrics); + console.error( + ` ${metrics.inputTokens} input tokens, $${metrics.totalCostUsd}, ` + + `${metrics.numTurns} turns, hit rate: ${metrics.hitRate}%`, + ); + } catch (err) { + console.error(` ERROR: ${err.message}`); + runs.push({ error: err.message }); + } + } + return runs; +} + +/** Compute median metrics for a run-set (or null when no valid runs). */ +function medianForRuns(runs) { + const valid = runs.filter((r) => !r.error); + if (valid.length === 0) return null; + const medianOf = (key) => median(valid.map((r) => r[key])); + return { + inputTokens: medianOf('inputTokens'), + outputTokens: medianOf('outputTokens'), + cacheReadInputTokens: medianOf('cacheReadInputTokens'), + totalCostUsd: round2(medianOf('totalCostUsd')), + numTurns: medianOf('numTurns'), + durationMs: medianOf('durationMs'), + uniqueFilesRead: medianOf('uniqueFilesRead'), + hitRate: medianOf('hitRate'), + }; +} + +/** Token + cost savings (% reduction) between two median objects. */ +function computeSavings(baselineMedian, codegraphMedian) { + if (!baselineMedian || !codegraphMedian || baselineMedian.inputTokens <= 0) return null; + const tokenSavings = + ((baselineMedian.inputTokens - codegraphMedian.inputTokens) / + baselineMedian.inputTokens) * + 100; + const costSavings = + baselineMedian.totalCostUsd > 0 + ? ((baselineMedian.totalCostUsd - codegraphMedian.totalCostUsd) / + baselineMedian.totalCostUsd) * + 100 + : 0; + return { + inputTokensPct: Math.round(tokenSavings), + costPct: Math.round(costSavings), + }; +} + +/** Run baseline + codegraph experiments for a single issue and aggregate. 
*/ +async function runIssueExperiment(issue, nextjsDir) { + console.error(`\n── ${issue.id} (${issue.difficulty}) ──`); + console.error(`PR #${issue.pr}: ${issue.title}`); + + checkoutCommit(nextjsDir, issue.commitBefore); + if (!SKIP_GRAPH) { + await buildCodegraph(nextjsDir); + } + + const baselineRuns = await runSessionsForMode('baseline', issue, nextjsDir); + const codegraphRuns = await runSessionsForMode('codegraph', issue, nextjsDir); + + const baselineMedian = medianForRuns(baselineRuns); + const codegraphMedian = medianForRuns(codegraphRuns); + const savings = computeSavings(baselineMedian, codegraphMedian); + + if (savings) { + console.error( + ` Savings: ${savings.inputTokensPct}% tokens, ${savings.costPct}% cost`, + ); + } + + return { + id: issue.id, + difficulty: issue.difficulty, + pr: issue.pr, + baseline: { median: baselineMedian, runs: baselineRuns }, + codegraph: { median: codegraphMedian, runs: codegraphRuns }, + savings, + }; +} + +/** Aggregate per-issue results into corpus-wide token/cost savings + hit rates. */ +function computeAggregate(results) { + const validResults = results.filter( + (r) => r.baseline.median && r.codegraph.median && r.savings, + ); + if (validResults.length === 0) return null; + + const sum = (sel) => validResults.reduce((s, r) => s + sel(r), 0); + const totalBaselineTokens = sum((r) => r.baseline.median.inputTokens); + const totalCodegraphTokens = sum((r) => r.codegraph.median.inputTokens); + const totalBaselineCost = sum((r) => r.baseline.median.totalCostUsd); + const totalCodegraphCost = sum((r) => r.codegraph.median.totalCostUsd); + const pct = (a, b) => (a > 0 ? 
Math.round(((a - b) / a) * 100) : 0); + + return { + savings: { + inputTokensPct: pct(totalBaselineTokens, totalCodegraphTokens), + costPct: pct(totalBaselineCost, totalCodegraphCost), + }, + baselineAvgHitRate: Math.round( + sum((r) => r.baseline.median.hitRate) / validResults.length, + ), + codegraphAvgHitRate: Math.round( + sum((r) => r.codegraph.median.hitRate) / validResults.length, + ), + }; +} + // ── Main ────────────────────────────────────────────────────────────────── async function main() { @@ -422,179 +542,14 @@ async function main() { console.error(` Next.js dir: ${nextjsDir}`); console.error(''); - // Clone / fetch Next.js await ensureNextjsClone(nextjsDir); const results = []; - for (const issue of selectedIssues) { - console.error(`\n── ${issue.id} (${issue.difficulty}) ──`); - console.error(`PR #${issue.pr}: ${issue.title}`); - - // Checkout the commit before the fix - checkoutCommit(nextjsDir, issue.commitBefore); - - // Build codegraph (unless skipped) - if (!SKIP_GRAPH) { - await buildCodegraph(nextjsDir); - } - - const baselineRuns = []; - const codegraphRuns = []; - - // Run baseline sessions - for (let r = 0; r < RUNS; r++) { - console.error(` Baseline run ${r + 1}/${RUNS}...`); - try { - const metrics = await runSession('baseline', issue, nextjsDir); - baselineRuns.push(metrics); - console.error( - ` ${metrics.inputTokens} input tokens, $${metrics.totalCostUsd}, ` + - `${metrics.numTurns} turns, hit rate: ${metrics.hitRate}%`, - ); - } catch (err) { - console.error(` ERROR: ${err.message}`); - baselineRuns.push({ error: err.message }); - } - } - - // Run codegraph sessions - for (let r = 0; r < RUNS; r++) { - console.error(` Codegraph run ${r + 1}/${RUNS}...`); - try { - const metrics = await runSession('codegraph', issue, nextjsDir); - codegraphRuns.push(metrics); - console.error( - ` ${metrics.inputTokens} input tokens, $${metrics.totalCostUsd}, ` + - `${metrics.numTurns} turns, hit rate: ${metrics.hitRate}%`, - ); - } catch (err) { - 
console.error(` ERROR: ${err.message}`); - codegraphRuns.push({ error: err.message }); - } - } - - // Compute medians (filter out errored runs) - const validBaseline = baselineRuns.filter((r) => !r.error); - const validCodegraph = codegraphRuns.filter((r) => !r.error); - - const medianOf = (runs, key) => median(runs.map((r) => r[key])); - - const baselineMedian = - validBaseline.length > 0 - ? { - inputTokens: medianOf(validBaseline, 'inputTokens'), - outputTokens: medianOf(validBaseline, 'outputTokens'), - cacheReadInputTokens: medianOf(validBaseline, 'cacheReadInputTokens'), - totalCostUsd: round2(medianOf(validBaseline, 'totalCostUsd')), - numTurns: medianOf(validBaseline, 'numTurns'), - durationMs: medianOf(validBaseline, 'durationMs'), - uniqueFilesRead: medianOf(validBaseline, 'uniqueFilesRead'), - hitRate: medianOf(validBaseline, 'hitRate'), - } - : null; - - const codegraphMedian = - validCodegraph.length > 0 - ? { - inputTokens: medianOf(validCodegraph, 'inputTokens'), - outputTokens: medianOf(validCodegraph, 'outputTokens'), - cacheReadInputTokens: medianOf(validCodegraph, 'cacheReadInputTokens'), - totalCostUsd: round2(medianOf(validCodegraph, 'totalCostUsd')), - numTurns: medianOf(validCodegraph, 'numTurns'), - durationMs: medianOf(validCodegraph, 'durationMs'), - uniqueFilesRead: medianOf(validCodegraph, 'uniqueFilesRead'), - hitRate: medianOf(validCodegraph, 'hitRate'), - } - : null; - - // Compute savings - let savings = null; - if (baselineMedian && codegraphMedian && baselineMedian.inputTokens > 0) { - const tokenSavings = - ((baselineMedian.inputTokens - codegraphMedian.inputTokens) / - baselineMedian.inputTokens) * - 100; - const costSavings = - baselineMedian.totalCostUsd > 0 - ? 
((baselineMedian.totalCostUsd - codegraphMedian.totalCostUsd) / - baselineMedian.totalCostUsd) * - 100 - : 0; - savings = { - inputTokensPct: Math.round(tokenSavings), - costPct: Math.round(costSavings), - }; - } - - results.push({ - id: issue.id, - difficulty: issue.difficulty, - pr: issue.pr, - baseline: { median: baselineMedian, runs: baselineRuns }, - codegraph: { median: codegraphMedian, runs: codegraphRuns }, - savings, - }); - - if (savings) { - console.error( - ` Savings: ${savings.inputTokensPct}% tokens, ${savings.costPct}% cost`, - ); - } + results.push(await runIssueExperiment(issue, nextjsDir)); } - // ── Aggregate ─────────────────────────────────────────────────────── - - const validResults = results.filter( - (r) => r.baseline.median && r.codegraph.median && r.savings, - ); - - let aggregate = null; - if (validResults.length > 0) { - const totalBaselineTokens = validResults.reduce( - (s, r) => s + r.baseline.median.inputTokens, - 0, - ); - const totalCodegraphTokens = validResults.reduce( - (s, r) => s + r.codegraph.median.inputTokens, - 0, - ); - const totalBaselineCost = validResults.reduce( - (s, r) => s + r.baseline.median.totalCostUsd, - 0, - ); - const totalCodegraphCost = validResults.reduce( - (s, r) => s + r.codegraph.median.totalCostUsd, - 0, - ); - - aggregate = { - savings: { - inputTokensPct: - totalBaselineTokens > 0 - ? Math.round( - ((totalBaselineTokens - totalCodegraphTokens) / totalBaselineTokens) * 100, - ) - : 0, - costPct: - totalBaselineCost > 0 - ? 
Math.round( - ((totalBaselineCost - totalCodegraphCost) / totalBaselineCost) * 100, - ) - : 0, - }, - baselineAvgHitRate: Math.round( - validResults.reduce((s, r) => s + r.baseline.median.hitRate, 0) / - validResults.length, - ), - codegraphAvgHitRate: Math.round( - validResults.reduce((s, r) => s + r.codegraph.median.hitRate, 0) / - validResults.length, - ), - }; - } - - // ── Performance benchmarks (optional) ──────────────────────────────── + const aggregate = computeAggregate(results); let perfBenchmarks = null; if (RUN_PERF) { @@ -603,8 +558,6 @@ async function main() { perfBenchmarks = await runPerfBenchmarks(nextjsDir); } - // ── Output ────────────────────────────────────────────────────────── - // Restore console.log for JSON output console.log = origLog; From 02efaeb1de4f164e5368874fb228b6c4d642e0b5 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 14:15:24 -0600 Subject: [PATCH 23/27] refactor(features): reduce warning-level complexity in feature warnings batch --- src/features/boundaries.ts | 110 ++++++++++++++---------- src/features/check.ts | 87 +++++++++++-------- src/features/dataflow.ts | 91 ++++++++++++-------- src/features/flow.ts | 72 ++++++++++------ src/features/sequence.ts | 168 ++++++++++++++++++++++--------------- 5 files changed, 319 insertions(+), 209 deletions(-) diff --git a/src/features/boundaries.ts b/src/features/boundaries.ts index 05f7738f8..0857ffc60 100644 --- a/src/features/boundaries.ts +++ b/src/features/boundaries.ts @@ -235,30 +235,23 @@ interface EvaluateBoundariesOpts { noTests?: boolean; } -export function evaluateBoundaries( - db: BetterSqlite3Database, - boundaryConfig: BoundaryConfig | undefined, - opts: EvaluateBoundariesOpts = {}, -): { violations: BoundaryViolation[]; violationCount: number } { - if (!boundaryConfig) return { violations: [], violationCount: 0 }; - - const { valid, errors } = validateBoundaryConfig(boundaryConfig); - if (!valid) { - throw new BoundaryError(`Invalid boundary 
configuration: ${errors.join('; ')}`); - } - - const modules = resolveModules(boundaryConfig); - if (modules.size === 0) return { violations: [], violationCount: 0 }; - - let allRules: BoundaryRule[] = []; - if (boundaryConfig.preset) { - allRules = generatePresetRules(modules, boundaryConfig.preset); - } +function collectAllRules( + boundaryConfig: BoundaryConfig, + modules: Map, +): BoundaryRule[] { + const rules: BoundaryRule[] = boundaryConfig.preset + ? generatePresetRules(modules, boundaryConfig.preset) + : []; if (boundaryConfig.rules && Array.isArray(boundaryConfig.rules)) { - allRules = allRules.concat(boundaryConfig.rules); + return rules.concat(boundaryConfig.rules); } - if (allRules.length === 0) return { violations: [], violationCount: 0 }; + return rules; +} +function loadImportEdges( + db: BetterSqlite3Database, + opts: EvaluateBoundariesOpts, +): Array<{ source: string; target: string }> { let edges: Array<{ source: string; target: string }>; try { edges = db @@ -281,38 +274,63 @@ export function evaluateBoundaries( const scope = new Set(opts.scopeFiles); edges = edges.filter((e) => scope.has(e.source)); } + return edges; +} - const violations: BoundaryViolation[] = []; +function ruleViolated(rule: BoundaryRule, toModule: string): boolean { + if (rule.notTo?.includes(toModule)) return true; + if (rule.onlyTo && !rule.onlyTo.includes(toModule)) return true; + return false; +} - for (const edge of edges) { - const fromModule = classifyFile(edge.source, modules); - const toModule = classifyFile(edge.target, modules); +function emitEdgeViolations( + edge: { source: string; target: string }, + fromModule: string, + toModule: string, + allRules: BoundaryRule[], + violations: BoundaryViolation[], +): void { + for (const rule of allRules) { + if (rule.from !== fromModule) continue; + if (!ruleViolated(rule, toModule)) continue; + violations.push({ + rule: 'boundaries', + name: `${fromModule} -> ${toModule}`, + file: edge.source, + targetFile: edge.target, + 
message: rule.message || `${fromModule} must not depend on ${toModule}`, + value: 1, + threshold: 0, + }); + } +} - if (!fromModule || !toModule) continue; +export function evaluateBoundaries( + db: BetterSqlite3Database, + boundaryConfig: BoundaryConfig | undefined, + opts: EvaluateBoundariesOpts = {}, +): { violations: BoundaryViolation[]; violationCount: number } { + if (!boundaryConfig) return { violations: [], violationCount: 0 }; - for (const rule of allRules) { - if (rule.from !== fromModule) continue; + const { valid, errors } = validateBoundaryConfig(boundaryConfig); + if (!valid) { + throw new BoundaryError(`Invalid boundary configuration: ${errors.join('; ')}`); + } - let isViolation = false; + const modules = resolveModules(boundaryConfig); + if (modules.size === 0) return { violations: [], violationCount: 0 }; - if (rule.notTo?.includes(toModule)) { - isViolation = true; - } else if (rule.onlyTo && !rule.onlyTo.includes(toModule)) { - isViolation = true; - } + const allRules = collectAllRules(boundaryConfig, modules); + if (allRules.length === 0) return { violations: [], violationCount: 0 }; - if (isViolation) { - violations.push({ - rule: 'boundaries', - name: `${fromModule} -> ${toModule}`, - file: edge.source, - targetFile: edge.target, - message: rule.message || `${fromModule} must not depend on ${toModule}`, - value: 1, - threshold: 0, - }); - } - } + const edges = loadImportEdges(db, opts); + const violations: BoundaryViolation[] = []; + + for (const edge of edges) { + const fromModule = classifyFile(edge.source, modules); + const toModule = classifyFile(edge.target, modules); + if (!fromModule || !toModule) continue; + emitEdgeViolations(edge, fromModule, toModule, allRules, violations); } return { violations, violationCount: violations.length }; diff --git a/src/features/check.ts b/src/features/check.ts index 289022800..a9baf6634 100644 --- a/src/features/check.ts +++ b/src/features/check.ts @@ -22,6 +22,29 @@ interface ParsedDiff { newFiles: 
Set; } +const HUNK_RE = /^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/; +const NEW_FILE_RE = /^\+\+\+ b\/(.+)/; + +function pushHunkRanges( + line: string, + currentFile: string, + changedRanges: Map, + oldRanges: Map, +): void { + const hunkMatch = line.match(HUNK_RE); + if (!hunkMatch) return; + const oldStart = parseInt(hunkMatch[1]!, 10); + const oldCount = parseInt(hunkMatch[2] || '1', 10); + if (oldCount > 0) { + oldRanges.get(currentFile)!.push({ start: oldStart, end: oldStart + oldCount - 1 }); + } + const newStart = parseInt(hunkMatch[3]!, 10); + const newCount = parseInt(hunkMatch[4] || '1', 10); + if (newCount > 0) { + changedRanges.get(currentFile)!.push({ start: newStart, end: newStart + newCount - 1 }); + } +} + export function parseDiffOutput(diffOutput: string): ParsedDiff { const changedRanges = new Map(); const oldRanges = new Map(); @@ -38,7 +61,7 @@ export function parseDiffOutput(diffOutput: string): ParsedDiff { prevIsDevNull = false; continue; } - const fileMatch = line.match(/^\+\+\+ b\/(.+)/); + const fileMatch = line.match(NEW_FILE_RE); if (fileMatch) { currentFile = fileMatch[1]!; if (!changedRanges.has(currentFile)) changedRanges.set(currentFile, []); @@ -47,19 +70,7 @@ export function parseDiffOutput(diffOutput: string): ParsedDiff { prevIsDevNull = false; continue; } - const hunkMatch = line.match(/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? 
@@/); - if (hunkMatch && currentFile) { - const oldStart = parseInt(hunkMatch[1]!, 10); - const oldCount = parseInt(hunkMatch[2] || '1', 10); - if (oldCount > 0) { - oldRanges.get(currentFile)!.push({ start: oldStart, end: oldStart + oldCount - 1 }); - } - const newStart = parseInt(hunkMatch[3]!, 10); - const newCount = parseInt(hunkMatch[4] || '1', 10); - if (newCount > 0) { - changedRanges.get(currentFile)!.push({ start: newStart, end: newStart + newCount - 1 }); - } - } + if (currentFile) pushHunkRanges(line, currentFile, changedRanges, oldRanges); } return { changedRanges, oldRanges, newFiles }; } @@ -96,6 +107,26 @@ interface BlastRadiusResult { violations: BlastRadiusViolation[]; } +type DefRow = { + id: number; + name: string; + kind: string; + file: string; + line: number; + end_line: number | null; +}; + +function rangesOverlap(defLine: number, endLine: number, ranges: DiffRange[]): boolean { + for (const range of ranges) { + if (range.start <= endLine && range.end >= defLine) return true; + } + return false; +} + +function defEndLine(def: DefRow, nextDef: DefRow | undefined): number { + return def.end_line || (nextDef ? nextDef.line - 1 : 999999); +} + export function checkMaxBlastRadius( db: BetterSqlite3Database, changedRanges: Map, @@ -105,34 +136,18 @@ export function checkMaxBlastRadius( ): BlastRadiusResult { const violations: BlastRadiusViolation[] = []; let maxFound = 0; + const defsStmt = db.prepare( + `SELECT * FROM nodes WHERE file = ? AND kind IN ('function', 'method', 'class') ORDER BY line`, + ); for (const [file, ranges] of changedRanges) { if (noTests && isTestFile(file)) continue; - const defs = db - .prepare( - `SELECT * FROM nodes WHERE file = ? 
AND kind IN ('function', 'method', 'class') ORDER BY line`, - ) - .all(file) as Array<{ - id: number; - name: string; - kind: string; - file: string; - line: number; - end_line: number | null; - }>; + const defs = defsStmt.all(file) as DefRow[]; for (let i = 0; i < defs.length; i++) { const def = defs[i]!; - const nextDef = defs[i + 1]; - const endLine = def.end_line || (nextDef ? nextDef.line - 1 : 999999); - let overlaps = false; - for (const range of ranges) { - if (range.start <= endLine && range.end >= def.line) { - overlaps = true; - break; - } - } - if (!overlaps) continue; + const endLine = defEndLine(def, defs[i + 1]); + if (!rangesOverlap(def.line, endLine, ranges)) continue; const { totalDependents: totalCallers } = bfsTransitiveCallers(db, def.id, { noTests, diff --git a/src/features/dataflow.ts b/src/features/dataflow.ts index d85bcb668..804e7aa1e 100644 --- a/src/features/dataflow.ts +++ b/src/features/dataflow.ts @@ -675,6 +675,51 @@ interface BfsParentEntry { expression: string; } +type DataflowNeighbor = { + id: number; + file: string; + edge_kind: string; + expression: string; +}; + +interface DataflowBfsState { + visited: Set; + parent: Map; + nextQueue: number[]; + found: boolean; +} + +/** + * Process a single neighbor in the dataflow BFS. Returns true once the target + * has been reached so the caller can stop expanding. 
+ */ +function processDataflowNeighbor( + n: DataflowNeighbor, + currentId: number, + targetId: number, + noTests: boolean, + state: DataflowBfsState, +): boolean { + if (noTests && isTestFile(n.file)) return false; + const entry: BfsParentEntry = { + parentId: currentId, + edgeKind: n.edge_kind, + expression: n.expression, + }; + if (n.id === targetId) { + if (!state.found) { + state.found = true; + state.parent.set(n.id, entry); + } + return true; + } + if (state.visited.has(n.id)) return false; + state.visited.add(n.id); + state.parent.set(n.id, entry); + state.nextQueue.push(n.id); + return false; +} + /** BFS through dataflow edges to find a path from source to target. */ function bfsDataflowPath( db: BetterSqlite3Database, @@ -689,50 +734,28 @@ function bfsDataflowPath( WHERE d.source_id = ? AND d.kind IN ('flows_to', 'returns')`, ); - const visited = new Set([sourceId]); - const parent = new Map(); + const state: DataflowBfsState = { + visited: new Set([sourceId]), + parent: new Map(), + nextQueue: [], + found: false, + }; let queue = [sourceId]; - let found = false; for (let depth = 1; depth <= maxDepth; depth++) { - const nextQueue: number[] = []; + state.nextQueue = []; for (const currentId of queue) { - const neighbors = neighborStmt.all(currentId) as Array<{ - id: number; - file: string; - edge_kind: string; - expression: string; - }>; + const neighbors = neighborStmt.all(currentId) as DataflowNeighbor[]; for (const n of neighbors) { - if (noTests && isTestFile(n.file)) continue; - if (n.id === targetId) { - if (!found) { - found = true; - parent.set(n.id, { - parentId: currentId, - edgeKind: n.edge_kind, - expression: n.expression, - }); - } - continue; - } - if (!visited.has(n.id)) { - visited.add(n.id); - parent.set(n.id, { - parentId: currentId, - edgeKind: n.edge_kind, - expression: n.expression, - }); - nextQueue.push(n.id); - } + processDataflowNeighbor(n, currentId, targetId, noTests, state); } } - if (found) break; - queue = nextQueue; + if 
(state.found) break; + queue = state.nextQueue; if (queue.length === 0) break; } - return found ? parent : null; + return state.found ? state.parent : null; } /** Reconstruct a path from BFS parent map. */ diff --git a/src/features/flow.ts b/src/features/flow.ts index 18c522157..e2a4f1f21 100644 --- a/src/features/flow.ts +++ b/src/features/flow.ts @@ -133,6 +133,41 @@ interface BfsState { truncated: boolean; } +interface FlowBfsFrame { + visited: Set; + cycles: Array<{ from: string; to: string; depth: number }>; + nodeDepths: Map; + idToNode: Map; + nextFrontier: number[]; + levelNodes: NodeInfo[]; +} + +/** Process one callee row, recording cycle hits or expanding frontier. */ +function processFlowCallee( + c: CalleeRow, + fid: number, + depth: number, + noTests: boolean, + frame: FlowBfsFrame, +): void { + if (noTests && isTestFile(c.file)) return; + + if (frame.visited.has(c.id)) { + const fromNode = frame.idToNode.get(fid); + if (fromNode) { + frame.cycles.push({ from: fromNode.name, to: c.name, depth }); + } + return; + } + + frame.visited.add(c.id); + frame.nextFrontier.push(c.id); + const nodeInfo: NodeInfo = toSymbolRef(c); + frame.levelNodes.push(nodeInfo); + frame.nodeDepths.set(c.id, depth); + frame.idToNode.set(c.id, nodeInfo); +} + /** Forward BFS through callees, collecting steps, cycles, and node depth info. 
*/ function bfsCallees( db: ReturnType, @@ -157,37 +192,26 @@ function bfsCallees( ); for (let d = 1; d <= maxDepth; d++) { - const nextFrontier: number[] = []; - const levelNodes: NodeInfo[] = []; + const frame: FlowBfsFrame = { + visited, + cycles, + nodeDepths, + idToNode, + nextFrontier: [], + levelNodes: [], + }; for (const fid of frontier) { - const callees = calleesStmt.all(fid); - - for (const c of callees) { - if (noTests && isTestFile(c.file)) continue; - - if (visited.has(c.id)) { - const fromNode = idToNode.get(fid); - if (fromNode) { - cycles.push({ from: fromNode.name, to: c.name, depth: d }); - } - continue; - } - - visited.add(c.id); - nextFrontier.push(c.id); - const nodeInfo: NodeInfo = toSymbolRef(c); - levelNodes.push(nodeInfo); - nodeDepths.set(c.id, d); - idToNode.set(c.id, nodeInfo); + for (const c of calleesStmt.all(fid)) { + processFlowCallee(c, fid, d, noTests, frame); } } - if (levelNodes.length > 0) { - steps.push({ depth: d, nodes: levelNodes }); + if (frame.levelNodes.length > 0) { + steps.push({ depth: d, nodes: frame.levelNodes }); } - frontier = nextFrontier; + frontier = frame.nextFrontier; if (frontier.length === 0) break; if (d === maxDepth && frontier.length > 0) truncated = true; } diff --git a/src/features/sequence.ts b/src/features/sequence.ts index aa891d78b..db2db7fb2 100644 --- a/src/features/sequence.ts +++ b/src/features/sequence.ts @@ -91,6 +91,40 @@ interface BfsResult { truncated: boolean; } +type CalleeNode = { id: number; name: string; file: string; kind: string; line: number }; + +interface BfsFrame { + visited: Set; + messages: SequenceMessage[]; + fileSet: Set; + idToNode: Map; + nextFrontier: number[]; +} + +function processCallee( + c: CalleeNode, + caller: CalleeNode, + depth: number, + noTests: boolean, + frame: BfsFrame, +): void { + if (noTests && isTestFile(c.file)) return; + + frame.fileSet.add(c.file); + frame.messages.push({ + from: caller.file, + to: c.file, + label: c.name, + type: 'call', + depth, + 
}); + + if (frame.visited.has(c.id)) return; + frame.visited.add(c.id); + frame.nextFrontier.push(c.id); + frame.idToNode.set(c.id, c); +} + function bfsCallees( repo: Repository, matchNode: MatchNode, @@ -101,46 +135,25 @@ function bfsCallees( let frontier = [matchNode.id]; const messages: SequenceMessage[] = []; const fileSet = new Set([matchNode.file]); - const idToNode = new Map< - number, - { id: number; name: string; file: string; kind: string; line: number } - >(); + const idToNode = new Map(); idToNode.set(matchNode.id, matchNode); let truncated = false; for (let d = 1; d <= maxDepth; d++) { - const nextFrontier: number[] = []; + const frame: BfsFrame = { visited, messages, fileSet, idToNode, nextFrontier: [] }; for (const fid of frontier) { - const callees = repo.findCallees(fid); const caller = idToNode.get(fid)!; - - for (const c of callees) { - if (noTests && isTestFile(c.file)) continue; - - fileSet.add(c.file); - messages.push({ - from: caller.file, - to: c.file, - label: c.name, - type: 'call', - depth: d, - }); - - if (visited.has(c.id)) continue; - - visited.add(c.id); - nextFrontier.push(c.id); - idToNode.set(c.id, c); + for (const c of repo.findCallees(fid)) { + processCallee(c, caller, d, noTests, frame); } } - frontier = nextFrontier; + frontier = frame.nextFrontier; if (frontier.length === 0) break; - if (d === maxDepth && frontier.length > 0) { - const hasMoreCalls = frontier.some((fid) => repo.findCallees(fid).length > 0); - if (hasMoreCalls) truncated = true; + if (d === maxDepth && frontier.some((fid) => repo.findCallees(fid).length > 0)) { + truncated = true; } } @@ -174,26 +187,16 @@ function annotateDataflow( } } -function _annotateDataflowImpl( - db: BetterSqlite3Database, +type DataflowStmts = { + getReturns: ReturnType; + getFlowsTo: ReturnType; +}; + +function appendReturnMessages( messages: SequenceMessage[], - idToNode: Map, + nodeByNameFile: Map, + stmts: DataflowStmts, ): void { - const nodeByNameFile = new Map(); - for (const n 
of idToNode.values()) { - nodeByNameFile.set(`${n.name}|${n.file}`, n); - } - - const getReturns = db.prepare( - `SELECT d.expression FROM dataflow d - WHERE d.source_id = ? AND d.kind = 'returns'`, - ); - const getFlowsTo = db.prepare( - `SELECT d.expression FROM dataflow d - WHERE d.target_id = ? AND d.kind = 'flows_to' - ORDER BY d.param_index`, - ); - const seenReturns = new Set(); for (const msg of [...messages]) { if (msg.type !== 'call') continue; @@ -203,40 +206,67 @@ function _annotateDataflowImpl( const returnKey = `${msg.to}->${msg.from}:${msg.label}`; if (seenReturns.has(returnKey)) continue; - const returns = getReturns.all(targetNode.id) as { expression: string }[]; - - if (returns.length > 0) { - seenReturns.add(returnKey); - const expr = returns[0]!.expression || 'result'; - messages.push({ - from: msg.to, - to: msg.from, - label: expr, - type: 'return', - depth: msg.depth, - }); - } + const returns = stmts.getReturns.all(targetNode.id) as { expression: string }[]; + if (returns.length === 0) continue; + + seenReturns.add(returnKey); + messages.push({ + from: msg.to, + to: msg.from, + label: returns[0]!.expression || 'result', + type: 'return', + depth: msg.depth, + }); } +} +function annotateCallParams( + messages: SequenceMessage[], + nodeByNameFile: Map, + stmts: DataflowStmts, +): void { for (const msg of messages) { if (msg.type !== 'call') continue; const targetNode = nodeByNameFile.get(`${msg.label}|${msg.to}`); if (!targetNode) continue; - const params = getFlowsTo.all(targetNode.id) as { expression: string }[]; - - if (params.length > 0) { - const paramNames = params - .map((p) => p.expression) - .filter(Boolean) - .slice(0, 3); - if (paramNames.length > 0) { - msg.label = `${msg.label}(${paramNames.join(', ')})`; - } + const params = stmts.getFlowsTo.all(targetNode.id) as { expression: string }[]; + const paramNames = params + .map((p) => p.expression) + .filter(Boolean) + .slice(0, 3); + if (paramNames.length > 0) { + msg.label = 
`${msg.label}(${paramNames.join(', ')})`; } } } +function _annotateDataflowImpl( + db: BetterSqlite3Database, + messages: SequenceMessage[], + idToNode: Map, +): void { + const nodeByNameFile = new Map(); + for (const n of idToNode.values()) { + nodeByNameFile.set(`${n.name}|${n.file}`, n); + } + + const stmts: DataflowStmts = { + getReturns: db.prepare( + `SELECT d.expression FROM dataflow d + WHERE d.source_id = ? AND d.kind = 'returns'`, + ), + getFlowsTo: db.prepare( + `SELECT d.expression FROM dataflow d + WHERE d.target_id = ? AND d.kind = 'flows_to' + ORDER BY d.param_index`, + ), + }; + + appendReturnMessages(messages, nodeByNameFile, stmts); + annotateCallParams(messages, nodeByNameFile, stmts); +} + interface Participant { id: string; label: string; From 9182a52d12bda8b0375fc74ef393efcf497ce726 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 16:26:42 -0600 Subject: [PATCH 24/27] refactor(extractors): adopt iterChildren + PUNCTUATION_TOKENS in elixir pushElixirSequenceItems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the inline childCount loop with the shared iterChildren generator configured with PUNCTUATION_TOKENS, completing phase 1 of the TS extractor refactor plan (sync.json cluster 1). Behaviour preserved — same nodes are pushed onto the worklist, just via the shared helper. docs check acknowledged: internal refactor, no doc updates needed. --- src/extractors/elixir.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/extractors/elixir.ts b/src/extractors/elixir.ts index 1b547645c..bc3a6ede8 100644 --- a/src/extractors/elixir.ts +++ b/src/extractors/elixir.ts @@ -5,7 +5,7 @@ import type { TreeSitterNode, TreeSitterTree, } from '../types.js'; -import { findChild, nodeEndLine } from './helpers.js'; +import { findChild, iterChildren, nodeEndLine, PUNCTUATION_TOKENS } from './helpers.js'; /** * Extract symbols from Elixir files. 
@@ -256,11 +256,7 @@ function pushElixirBinaryOperatorOperands(node: TreeSitterNode, stack: TreeSitte * the worklist, skipping punctuation tokens. */ function pushElixirSequenceItems(node: TreeSitterNode, stack: TreeSitterNode[]): void { - for (let i = 0; i < node.childCount; i++) { - const c = node.child(i); - if (!c) continue; - const t = c.type; - if (t === '[' || t === ']' || t === '{' || t === '}' || t === ',') continue; + for (const c of iterChildren(node, PUNCTUATION_TOKENS)) { stack.push(c); } } From e6ea3ed2b82326ee0886e9c77adac09f0107ff3f Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Tue, 26 May 2026 16:36:29 -0600 Subject: [PATCH 25/27] refactor(extractors-rs): adopt shared child-iteration helpers (grind) Wire forge phase 4 helpers into their consumers: - find_first_child_of_types: collapse find_child(x, A).or_else(|| find_child(x, B)) in fsharp.rs handle_application - iter_children + PUNCTUATION_TOKENS: replace inline punctuation-skip loop in javascript.rs first_arg_is_string_literal Closes 3 dead-ffi helpers extracted by forge phase 4. Semantically identical. --- crates/codegraph-core/src/extractors/fsharp.rs | 4 ++-- crates/codegraph-core/src/extractors/javascript.rs | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/codegraph-core/src/extractors/fsharp.rs b/crates/codegraph-core/src/extractors/fsharp.rs index 752ffb05f..67892831a 100644 --- a/crates/codegraph-core/src/extractors/fsharp.rs +++ b/crates/codegraph-core/src/extractors/fsharp.rs @@ -321,8 +321,8 @@ fn handle_application(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // matches the JS extractor (`identifier` first). Operator forms // like `( + )` have neither child; we emit nothing in that case, // mirroring the JS extractor's silent skip. 
- if let Some(inner) = find_child(&func_node, "identifier") - .or_else(|| find_child(&func_node, "long_identifier")) + if let Some(inner) = + find_first_child_of_types(&func_node, &["identifier", "long_identifier"]) { symbols.calls.push(Call { name: node_text(&inner, source).to_string(), diff --git a/crates/codegraph-core/src/extractors/javascript.rs b/crates/codegraph-core/src/extractors/javascript.rs index d5403aa0f..1091ce29b 100644 --- a/crates/codegraph-core/src/extractors/javascript.rs +++ b/crates/codegraph-core/src/extractors/javascript.rs @@ -933,11 +933,9 @@ fn extract_callee_name<'a>(call_node: &Node, source: &'a [u8]) -> Option<&'a str /// used to distinguish Express/router route handlers (`app.get('/path', h)`) /// from Map/cache APIs that reuse the same verb names (`cache.get(user.id)`). fn first_arg_is_string_literal(args_node: &Node) -> bool { - for i in 0..args_node.child_count() { - let Some(child) = args_node.child(i) else { continue }; + // Skip grammar punctuation; the first non-punctuation child is the first arg. + if let Some(child) = iter_children(args_node, PUNCTUATION_TOKENS).next() { let kind = child.kind(); - // Skip parens and commas; the first non-punctuation child is the first arg. 
- if kind == "(" || kind == "," || kind == ")" { continue; } return kind == "string" || kind == "template_string"; } false From 32a0c5cf2ed0992cc733bb54061c8fdeff0f1f5e Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 27 May 2026 22:28:16 -0600 Subject: [PATCH 26/27] fix(tests): move column-width comment to the .tsx entry that actually drives it (#1240) --- tests/parsers/native-drop-classification.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index d617d4757..0eb89c854 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -109,8 +109,8 @@ describe('formatDropExtensionSummary', () => { it('right-pads the extension column and right-aligns the count column for tabular layout', () => { const buckets = new Map([ - ['.kt', ['a.kt']], // 100 files later — wider count column - ['.tsx', new Array(100).fill('x.tsx')], + ['.kt', ['a.kt']], + ['.tsx', new Array(100).fill('x.tsx')], // 100 files — sets wider count column ]); const out = formatDropExtensionSummary(buckets); // `.tsx` (4 chars) sets the ext width; `.kt` is padded to 4 chars. From 9b0c04de47e1123560f151f5f0b2db479f2b0c00 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 28 May 2026 02:06:54 -0600 Subject: [PATCH 27/27] fix(elixir): restore LIFO-compensating reverse-push in sequence and map helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pushElixirSequenceItems and pushElixirMapValues were pushing items in forward order onto the LIFO worklist stack, causing tuple/list/map parameters to be emitted in reverse source order (e.g. {x, _y} → ['_y', 'x'] instead of ['x', '_y']). The fix collects items then pushes them in reverse so the LIFO pop restores source order, matching the native engine. 
--- src/extractors/elixir.ts | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/extractors/elixir.ts b/src/extractors/elixir.ts index bc3a6ede8..a5b3ef13c 100644 --- a/src/extractors/elixir.ts +++ b/src/extractors/elixir.ts @@ -256,8 +256,9 @@ function pushElixirBinaryOperatorOperands(node: TreeSitterNode, stack: TreeSitte * the worklist, skipping punctuation tokens. */ function pushElixirSequenceItems(node: TreeSitterNode, stack: TreeSitterNode[]): void { - for (const c of iterChildren(node, PUNCTUATION_TOKENS)) { - stack.push(c); + const items = [...iterChildren(node, PUNCTUATION_TOKENS)]; + for (let i = items.length - 1; i >= 0; i--) { + stack.push(items[i] as TreeSitterNode); } } @@ -267,6 +268,7 @@ function pushElixirSequenceItems(node: TreeSitterNode, stack: TreeSitterNode[]): * the leading `struct` child is intentionally skipped. */ function pushElixirMapValues(node: TreeSitterNode, stack: TreeSitterNode[]): void { + const values: TreeSitterNode[] = []; for (let i = 0; i < node.childCount; i++) { const content = node.child(i); if (!content || content.type !== 'map_content') continue; @@ -279,11 +281,14 @@ function pushElixirMapValues(node: TreeSitterNode, stack: TreeSitterNode[]): voi for (let p = 0; p < pair.childCount; p++) { const part = pair.child(p); if (!part || part.type === 'keyword') continue; - stack.push(part); + values.push(part); } } } } + for (let i = values.length - 1; i >= 0; i--) { + stack.push(values[i] as TreeSitterNode); + } } function handleDefprotocol(node: TreeSitterNode, ctx: ExtractorOutput): void {