diff --git a/packages/cli/src/commands/find-enclosing-symbol.test.ts b/packages/analysis/src/enclosing-symbol.test.ts similarity index 99% rename from packages/cli/src/commands/find-enclosing-symbol.test.ts rename to packages/analysis/src/enclosing-symbol.test.ts index 4db57ae5..194ec61f 100644 --- a/packages/cli/src/commands/find-enclosing-symbol.test.ts +++ b/packages/analysis/src/enclosing-symbol.test.ts @@ -6,7 +6,7 @@ import { findEnclosingSymbolId, indexNodesByFile, type NodeRow, -} from "./find-enclosing-symbol.js"; +} from "./enclosing-symbol.js"; function row( id: string, diff --git a/packages/analysis/src/enclosing-symbol.ts b/packages/analysis/src/enclosing-symbol.ts new file mode 100644 index 00000000..22055d8c --- /dev/null +++ b/packages/analysis/src/enclosing-symbol.ts @@ -0,0 +1,122 @@ +/** + * `findEnclosingSymbolId` — deterministic tightest-span lookup mapping a + * `(filePath, line)` pair back to the OpenCodeHub graph node that owns the + * line (a Function / Method / Class / …). + * + * Canonical home for an algorithm that was previously cloned in two places — + * `@opencodehub/cli`'s `ingest-sarif` (SARIF finding → enclosing symbol) and + * `@opencodehub/ingestion`'s `scip-index` (SCIP call site → enclosing symbol). + * Both now import from here. `@opencodehub/analysis` is the shared home + * because both `cli` and `ingestion` already depend on it (no new edge, no + * cycle). + * + * The two former clones differed only in their kind allow-set and their node + * source, so this module exposes the pure core parameterized by a kind-set + * plus the two named sets; each caller projects its own nodes into `NodeRow[]` + * and calls the shared index/lookup. + * + * 1-indexing note: SARIF 2.1.0 `region.startLine` and OpenCodeHub node + * `startLine`/`endLine` are both 1-based, so call sites pass lines through + * unadjusted. + */ + +import type { NodeId, NodeKind } from "@opencodehub/core-types"; + +/** A graph-node projection carrying only the fields the lookup needs. */ +export interface NodeRow { + readonly id: NodeId; + readonly filePath: string; + readonly startLine: number; + readonly endLine: number; + readonly kind: NodeKind; +} + +/** Per-file, start-line-ascending index used by {@link findEnclosingSymbolId}. */ +export type NodesByFile = ReadonlyMap; + +/** + * SARIF-linkage allow-set — a strict superset of {@link SCIP_SYMBOL_KINDS} + * that additionally admits `Constructor`, because SARIF tooling routinely + * emits findings inside constructor bodies. + */ +export const ENCLOSING_SYMBOL_KINDS: ReadonlySet = new Set([ + "Function", + "Method", + "Constructor", + "Class", + "Interface", + "Struct", + "Enum", + "Trait", +]); + +/** + * SCIP-derivation allow-set — the kinds the scip-index phase resolves call + * sites and definitions against. No `Constructor` (SCIP definition occurrences + * never land on a bare constructor in the indexers OpenCodeHub ships). + */ +export const SCIP_SYMBOL_KINDS: ReadonlySet = new Set([ + "Class", + "Method", + "Function", + "Interface", + "Struct", + "Enum", + "Trait", +]); + +/** + * Build a per-file, start-line-ascending index over `rows`, keeping only nodes + * whose `kind` is in `kinds` (default {@link ENCLOSING_SYMBOL_KINDS}) and that + * carry finite `startLine`/`endLine`. Within each file the array is sorted by + * `startLine` asc, `endLine` asc — the sort lets {@link findEnclosingSymbolId} + * early-break once it passes the target line. + */ +export function indexNodesByFile( + rows: readonly NodeRow[], + kinds: ReadonlySet = ENCLOSING_SYMBOL_KINDS, +): NodesByFile { + const map = new Map(); + for (const row of rows) { + if (!kinds.has(row.kind)) continue; + if (!Number.isFinite(row.startLine) || !Number.isFinite(row.endLine)) continue; + const bucket = map.get(row.filePath); + if (bucket === undefined) map.set(row.filePath, [row]); + else bucket.push(row); + } + for (const arr of map.values()) { + arr.sort((a, b) => { + if (a.startLine !== b.startLine) return a.startLine - b.startLine; + return a.endLine - b.endLine; + }); + } + return map; +} + +/** + * Return the id of the tightest-span node in `nodesByFile[filePath]` that + * encloses `line` (`startLine <= line <= endLine`). "Tightest" means smallest + * `endLine - startLine`, so a nested method wins over its containing class. + * Returns `undefined` when the file is unknown or no candidate contains the + * line. + */ +export function findEnclosingSymbolId( + nodesByFile: NodesByFile, + filePath: string, + line: number, +): NodeId | undefined { + const candidates = nodesByFile.get(filePath); + if (candidates === undefined) return undefined; + let best: NodeRow | undefined; + let bestSpan = Number.POSITIVE_INFINITY; + for (const rec of candidates) { + if (rec.startLine > line) break; + if (rec.endLine < line) continue; + const span = rec.endLine - rec.startLine; + if (span < bestSpan) { + best = rec; + bestSpan = span; + } + } + return best?.id; +} diff --git a/packages/analysis/src/index.ts b/packages/analysis/src/index.ts index 343ad515..d58a41c9 100644 --- a/packages/analysis/src/index.ts +++ b/packages/analysis/src/index.ts @@ -42,6 +42,13 @@ export { symbolKinds as deadCodeSymbolKinds, } from "./dead-code.js"; export { runDetectChanges } from "./detect-changes.js"; +export type { NodeRow, NodesByFile } from "./enclosing-symbol.js"; +export { + ENCLOSING_SYMBOL_KINDS, + findEnclosingSymbolId, + indexNodesByFile, + SCIP_SYMBOL_KINDS, +} from "./enclosing-symbol.js"; export { createNodeFs } from "./fs.js"; export { gitDiffHunks, diff --git a/packages/analysis/src/scan-enrich.test.ts b/packages/analysis/src/scan-enrich.test.ts index abb6b95f..8cf603d3 100644 --- a/packages/analysis/src/scan-enrich.test.ts +++ b/packages/analysis/src/scan-enrich.test.ts @@ -14,13 +14,20 @@ import type { SarifLog } from "@opencodehub/sarif"; import { buildScanEnrichment } from "./scan-enrich.js"; import { FakeStore } from "./test-utils.js"; -/** Minimal SARIF result with a primary-location uri + fingerprint. */ -function result(uri: string, fingerprint: string) { +/** Minimal SARIF result with a primary-location uri + fingerprint (+ line). */ +function result(uri: string, fingerprint: string, startLine?: number) { return { ruleId: "demo-rule", level: "warning", message: { text: "x" }, - locations: [{ physicalLocation: { artifactLocation: { uri } } }], + locations: [ + { + physicalLocation: { + artifactLocation: { uri }, + ...(startLine !== undefined ? { region: { startLine } } : {}), + }, + }, + ], partialFingerprints: { primaryLocationLineHash: fingerprint }, }; } @@ -49,7 +56,7 @@ test("buildScanEnrichment maps a result to its File node signals by fingerprint" assert.ok(byFp !== undefined, "byResultFingerprint must be present"); assert.deepEqual(byFp?.get("fp-a"), { busFactor: 2, temporalFixDensity: 0.5 }); // Run-level stamp is deterministic (no clock / run id). - assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); + assert.deepEqual(enrichment.run, { enrichmentVersion: "2", sources: ["graph"] }); }); test("buildScanEnrichment normalizes an absolute result uri to the repo-relative node id", async () => { @@ -83,7 +90,7 @@ test("buildScanEnrichment omits results whose file has no materialized signals", const enrichment = await buildScanEnrichment(store, log, "/repo"); // No signals → no per-result map, but the run-level stamp still returns. assert.equal(enrichment.byResultFingerprint, undefined); - assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); + assert.deepEqual(enrichment.run, { enrichmentVersion: "2", sources: ["graph"] }); }); test("buildScanEnrichment is byte-stable across two runs (no clock/run id)", async () => { @@ -106,5 +113,146 @@ test("buildScanEnrichment returns only the run stamp for an empty log", async () const store = new FakeStore(); const enrichment = await buildScanEnrichment(store, logWith([]), "/repo"); assert.equal(enrichment.byResultFingerprint, undefined); - assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); + assert.deepEqual(enrichment.run, { enrichmentVersion: "2", sources: ["graph"] }); +}); + +// --------------------------------------------------------------------------- +// Symbol-level signals: blastRadius (upstream runImpact) + community. +// --------------------------------------------------------------------------- + +/** Add a File + an enclosing Function spanning lines 1-20 in one helper. */ +function addFileWithFn(store: FakeStore, file: string, fnId: string): void { + store.addNode({ id: `File:${file}:${file}`, kind: "File", name: file, filePath: file }); + store.addNode({ + id: fnId, + kind: "Function", + name: "target", + filePath: file, + startLine: 1, + endLine: 20, + }); +} + +test("buildScanEnrichment attaches blastRadius from the finding's enclosing symbol", async () => { + const store = new FakeStore(); + addFileWithFn(store, "src/a.ts", "Function:src/a.ts:target#0"); + // One caller → upstream blast radius of 1 for the target. + store.addNode({ + id: "Function:src/b.ts:caller#0", + kind: "Function", + name: "caller", + filePath: "src/b.ts", + startLine: 1, + endLine: 5, + }); + store.addEdge({ + fromId: "Function:src/b.ts:caller#0", + toId: "Function:src/a.ts:target#0", + type: "CALLS", + confidence: 0.9, + }); + + // Finding on line 10 → inside target (1-20). + const enrichment = await buildScanEnrichment( + store, + logWith([result("src/a.ts", "fp-x", 10)]), + "/repo", + ); + assert.equal(enrichment.byResultFingerprint?.get("fp-x")?.blastRadius, 1); +}); + +test("buildScanEnrichment attaches community label from MEMBER_OF", async () => { + const store = new FakeStore(); + addFileWithFn(store, "src/a.ts", "Function:src/a.ts:target#0"); + store.addNode({ + id: "Community:1", + kind: "Community", + name: "auth", + filePath: "", + inferredLabel: "auth-core", + }); + store.addEdge({ + fromId: "Function:src/a.ts:target#0", + toId: "Community:1", + type: "MEMBER_OF", + confidence: 1, + }); + + const enrichment = await buildScanEnrichment( + store, + logWith([result("src/a.ts", "fp-x", 10)]), + "/repo", + ); + assert.equal(enrichment.byResultFingerprint?.get("fp-x")?.community, "auth-core"); +}); + +test("buildScanEnrichment merges file + symbol signals on one result", async () => { + const store = new FakeStore(); + store.addNode({ + id: "File:src/a.ts:src/a.ts", + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + busFactor: 3, + }); + store.addNode({ + id: "Function:src/a.ts:target#0", + kind: "Function", + name: "target", + filePath: "src/a.ts", + startLine: 1, + endLine: 20, + }); + store.addNode({ + id: "Community:1", + kind: "Community", + name: "auth", + filePath: "", + inferredLabel: "auth-core", + }); + store.addEdge({ + fromId: "Function:src/a.ts:target#0", + toId: "Community:1", + type: "MEMBER_OF", + confidence: 1, + }); + + const enrichment = await buildScanEnrichment( + store, + logWith([result("src/a.ts", "fp-x", 10)]), + "/repo", + ); + // busFactor (file) + community (symbol) + blastRadius 0 (symbol resolved, no + // callers — a real "nothing depends on this" signal, not "not computed"). + assert.deepEqual(enrichment.byResultFingerprint?.get("fp-x"), { + busFactor: 3, + blastRadius: 0, + community: "auth-core", + }); +}); + +test("buildScanEnrichment leaves a finding with no enclosing symbol at file signals only", async () => { + const store = new FakeStore(); + store.addNode({ + id: "File:src/a.ts:src/a.ts", + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + busFactor: 2, + }); + store.addNode({ + id: "Function:src/a.ts:target#0", + kind: "Function", + name: "target", + filePath: "src/a.ts", + startLine: 1, + endLine: 5, + }); + // Finding on line 99 → outside the function → no symbol-level signals. + const enrichment = await buildScanEnrichment( + store, + logWith([result("src/a.ts", "fp-x", 99)]), + "/repo", + ); + assert.deepEqual(enrichment.byResultFingerprint?.get("fp-x"), { busFactor: 2 }); }); diff --git a/packages/analysis/src/scan-enrich.ts b/packages/analysis/src/scan-enrich.ts index 571b602f..9bfb2b4d 100644 --- a/packages/analysis/src/scan-enrich.ts +++ b/packages/analysis/src/scan-enrich.ts @@ -2,25 +2,50 @@ * `buildScanEnrichment` — derive graph signals for each SARIF scan result so * `enrichWithProperties` can stamp them under `properties.opencodehub.*`. * - * Maps every result to the File node for its primary location and reads the - * file-granular signals already materialized on that node by the ingestion - * temporal phases: bus factor and fix-follow-feat density (→ temporalFixDensity). + * Two tiers of signal: + * - File-granular (cheap, one batched read): bus factor and fix-follow-feat + * density (→ temporalFixDensity), off the result's File node. + * - Symbol-granular: the finding's enclosing symbol (resolved from + * `(uri, startLine)`) carries blast radius (`runImpact`, memoized per + * symbol + capped) and community label (one batched `MEMBER_OF` read). * - * Scope: only signals that are a direct, cheap read off the File node are - * emitted here. `ownershipDrift`/`cochangeScore` live on the community node / - * temporal table, and symbol-level signals need live computation per finding - * (blastRadius via `runImpact`, community via `MEMBER_OF`, centrality via - * PageRank) — all deliberately omitted rather than approximated. Every - * `ResultEnrichment` field is optional, so omitting them is honest, not lossy. + * Still omitted (not materialized / not worth the cost): `centrality` + * (PageRank recompute) and `cochangeScore` (temporal table). Every + * `ResultEnrichment` field is optional, so omitting is honest, not lossy. + * + * Cost control: blast radius is the only per-symbol graph traversal. It is + * memoized so N findings in one symbol cost one `runImpact`, and the number of + * distinct symbols queried is capped at {@link MAX_IMPACT_QUERIES}; symbols + * past the cap get every other signal but no blastRadius, and the run-level + * stamp records that the cap was hit (no silent truncation). * * Determinism: the enrichment is a pure function of the graph + the (already - * deterministic) SARIF; no clock or run id is emitted, so a re-scan of the - * same commit produces byte-identical enriched output. + * deterministic) SARIF; no clock or run id is emitted, and the impact cap is + * applied in a deterministic symbol order, so a re-scan of the same commit + * produces byte-identical enriched output. */ -import type { GraphNode } from "@opencodehub/core-types"; +import type { CommunityNode, GraphNode, NodeId } from "@opencodehub/core-types"; import type { EnrichmentInput, ResultEnrichment, SarifLog } from "@opencodehub/sarif"; import type { IGraphStore } from "@opencodehub/storage"; +import { + findEnclosingSymbolId, + indexNodesByFile, + type NodeRow, + type NodesByFile, +} from "./enclosing-symbol.js"; +import { runImpact } from "./impact.js"; + +/** + * Cap on distinct enclosing symbols we run `runImpact` against per scan. Each + * is a bounded graph traversal; on a large scan with findings spread across + * hundreds of symbols this bounds the added work. Symbols beyond the cap still + * get file + community signals. + */ +const MAX_IMPACT_QUERIES = 200; + +/** Kinds whose nodes can enclose a finding AND can be `runImpact` targets. */ +const SYMBOL_NODE_KINDS = ["Function", "Method", "Constructor", "Class"] as const; /** * Pull the primary-location file uri off a SARIF result and normalize it to @@ -38,6 +63,16 @@ function resultUri(result: unknown, repoPath: string): string | undefined { return toRepoRelative(loc, repoPath); } +/** The result's primary-location start line, when present (1-based, SARIF). */ +function resultStartLine(result: unknown): number | undefined { + const line = ( + result as { + locations?: ReadonlyArray<{ physicalLocation?: { region?: { startLine?: unknown } } }>; + } + ).locations?.[0]?.physicalLocation?.region?.startLine; + return typeof line === "number" && Number.isFinite(line) ? line : undefined; +} + /** Strip a leading repoPath (and `file://`) so the uri matches the graph's relative key. */ function toRepoRelative(uri: string, repoPath: string): string { // Normalize separators to POSIX first: File node ids are `/`-keyed, and on @@ -63,46 +98,36 @@ function resultFingerprint(result: unknown): string | undefined { return typeof pf === "string" && pf.length > 0 ? pf : undefined; } -/** Project the file-granular signals off a File node into a ResultEnrichment. */ -function enrichmentForFile(file: GraphNode): ResultEnrichment | undefined { - if (file.kind !== "File") return undefined; - const out: { - busFactor?: number; - temporalFixDensity?: number; - } = {}; +/** File-granular signals off a File node (bus factor, fix-follow-feat density). */ +function fileSignals(file: GraphNode): { busFactor?: number; temporalFixDensity?: number } { + if (file.kind !== "File") return {}; + const out: { busFactor?: number; temporalFixDensity?: number } = {}; if (typeof file.busFactor === "number") out.busFactor = file.busFactor; if (typeof file.fixFollowFeatDensity === "number") { out.temporalFixDensity = file.fixFollowFeatDensity; } - // `ownershipDrift` and `cochangeScore` are community-level / temporal-table - // signals, not materialized on the File node — omitted here rather than - // approximated. `blastRadius`/`community`/`centrality` need per-finding graph - // computation; a follow-up can add them behind a budget. - return Object.keys(out).length > 0 ? out : undefined; + return out; } /** - * Build the {@link EnrichmentInput} for a scan SARIF log. Returns a - * fingerprint-keyed map (`byResultFingerprint`) plus a stable run-level stamp. - * Results whose file has no materialized signals are simply absent from the - * map (the enricher leaves those results untouched). + * Build the {@link EnrichmentInput} for a scan SARIF log: a fingerprint-keyed + * map of per-result graph signals plus a stable run-level stamp. Results whose + * file the graph doesn't know, and that have no enclosing symbol, are simply + * absent from the map (the enricher leaves them untouched). * - * Defensive: a store without `listNodes` (minimal test fakes) yields only the - * run-level stamp, never a throw. + * Defensive: a store missing `listNodes`/`listEdgesByType`/`listNodesByKind` + * (minimal test fakes) degrades to whatever it can read, never throws. */ export async function buildScanEnrichment( graph: IGraphStore, sarif: SarifLog, repoPath: string, ): Promise { - const run: EnrichmentInput["run"] = { - enrichmentVersion: "1", - sources: ["graph"], - }; - if (typeof graph.listNodes !== "function") return { run }; - - // Collect the distinct file uris referenced by results, in run+result order - // so the index map lines up with how the enricher walks the log. + const baseRun: EnrichmentInput["run"] = { enrichmentVersion: "2", sources: ["graph"] }; + if (typeof graph.listNodes !== "function") return { run: baseRun }; + + // --- Collect referenced files (for file signals) + their symbol index + // (for enclosing-symbol resolution). --- const uris = new Set(); for (const r of sarif.runs) { for (const result of r.results ?? []) { @@ -110,34 +135,160 @@ export async function buildScanEnrichment( if (uri !== undefined) uris.add(uri); } } - if (uris.size === 0) return { run }; + if (uris.size === 0) return { run: baseRun }; - // One batched File-node read keyed by node id (`File::`). - const idByUri = new Map(); - for (const uri of uris) idByUri.set(uri, `File:${uri}:${uri}`); - const fileNodes = await graph.listNodes({ ids: [...idByUri.values()], kinds: ["File"] }); - const enrichmentByUri = new Map(); - for (const node of fileNodes) { - if (node.kind !== "File") continue; - const enrichment = enrichmentForFile(node); - if (enrichment !== undefined) enrichmentByUri.set(node.filePath, enrichment); - } - if (enrichmentByUri.size === 0) return { run }; + const fileSignalsByUri = await loadFileSignals(graph, uris); + const nodesByFile = await loadSymbolIndex(graph, uris); - // Key each result's enrichment by its primaryLocationLineHash fingerprint - // (run-structure-independent; see resultFingerprint). Results without a - // fingerprint or whose file has no signals are simply absent. - const byResultFingerprint = new Map(); + // --- Resolve each result's enclosing symbol, collect the distinct set. --- + interface ResultRef { + readonly fp: string; + readonly uri: string; + readonly symbolId: NodeId | undefined; + } + const refs: ResultRef[] = []; + const distinctSymbols = new Set(); for (const r of sarif.runs) { for (const result of r.results ?? []) { const fp = resultFingerprint(result); if (fp === undefined) continue; const uri = resultUri(result, repoPath); - const enrichment = uri !== undefined ? enrichmentByUri.get(uri) : undefined; - if (enrichment !== undefined) byResultFingerprint.set(fp, enrichment); + if (uri === undefined) continue; + const line = resultStartLine(result); + const symbolId = + line !== undefined ? findEnclosingSymbolId(nodesByFile, uri, line) : undefined; + if (symbolId !== undefined) distinctSymbols.add(symbolId); + refs.push({ fp, uri, symbolId }); + } + } + + // --- Symbol-level signals over the distinct symbol set (batched community + + // memoized/capped blast radius). --- + const communityBySymbol = await loadCommunityLabels(graph, [...distinctSymbols]); + const { blastBySymbol, capped } = await loadBlastRadii(graph, [...distinctSymbols].sort()); + // Record cap-truncation in the run stamp so a consumer never mistakes a + // capped scan's missing blastRadius for "symbol has no dependents". + const run: EnrichmentInput["run"] = capped + ? { enrichmentVersion: "2", sources: ["graph", "impact-capped"] } + : baseRun; + + // --- Assemble per-result enrichment, keyed by fingerprint. --- + const byResultFingerprint = new Map(); + for (const ref of refs) { + const out: { + busFactor?: number; + temporalFixDensity?: number; + blastRadius?: number; + community?: string; + } = { ...(fileSignalsByUri.get(ref.uri) ?? {}) }; + if (ref.symbolId !== undefined) { + const blast = blastBySymbol.get(ref.symbolId); + if (blast !== undefined) out.blastRadius = blast; + const community = communityBySymbol.get(ref.symbolId); + if (community !== undefined) out.community = community; } + if (Object.keys(out).length > 0) byResultFingerprint.set(ref.fp, out); } if (byResultFingerprint.size === 0) return { run }; return { byResultFingerprint, run }; } + +/** One batched File-node read → per-uri file signals (only non-empty entries). */ +async function loadFileSignals( + graph: IGraphStore, + uris: ReadonlySet, +): Promise> { + const ids = [...uris].map((u) => `File:${u}:${u}`); + const fileNodes = await graph.listNodes({ ids, kinds: ["File"] }); + const byUri = new Map(); + for (const node of fileNodes) { + if (node.kind !== "File") continue; + const sig = fileSignals(node); + if (Object.keys(sig).length > 0) byUri.set(node.filePath, sig); + } + return byUri; +} + +/** + * Load the enclosing-symbol index for the referenced files. Projects the + * symbol nodes (filtered to the referenced uris) into the shared + * `NodesByFile`. Returns an empty index when the store can't enumerate by kind. + */ +async function loadSymbolIndex( + graph: IGraphStore, + uris: ReadonlySet, +): Promise { + if (typeof graph.listNodesByKind !== "function") return new Map(); + const rows: NodeRow[] = []; + for (const kind of SYMBOL_NODE_KINDS) { + const nodes = await graph.listNodesByKind(kind); + for (const n of nodes) { + if (!uris.has(n.filePath)) continue; + const startLine = (n as { startLine?: number }).startLine; + const endLine = (n as { endLine?: number }).endLine; + if (typeof startLine !== "number" || typeof endLine !== "number") continue; + rows.push({ id: n.id, filePath: n.filePath, startLine, endLine, kind: n.kind }); + } + } + return indexNodesByFile(rows); +} + +/** Batched symbol → community-label map (one MEMBER_OF read + one node read). */ +async function loadCommunityLabels( + graph: IGraphStore, + symbolIds: readonly NodeId[], +): Promise> { + const out = new Map(); + if (symbolIds.length === 0 || typeof graph.listEdgesByType !== "function") return out; + try { + const edges = await graph.listEdgesByType("MEMBER_OF", { fromIds: [...symbolIds] }); + if (edges.length === 0) return out; + const communityIds = [...new Set(edges.map((e) => e.to))].filter((s) => s.length > 0); + if (communityIds.length === 0) return out; + const communityNodes = await graph.listNodes({ ids: communityIds, kinds: ["Community"] }); + const labelById = new Map(); + for (const node of communityNodes) { + if (node.kind !== "Community") continue; + const label = (node as CommunityNode).inferredLabel; + if (typeof label === "string" && label.length > 0) labelById.set(node.id, label); + } + for (const edge of edges) { + const label = labelById.get(edge.to); + if (label !== undefined && !out.has(edge.from as NodeId)) out.set(edge.from as NodeId, label); + } + } catch { + // Graph may have no community nodes yet — community is best-effort. + } + return out; +} + +/** + * Memoized + capped blast radius per symbol. `symbolIds` MUST be pre-sorted so + * the cap selects a deterministic subset. Returns the per-symbol upstream + * dependent count and whether the cap truncated the set. + */ +async function loadBlastRadii( + graph: IGraphStore, + symbolIds: readonly NodeId[], +): Promise<{ blastBySymbol: ReadonlyMap; capped: boolean }> { + const blastBySymbol = new Map(); + let capped = false; + for (const symbolId of symbolIds) { + if (blastBySymbol.size >= MAX_IMPACT_QUERIES) { + capped = true; + break; + } + try { + const res = await runImpact(graph, { + target: "", + targetUid: symbolId, + direction: "upstream", + }); + blastBySymbol.set(symbolId, res.totalAffected); + } catch { + // A symbol the impact traversal can't resolve contributes no blastRadius. + } + } + return { blastBySymbol, capped }; +} diff --git a/packages/cli/src/commands/find-enclosing-symbol.ts b/packages/cli/src/commands/find-enclosing-symbol.ts deleted file mode 100644 index d340c718..00000000 --- a/packages/cli/src/commands/find-enclosing-symbol.ts +++ /dev/null @@ -1,118 +0,0 @@ -/** - * `findEnclosingSymbolId` — deterministic tightest-span lookup that maps a - * `(filePath, line)` pair back to the OpenCodeHub graph node that owns the - * line (a Function / Method / Class / etc.). Used by `ingest-sarif` to link - * SARIF `Finding` nodes to the enclosing code symbol when the scanner did - * not populate `result.properties["opencodehub.symbolId"]` itself. - * - * This is a clone of the algorithm in - * `packages/ingestion/src/pipeline/phases/scip-index.ts:indexNodesByFile` + - * `findEnclosingNodeId`. The two call sites live in different packages - * (`@opencodehub/cli` vs `@opencodehub/ingestion`), and extracting a shared - * helper would require a cross-package refactor that is explicitly out of - * scope for the SARIF linkage task. If these functions need to converge - * later, promote this file to a shared util package (e.g. - * `@opencodehub/graph-utils`) and delete the duplicate in scip-index.ts in - * a single atomic change. - * - * Notes on 1-indexing: both SARIF 2.1.0 `region.startLine` and - * OpenCodeHub node `startLine`/`endLine` are 1-based, so no offset - * adjustment is needed at the call site. - */ - -import type { NodeId, NodeKind } from "@opencodehub/core-types"; - -/** A graph node projection carrying only the fields the lookup needs. */ -export interface NodeRow { - readonly id: NodeId; - readonly filePath: string; - readonly startLine: number; - readonly endLine: number; - readonly kind: NodeKind; -} - -/** Per-file, start-line-ascending index used by `findEnclosingSymbolId`. */ -export type NodesByFile = ReadonlyMap; - -/** - * Code-kind allow set used when resolving SARIF findings back to an - * enclosing symbol. Covers Function, Method, Constructor, Class, - * Interface, Struct, Enum, and Trait — a strict superset of - * `SCIP_SYMBOL_KINDS`; we additionally allow `Constructor` here because - * SARIF tooling routinely emits findings inside constructor bodies. - */ -export const ENCLOSING_SYMBOL_KINDS: ReadonlySet = new Set([ - "Function", - "Method", - "Constructor", - "Class", - "Interface", - "Struct", - "Enum", - "Trait", -]); - -/** - * Build a per-file, start-line-ascending index over the supplied node - * rows, filtering to nodes whose `kind` is in `ENCLOSING_SYMBOL_KINDS`. - * Rows missing either `startLine` or `endLine` are skipped silently — - * they cannot participate in a range containment check. - * - * Ordering: within each file the array is sorted by `startLine` ascending - * with `endLine` ascending as the tie-breaker. `findEnclosingSymbolId` - * still scans the whole candidate list for the tightest span, so the - * sort is primarily an early-break optimization (once `startLine > line` - * we can stop). - */ -export function indexNodesByFile(rows: readonly NodeRow[]): NodesByFile { - const map = new Map(); - for (const row of rows) { - if (!ENCLOSING_SYMBOL_KINDS.has(row.kind)) continue; - if (!Number.isFinite(row.startLine) || !Number.isFinite(row.endLine)) continue; - const bucket = map.get(row.filePath); - if (bucket === undefined) map.set(row.filePath, [row]); - else bucket.push(row); - } - for (const arr of map.values()) { - arr.sort((a, b) => { - if (a.startLine !== b.startLine) return a.startLine - b.startLine; - return a.endLine - b.endLine; - }); - } - return map; -} - -/** - * Return the id of the tightest-span node in `nodesByFile[filePath]` - * that encloses `line` (`startLine <= line <= endLine`). "Tightest" - * means smallest `endLine - startLine` span — this makes nested methods - * win over their containing classes. When two candidates have the same - * span, the earlier `startLine` wins (which falls out of the deterministic - * input sort). - * - * Returns `undefined` when the file is unknown, when no candidate - * contains the line, or when every candidate has been filtered out by - * the allow-set at index time. - */ -export function findEnclosingSymbolId( - nodesByFile: NodesByFile, - filePath: string, - line: number, -): NodeId | undefined { - const candidates = nodesByFile.get(filePath); - if (candidates === undefined) return undefined; - let best: NodeRow | undefined; - let bestSpan = Number.POSITIVE_INFINITY; - for (const rec of candidates) { - // Candidates are sorted by startLine; once we pass the target line - // no later row can enclose it. - if (rec.startLine > line) break; - if (rec.endLine < line) continue; - const span = rec.endLine - rec.startLine; - if (span < bestSpan) { - best = rec; - bestSpan = span; - } - } - return best?.id; -} diff --git a/packages/cli/src/commands/ingest-sarif.test.ts b/packages/cli/src/commands/ingest-sarif.test.ts index faa1f5e1..796bfa7a 100644 --- a/packages/cli/src/commands/ingest-sarif.test.ts +++ b/packages/cli/src/commands/ingest-sarif.test.ts @@ -1,8 +1,8 @@ import assert from "node:assert/strict"; import { test } from "node:test"; +import { indexNodesByFile, type NodeRow } from "@opencodehub/analysis"; import type { NodeId } from "@opencodehub/core-types"; import type { SarifRun } from "@opencodehub/sarif"; -import { indexNodesByFile, type NodeRow } from "./find-enclosing-symbol.js"; import { buildFindingsGraph } from "./ingest-sarif.js"; function run(scanner: string, results: unknown): SarifRun { diff --git a/packages/cli/src/commands/ingest-sarif.ts b/packages/cli/src/commands/ingest-sarif.ts index 1a86c5b9..67d5d931 100644 --- a/packages/cli/src/commands/ingest-sarif.ts +++ b/packages/cli/src/commands/ingest-sarif.ts @@ -24,6 +24,13 @@ import { readFile } from "node:fs/promises"; import { resolve } from "node:path"; +import { + ENCLOSING_SYMBOL_KINDS, + findEnclosingSymbolId, + indexNodesByFile, + type NodeRow, + type NodesByFile, +} from "@opencodehub/analysis"; import { type FindingNode, KnowledgeGraph, makeNodeId, type NodeId } from "@opencodehub/core-types"; import { applyBaselineState, @@ -40,13 +47,6 @@ import { resolveRepoMetaDir, } from "@opencodehub/storage"; import { readRegistry } from "../registry.js"; -import { - ENCLOSING_SYMBOL_KINDS, - findEnclosingSymbolId, - indexNodesByFile, - type NodeRow, - type NodesByFile, -} from "./find-enclosing-symbol.js"; export interface IngestSarifOptions { /** `--repo `: look up a registered repo instead of using CWD. */ diff --git a/packages/ingestion/src/pipeline/phases/scip-index.ts b/packages/ingestion/src/pipeline/phases/scip-index.ts index 2136e556..0e699dca 100644 --- a/packages/ingestion/src/pipeline/phases/scip-index.ts +++ b/packages/ingestion/src/pipeline/phases/scip-index.ts @@ -31,6 +31,13 @@ import { existsSync, statSync } from "node:fs"; import { readFile } from "node:fs/promises"; import { join } from "node:path"; +import { + findEnclosingSymbolId, + indexNodesByFile as indexNodesByFile_shared, + type NodeRow, + type NodesByFile, + SCIP_SYMBOL_KINDS, +} from "@opencodehub/analysis"; import type { GraphNode, NodeId } from "@opencodehub/core-types"; import type { DerivedEdge, @@ -408,71 +415,26 @@ function probeCachedVersion(scipPath: string): string { } } -/** Per-file, line-sorted node index for the tightest-enclosing lookup. */ -type NodesByFile = ReadonlyMap; -interface SymbolRec { - readonly id: NodeId; - readonly filePath: string; - readonly startLine: number; - readonly endLine: number; -} - -const SCIP_SYMBOL_KINDS: ReadonlySet = new Set([ - "Class", - "Method", - "Function", - "Interface", - "Struct", - "Enum", - "Trait", -]); - +/** + * Project the in-memory graph's enclosing-symbol nodes into the shared + * per-file index. The node SOURCE here (`ctx.graph.nodes()`, mid-pipeline, + * before the store exists) differs from the SARIF path's store query, so the + * projection stays local; the index build + lookup are the shared + * `@opencodehub/analysis` impl (gated on {@link SCIP_SYMBOL_KINDS}). + */ function indexNodesByFile(ctx: PipelineContext): NodesByFile { - const map = new Map(); + const rows: NodeRow[] = []; for (const n of ctx.graph.nodes()) { - if (!SCIP_SYMBOL_KINDS.has(n.kind)) continue; const startLine = (n as { startLine?: number }).startLine; const endLine = (n as { endLine?: number }).endLine; if (startLine === undefined || endLine === undefined) continue; - const rec: SymbolRec = { - id: n.id as NodeId, - filePath: n.filePath, - startLine, - endLine, - }; - const arr = map.get(n.filePath); - if (arr === undefined) map.set(n.filePath, [rec]); - else arr.push(rec); + rows.push({ id: n.id as NodeId, filePath: n.filePath, startLine, endLine, kind: n.kind }); } - for (const arr of map.values()) { - arr.sort((a, b) => { - if (a.startLine !== b.startLine) return a.startLine - b.startLine; - return a.endLine - b.endLine; - }); - } - return map; + return indexNodesByFile_shared(rows, SCIP_SYMBOL_KINDS); } -function findEnclosingNodeId( - nodesByFile: NodesByFile, - filePath: string, - line: number, -): NodeId | undefined { - const candidates = nodesByFile.get(filePath); - if (candidates === undefined) return undefined; - let best: SymbolRec | undefined; - let bestSpan = Number.POSITIVE_INFINITY; - for (const rec of candidates) { - if (rec.startLine > line) break; - if (rec.endLine < line) continue; - const span = rec.endLine - rec.startLine; - if (span < bestSpan) { - best = rec; - bestSpan = span; - } - } - return best?.id; -} +/** Local alias preserving the call-site name; the impl is the shared lookup. */ +const findEnclosingNodeId = findEnclosingSymbolId; function edgeKey(from: string, type: string, to: string): string { return `${from}\x00${type}\x00${to}`;