From 5e57f6df01c108811beecc2ab1abc26198f0286f Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Tue, 30 Jun 2026 03:26:18 +0000 Subject: [PATCH] feat(frameworks): wire stage-5 import/SCIP detection into the profile phase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 5 (imports.ts: detect frameworks from IMPORTS edges to external-module stubs) shipped as a tested standalone module but reached no detection — the profile phase ran at deps:[scan], before the parse phase emits IMPORTS edges, so there was no graph to read. The DAG-ordering decision: profile now deps [scan, parse]. parse deps [scan, structure] and neither deps profile, so no cycle; profile's own consumers (dependencies / repo-node / scip-index) only read its output and are unaffected by it running later. The parse phase emits the external-stub IMPORTS edges stage 5 needs (parse.ts), so they're present by detection time — no need to wait for crossFile (which only resolves the in-repo IMPORTS edges). Wiring: - profile passes ctx.graph as `importGraph` to detectFrameworksDetailed; KnowledgeGraph structurally satisfies the stage's edges()/getNode() reader. - FrameworkDetectionInput + FrameworkDetectorInput gain optional `importGraph`; the dispatcher runs detectFromImports once, groups by framework, and merges as stage-5 evidence. A `deterministic` (scip-resolved, confidence 1.0) import CREATES a detection on its own — an `import fastapi` is as authoritative as a manifest dep, and inferConfidence treats it as deterministic; a `heuristic` import only corroborates a hit from another stage. - Legacy callers that omit `importGraph` are unaffected (no-op). Updated the orchestrator phase-order test for the new topological order (profile + its dependents now follow parse/complexity/orm) — captured from a real run, not hand-derived. Tests: 4 new detector stage-5 cases (deterministic-creates, heuristic-doesn't, heuristic-corroborates, omitted-no-op). frameworks 96 + ingestion 638 green; full workspace 0 fail; biome ci + typecheck clean. Note: like the symbol-enrichment work, a live-analyze e2e of stage 5 wasn't possible in this sandbox (WASM parse emits no external-import stubs without the scip indexers); covered at the unit level with an ImportStageGraph fixture. --- packages/frameworks/src/detector.test.ts | 85 +++++++++++++++++++ packages/frameworks/src/detector.ts | 72 ++++++++++++++-- packages/frameworks/src/frameworks.ts | 10 +++ packages/frameworks/src/index.ts | 16 ++-- .../src/pipeline/orchestrator.test.ts | 24 ++---- .../ingestion/src/pipeline/phases/profile.ts | 10 ++- 6 files changed, 183 insertions(+), 34 deletions(-) diff --git a/packages/frameworks/src/detector.test.ts b/packages/frameworks/src/detector.test.ts index efb9ca96..93276afb 100644 --- a/packages/frameworks/src/detector.test.ts +++ b/packages/frameworks/src/detector.test.ts @@ -30,12 +30,39 @@ function mkInput( manifests: ReadonlyArray, detectedLanguages: readonly string[], configText?: ReadonlyArray, + importGraph?: FrameworkDetectorInput["importGraph"], ): FrameworkDetectorInput { return { relPaths: new Set(files), manifestText: new Map(manifests), detectedLanguages, ...(configText !== undefined ? { configText: new Map(configText) } : {}), + ...(importGraph !== undefined ? { importGraph } : {}), + }; +} + +/** + * Build a minimal ImportStageGraph: each spec is an IMPORTS edge to an + * external-stub CodeElement node carrying `external import: :` + * content. `confidence` 1 = scip-resolved (deterministic), <1 = heuristic. + */ +function makeImportGraph( + specs: ReadonlyArray<{ source: string; symbol?: string; confidence: number }>, +): FrameworkDetectorInput["importGraph"] { + const nodes = new Map(); + const edges: Array<{ from: string; to: string; type: string; confidence: number }> = []; + specs.forEach((s, i) => { + const to = `CodeElement:ext:${s.source}:${i}`; + nodes.set(to, { + id: to, + kind: "CodeElement", + content: `external import: ${s.source}:${s.symbol ?? "*"}`, + }); + edges.push({ from: `File:src/app.py:${i}`, to, type: "IMPORTS", confidence: s.confidence }); + }); + return { + edges: () => edges[Symbol.iterator](), + getNode: (id: string) => nodes.get(id), }; } @@ -868,3 +895,61 @@ describe("stage 3 — config-AST evidence", () => { assert.ok((b?.evidence.filter((e) => e.stage === 3) ?? []).length > 0); }); }); + +// --------------------------------------------------------------------------- +// Stage 5 — import-graph evidence (wired via importGraph) +// --------------------------------------------------------------------------- + +describe("stage 5 — import-graph evidence", () => { + it("a deterministic import CREATES a detection (scip-resolved is authoritative)", () => { + // No manifest, no layout marker — a scip-resolved `import fastapi` alone. + const input = mkInput( + ["src/app.py"], + [], + ["python"], + undefined, + makeImportGraph([{ source: "fastapi", symbol: "FastAPI", confidence: 1 }]), + ); + const out = detectFrameworksStructured(input); + const fastapi = findByName(out, "fastapi"); + assert.ok(fastapi, "deterministic import should create the fastapi detection"); + assert.equal(fastapi.confidence, "deterministic"); + const stage5 = fastapi.evidence.filter((e) => e.stage === 5); + assert.ok(stage5.length > 0, "stage-5 evidence should be attached"); + }); + + it("a heuristic import does NOT create a detection on its own (corroborates only)", () => { + const input = mkInput( + ["src/app.py"], + [], + ["python"], + undefined, + makeImportGraph([{ source: "fastapi", symbol: "FastAPI", confidence: 0.5 }]), + ); + const out = detectFrameworksStructured(input); + assert.equal(findByName(out, "fastapi"), undefined, "heuristic import alone must not detect"); + }); + + it("a heuristic import corroborates a manifest hit with stage-5 evidence", () => { + const input = mkInput( + ["pyproject.toml", "src/app.py"], + [["pyproject.toml", '[project]\ndependencies = ["fastapi>=0.110"]\n']], + ["python"], + undefined, + makeImportGraph([{ source: "fastapi", symbol: "FastAPI", confidence: 0.5 }]), + ); + const out = detectFrameworksStructured(input); + const fastapi = findByName(out, "fastapi"); + assert.ok(fastapi, "manifest hit detects fastapi"); + assert.ok( + fastapi.evidence.some((e) => e.stage === 5), + "heuristic import should still attach stage-5 evidence to the existing hit", + ); + }); + + it("is a no-op when importGraph is omitted (legacy callers unchanged)", () => { + const input = mkInput(["src/app.py"], [], ["python"]); + const out = detectFrameworksStructured(input); + assert.equal(findByName(out, "fastapi"), undefined); + }); +}); diff --git a/packages/frameworks/src/detector.ts b/packages/frameworks/src/detector.ts index 2210bd59..d7c18f57 100644 --- a/packages/frameworks/src/detector.ts +++ b/packages/frameworks/src/detector.ts @@ -32,6 +32,7 @@ import { type ManifestKey, } from "./catalog.js"; import { type ConfigAstFinding, inspectConfigAst } from "./stages/config-ast.js"; +import { detectFromImports, type ImportFinding, type ImportStageGraph } from "./stages/imports.js"; import { VARIANT_RESOLVERS, type VariantResolveInput, @@ -66,6 +67,15 @@ export interface FrameworkDetectorInput { * legacy callers — stage 3 simply contributes no evidence. */ readonly configText?: ReadonlyMap; + /** + * Stage 5 — the import graph (parse-phase `KnowledgeGraph`). When present, + * `detectFromImports` reads its IMPORTS edges to external stubs; the findings + * merge as stage-5 evidence. A `deterministic` import (scip-resolved, + * confidence 1.0) is authoritative enough to CREATE a detection on its own + * (an `import fastapi` is as strong as a manifest dep); a `heuristic` import + * only corroborates a framework that already hit. Absent for legacy callers. + */ + readonly importGraph?: ImportStageGraph; } /** Mapping language → ecosystem. Covers the tree-sitter languages OpenCodeHub indexes. */ @@ -100,11 +110,20 @@ export function detectFrameworksStructured( // framework already hit on a manifest/layout signal (stage 3 corroborates, // never creates). const configFindingsByFramework = groupConfigFindings(input.configText, input.relPaths); + // Stage 5 — import-graph findings, grouped by framework. A `deterministic` + // import can create a detection; a `heuristic` one only corroborates. + const importFindingsByFramework = groupImportFindings(input.importGraph); const out: FrameworkDetection[] = []; for (const rule of FRAMEWORK_CATALOG) { if (rule.ecosystem !== "any" && !activeEcosystems.has(rule.ecosystem)) continue; - const hit = evaluateRule(rule, input, manifestJson, configFindingsByFramework.get(rule.name)); + const hit = evaluateRule( + rule, + input, + manifestJson, + configFindingsByFramework.get(rule.name), + importFindingsByFramework.get(rule.name), + ); if (hit === null) continue; const detection = buildDetection( rule, @@ -139,6 +158,24 @@ function groupConfigFindings( return grouped; } +/** + * Run stage 5 (import graph) once and group its findings by framework name, so + * `evaluateRule` can look up a rule's import findings by `rule.name`. Returns + * an empty map when no import graph was supplied (legacy callers). + */ +function groupImportFindings( + importGraph: ImportStageGraph | undefined, +): ReadonlyMap { + const grouped = new Map(); + if (importGraph === undefined) return grouped; + for (const finding of detectFromImports(importGraph)) { + const list = grouped.get(finding.framework) ?? []; + list.push(finding); + grouped.set(finding.framework, list); + } + return grouped; +} + // --------------------------------------------------------------------------- // Evaluation helpers // --------------------------------------------------------------------------- @@ -153,6 +190,8 @@ interface RuleHit { readonly hasManifestHit: boolean; /** Whether a layout/heuristic (stage 4, tier H) signal fired. */ readonly hasFileHit: boolean; + /** Whether a scip-resolved (stage 5, deterministic) import fired. */ + readonly hasDeterministicImport: boolean; } function evidenceKey(e: Evidence): string { @@ -164,10 +203,12 @@ function evaluateRule( input: FrameworkDetectorInput, manifestJson: ReadonlyMap, configFindings: readonly ConfigAstFinding[] | undefined, + importFindings: readonly ImportFinding[] | undefined, ): RuleHit | null { const evidenceSeen = new Map(); let hasManifestHit = false; let hasFileHit = false; + let hasDeterministicImport = false; const push = (e: Evidence): void => { const key = evidenceKey(e); @@ -209,23 +250,35 @@ function evaluateRule( } } - // Stage 3 — config-AST corroboration. Only merged when a manifest/layout - // signal already fired: config text alone never creates a detection (a repo - // can carry a vendored config without using the framework). Stage-3 evidence - // sharpens an existing hit (e.g. Next.js App vs Pages router). - if ((hasManifestHit || hasFileHit) && configFindings !== undefined) { + // Stage 5 — import-graph signal. A `deterministic` import (scip-resolved) is + // authoritative enough to create a detection on its own; a `heuristic` one is + // recorded but only corroborates a hit from another stage. Evaluated before + // the stage-3 gate so a deterministic import satisfies the "something hit" + // condition. + if (importFindings !== undefined) { + for (const f of importFindings) { + push({ stage: 5, source: f.source, detail: `import: ${f.source} (${f.confidence})` }); + if (f.confidence === "deterministic") hasDeterministicImport = true; + } + } + + // Stage 3 — config-AST corroboration. Only merged when a manifest/layout/ + // import signal already fired: config text alone never creates a detection + // (a repo can carry a vendored config without using the framework). Stage-3 + // evidence sharpens an existing hit (e.g. Next.js App vs Pages router). + if ((hasManifestHit || hasFileHit || hasDeterministicImport) && configFindings !== undefined) { for (const f of configFindings) { push({ stage: 3, source: f.source, detail: f.detail }); } } - if (!hasManifestHit && !hasFileHit) return null; + if (!hasManifestHit && !hasFileHit && !hasDeterministicImport) return null; const sorted = [...evidenceSeen.values()].sort((a, b) => { if (a.stage !== b.stage) return a.stage - b.stage; if (a.source !== b.source) return a.source < b.source ? -1 : 1; return a.detail < b.detail ? -1 : a.detail > b.detail ? 1 : 0; }); - return { evidence: sorted, hasManifestHit, hasFileHit }; + return { evidence: sorted, hasManifestHit, hasFileHit, hasDeterministicImport }; } function matchManifestKey( @@ -268,7 +321,8 @@ function buildDetection( function inferConfidence(rule: FrameworkRule, hit: RuleHit): FrameworkDetection["confidence"] { if (rule.tier === "C") return "composite"; - if (hit.hasManifestHit) return "deterministic"; + // A manifest dep or a scip-resolved import are both authoritative. + if (hit.hasManifestHit || hit.hasDeterministicImport) return "deterministic"; // tier D/H with only file-level hits → heuristic. return "heuristic"; } diff --git a/packages/frameworks/src/frameworks.ts b/packages/frameworks/src/frameworks.ts index 696cffea..b48a867e 100644 --- a/packages/frameworks/src/frameworks.ts +++ b/packages/frameworks/src/frameworks.ts @@ -18,6 +18,7 @@ import { promises as fs } from "node:fs"; import path from "node:path"; import { detectFrameworksStructured } from "./detector.js"; import { CONFIG_AST_FILES } from "./stages/config-ast.js"; +import type { ImportStageGraph } from "./stages/imports.js"; import { indexResolutions, KNOWN_LOCKFILES, parseLockfile } from "./stages/lockfile.js"; /** @@ -42,6 +43,13 @@ export interface FrameworkDetectionInput { * every ecosystem" when omitted (keeps the legacy contract). */ readonly detectedLanguages?: readonly string[]; + /** + * Optional stage-5 import graph (the `KnowledgeGraph` after the parse phase). + * When supplied, `detectFromImports` reads its IMPORTS edges to external + * stubs and feeds the findings into detection. Absent for callers that run + * before parse or don't hold a graph — stage 5 then contributes nothing. + */ + readonly importGraph?: ImportStageGraph; } /** @@ -163,6 +171,7 @@ export async function detectFrameworks(input: FrameworkDetectionInput): Promise< lockfileVersions, configText, detectedLanguages: input.detectedLanguages ?? ALL_ECOSYSTEM_LANGUAGES, + ...(input.importGraph !== undefined ? { importGraph: input.importGraph } : {}), }); return detections.map((d) => d.name); } @@ -188,5 +197,6 @@ export async function detectFrameworksDetailed( lockfileVersions, configText, detectedLanguages: input.detectedLanguages ?? ALL_ECOSYSTEM_LANGUAGES, + ...(input.importGraph !== undefined ? { importGraph: input.importGraph } : {}), }); } diff --git a/packages/frameworks/src/index.ts b/packages/frameworks/src/index.ts index f0d021ab..099b7d9f 100644 --- a/packages/frameworks/src/index.ts +++ b/packages/frameworks/src/index.ts @@ -1,7 +1,7 @@ /** * `@opencodehub/frameworks` — framework detection over a curated catalog. * - * The dispatcher (`detector.ts`) merges four stages into each + * The dispatcher (`detector.ts`) merges five stages into each * `FrameworkDetection` (`{name, version?, confidence, evidence[]}`): * 1. Manifest presence + declared deps (`package.json`, `pyproject.toml`, * `pom.xml`, …) @@ -15,15 +15,13 @@ * (it corroborates, never creates a detection on its own). * 4. Folder / file-marker convention (`app/`, `pages/`, `vite.config.ts`, * `src/main/java/`, …) + * 5. Import / SCIP (`imports.ts`) — reads the graph's `IMPORTS` edges to + * external-import stubs via the `importGraph` input. The profile phase + * depends on `parse` so those edges exist by detection time. A + * `deterministic` (scip-resolved) import can create a detection on its + * own; a `heuristic` import only corroborates an existing hit. * - * One stage ships as a standalone, independently tested module but is not yet - * wired into the ingestion profile phase: - * 5. Import / SCIP (`imports.ts`) — consumes the graph's `IMPORTS` edges, - * which the profile phase (deps: [scan]) runs before. Wiring it needs a - * phase-ordering change (run framework detection after `crossFile`); a - * caller that already holds the resolved graph can pass it through today. - * - * Every stage is pure-local file-system + string/regex inspection; no + * Every stage is pure-local file-system / graph + string/regex inspection; no * network, no LLM, no subprocess. */ diff --git a/packages/ingestion/src/pipeline/orchestrator.test.ts b/packages/ingestion/src/pipeline/orchestrator.test.ts index d144c93f..264c5363 100644 --- a/packages/ingestion/src/pipeline/orchestrator.test.ts +++ b/packages/ingestion/src/pipeline/orchestrator.test.ts @@ -32,35 +32,29 @@ describe("runIngestion (end-to-end)", () => { assert.ok(result.graphHash.length === 64, "graphHash must be sha256 hex"); assert.ok(result.stats.nodeCount >= 2, "should have File + definition nodes"); assert.ok(result.stats.edgeCount >= 1); - // Topological order with alphabetic tiebreak — parse's descendants - // (orm/routes/tools) sort lexicographically, and crossFile/mro/ - // communities/processes/annotate follow. + // Topological order with alphabetic tiebreak. `profile` now depends on + // `parse` (so framework-detection stage 5 sees the parse-emitted IMPORTS + // edges), which lands it — plus its dependents `dependencies`/`repo-node`/ + // `coverage`/`sbom` — after `parse`/`complexity`/`orm`. assert.deepEqual( result.stats.phases.map((p) => p.name), [ "scan", "incremental-scope", - "profile", - "dependencies", - // `repo-node` depends on `profile` only, so the topological - // alphabetic tiebreak lands it after `dependencies` and before `sbom`. - "repo-node", - "sbom", "structure", - // `coverage` becomes runnable once `structure` completes; the - // alphabetic tiebreak lands it before `markdown`. - "coverage", "markdown", "parse", - // `business-logic` depends on parse + scan; once parse completes it is - // ready alongside complexity/orm/routes and the alphabetic tiebreak - // ("business-logic" < "complexity") lands it first. "business-logic", "complexity", "orm", + "profile", + "coverage", + "dependencies", + "repo-node", "routes", "fetches", "openapi", + "sbom", "temporal", "cochange", "tools", diff --git a/packages/ingestion/src/pipeline/phases/profile.ts b/packages/ingestion/src/pipeline/phases/profile.ts index 6ed8256e..bb4e2dfd 100644 --- a/packages/ingestion/src/pipeline/phases/profile.ts +++ b/packages/ingestion/src/pipeline/phases/profile.ts @@ -31,6 +31,7 @@ import { detectIaCTypes } from "../profile-detectors/iac.js"; import { detectLanguages } from "../profile-detectors/languages.js"; import { detectSrcDirs } from "../profile-detectors/src-dirs.js"; import type { PipelineContext, PipelinePhase } from "../types.js"; +import { PARSE_PHASE_NAME } from "./parse.js"; import { SCAN_PHASE_NAME, type ScanOutput } from "./scan.js"; export const PROFILE_PHASE_NAME = "profile" as const; @@ -43,7 +44,10 @@ export interface ProfileOutput { export const profilePhase: PipelinePhase = { name: PROFILE_PHASE_NAME, - deps: [SCAN_PHASE_NAME], + // `parse` is a dep (not just `scan`) so the import-graph IMPORTS edges that + // framework detection stage 5 reads from `ctx.graph` are guaranteed present. + // No cycle: parse deps [scan, structure], neither of which deps profile. + deps: [SCAN_PHASE_NAME, PARSE_PHASE_NAME], async run(ctx, deps) { const scan = deps.get(SCAN_PHASE_NAME) as ScanOutput | undefined; if (scan === undefined) { @@ -68,6 +72,10 @@ async function runProfile(ctx: PipelineContext, scan: ScanOutput): Promise