diff --git a/.erpaval/solutions/test-failures/run-biome-standalone-mise-lint-precheck-sandbox-fails.md b/.erpaval/solutions/test-failures/run-biome-standalone-mise-lint-precheck-sandbox-fails.md new file mode 100644 index 00000000..214d0ee1 --- /dev/null +++ b/.erpaval/solutions/test-failures/run-biome-standalone-mise-lint-precheck-sandbox-fails.md @@ -0,0 +1,53 @@ +# Run Biome standalone — the `mise run lint` precheck SIGTERMs in-sandbox + +**Category:** test-failures · **Track:** bug +**Discovered:** session-3b8ca0 (CI `lint` failed on a PR that passed local typecheck+test) + +## What happened + +Local validation ran `tsc --noEmit` + the test suites directly and called the +PR green. CI's `lint` job failed in 22s. Cause: every `mise run ` depends +on an `install` task that runs `pnpm install`, which gets **SIGTERM'd under the +sandbox** — so `mise run lint` (and `check`, which includes lint) never +actually executes Biome locally. typecheck and test were run via direct `pnpm` +invocations, but Biome was simply never run. + +Two classes of failure shipped to CI: +- `lint/suspicious/noConsole`: `console.log` is **off** in + `packages/cli/src/commands/**` (biome.json override) but **warns** in + `packages/cli/src/index.ts` (allows only `warn`/`error`). A `console.log` + added to `index.ts` fails lint; the fix is to emit stdout from a + command-module helper, not the CLI entrypoint. +- formatter line-wrapping nits Biome would rewrap. + +## The rule + +Before pushing, run Biome **standalone**, never via `mise`: + +```bash +pnpm exec biome check . # repo-wide, mirrors the CI lint job +pnpm exec biome check --write # auto-fix format + safe lint +``` + +The mise tasks (`lint`, `check`, `check:full`) all gate on the `install` +precheck that dies in-sandbox, so they are NOT a reliable local signal — invoke +the underlying tool directly (same lesson as running `tsc`/`node --test` via +`pnpm` instead of `mise run`). Generalizes: any mise task with +`depends = ["install"]` is unrunnable in-sandbox; reach for the underlying CLI. + +## stdout/console map (cli) + +`console.log` → only in `packages/cli/src/commands/**`. In `index.ts` and +elsewhere, only `console.warn` / `console.error` are allowed. Machine `--json` +output therefore belongs in a command module helper. + +## Update (session-3b8ca0, F1-F4 PR): CI runs `biome ci`, not `biome check` + +The CI `lint` job is `pnpm exec biome ci .` (see .github/workflows/ci.yml). `biome ci` +is STRICTER than `biome check`: it enforces the organize-imports/exports assist as an +ERROR (barrel exports must be alphabetically sorted), which `biome check` only auto-fixes +silently. A locally-clean `biome check .` still failed CI lint on export ordering. + +RULE: before push, run `pnpm exec biome ci .` (the exact CI command), not just +`biome check .`. To fix ordering: `pnpm exec biome check --write --unsafe .` reorders +exports (verify the diff is export-only — it is safe here, just reordering). diff --git a/packages/analysis/src/index.ts b/packages/analysis/src/index.ts index 2b17d6e0..343ad515 100644 --- a/packages/analysis/src/index.ts +++ b/packages/analysis/src/index.ts @@ -96,7 +96,7 @@ export type { } from "./license-classify.js"; export { classifyDependencies } from "./license-classify.js"; export type { OwnerRow } from "./owners.js"; -export { listOwners } from "./owners.js"; +export { collectOwnersByPath, listOwners } from "./owners.js"; export type { Adjacency, EdgeLike } from "./page-rank.js"; export { buildAdjacency, pageRank } from "./page-rank.js"; export type { OrphanGrade } from "./risk.js"; @@ -127,6 +127,7 @@ export { } from "./risk-snapshot.js"; export type { RouteMapFilter, RouteMapRow } from "./route-map.js"; export { listRouteMap } from "./route-map.js"; +export { buildScanEnrichment } from "./scan-enrich.js"; export type { ShapeStatus } from "./shape.js"; export { classifyShape } from "./shape.js"; export { computeStaleness } from "./staleness.js"; diff --git a/packages/analysis/src/owners.test.ts b/packages/analysis/src/owners.test.ts new file mode 100644 index 00000000..a0652b4e --- /dev/null +++ b/packages/analysis/src/owners.test.ts @@ -0,0 +1,103 @@ +/** + * Tests for `collectOwnersByPath` — the per-path owner map the verdict CLI + * feeds the policy engine's `ownership_required` rule. + * + * Regression context: the CLI used to hardcode `ownersByPath: new Map()`, so a + * rule with an empty `require_approval_from` (relying on graph owners) could + * never be satisfied — it always hit the "no owners" branch. These tests pin + * that the map is actually built from OWNED_BY edges. + */ + +import { strict as assert } from "node:assert"; +import { test } from "node:test"; +import { collectOwnersByPath } from "./owners.js"; +import { FakeStore } from "./test-utils.js"; + +function contributorNode(id: string, name: string, emailPlain?: string, emailHash?: string) { + return { + id, + kind: "Contributor", + name, + filePath: "", + ...(emailPlain !== undefined ? { emailPlain } : {}), + ...(emailHash !== undefined ? { emailHash } : {}), + }; +} + +test("collectOwnersByPath maps each path to its OWNED_BY owner emails", async () => { + const store = new FakeStore(); + store.addNode(contributorNode("Contributor:alice", "Alice", "alice@example.com")); + store.addNode(contributorNode("Contributor:bob", "Bob", "bob@example.com")); + store.addEdge({ + fromId: "File:src/a.ts:src/a.ts", + toId: "Contributor:alice", + type: "OWNED_BY", + confidence: 0.9, + }); + store.addEdge({ + fromId: "File:src/a.ts:src/a.ts", + toId: "Contributor:bob", + type: "OWNED_BY", + confidence: 0.4, + }); + + const map = await collectOwnersByPath(store, ["src/a.ts"]); + assert.deepEqual(map.get("src/a.ts"), ["alice@example.com", "bob@example.com"]); +}); + +test("collectOwnersByPath omits paths with no owner edges", async () => { + const store = new FakeStore(); + store.addNode(contributorNode("Contributor:alice", "Alice", "alice@example.com")); + store.addEdge({ + fromId: "File:src/a.ts:src/a.ts", + toId: "Contributor:alice", + type: "OWNED_BY", + confidence: 1, + }); + + const map = await collectOwnersByPath(store, ["src/a.ts", "src/unowned.ts"]); + assert.deepEqual(map.get("src/a.ts"), ["alice@example.com"]); + assert.equal(map.has("src/unowned.ts"), false); +}); + +test("collectOwnersByPath falls back to emailHash when plain email is absent", async () => { + const store = new FakeStore(); + store.addNode(contributorNode("Contributor:priv", "Private", undefined, "deadbeefhash")); + store.addEdge({ + fromId: "File:src/p.ts:src/p.ts", + toId: "Contributor:priv", + type: "OWNED_BY", + confidence: 1, + }); + + const map = await collectOwnersByPath(store, ["src/p.ts"]); + assert.deepEqual(map.get("src/p.ts"), ["deadbeefhash"]); +}); + +test("collectOwnersByPath returns an empty map for no files", async () => { + const store = new FakeStore(); + const map = await collectOwnersByPath(store, []); + assert.equal(map.size, 0); +}); + +test("collectOwnersByPath keeps per-path owners distinct (no cross-file bleed)", async () => { + const store = new FakeStore(); + store.addNode(contributorNode("Contributor:alice", "Alice", "alice@example.com")); + store.addNode(contributorNode("Contributor:bob", "Bob", "bob@example.com")); + store.addEdge({ + fromId: "File:src/a.ts:src/a.ts", + toId: "Contributor:alice", + type: "OWNED_BY", + confidence: 1, + }); + store.addEdge({ + fromId: "File:src/b.ts:src/b.ts", + toId: "Contributor:bob", + type: "OWNED_BY", + confidence: 1, + }); + + const map = await collectOwnersByPath(store, ["src/a.ts", "src/b.ts"]); + assert.deepEqual(map.get("src/a.ts"), ["alice@example.com"]); + assert.deepEqual(map.get("src/b.ts"), ["bob@example.com"]); +}); diff --git a/packages/analysis/src/owners.ts b/packages/analysis/src/owners.ts index 9978b19c..ae8b1931 100644 --- a/packages/analysis/src/owners.ts +++ b/packages/analysis/src/owners.ts @@ -51,3 +51,65 @@ export async function listOwners( } return owners; } + +/** + * Map each repo-relative path to its `OWNED_BY` contributor identifiers, for + * the policy engine's `ownership_required` rule. One batched edge query over + * every File node, grouped by source file — so a verdict over N changed files + * is a single graph round-trip, not N calls to {@link listOwners}. + * + * Owner identity is the contributor's plain email when present (what an + * operator writes in `require_approval_from`), falling back to the email hash. + * Identifiers per path are deduped and sorted for deterministic output. Paths + * with no owner edges are omitted (the policy engine treats a missing entry as + * "no graph owners"). An empty `files` list returns an empty map. + */ +export async function collectOwnersByPath( + graph: IGraphStore, + files: readonly string[], +): Promise> { + const out = new Map(); + if (files.length === 0) return out; + // Defensive: legacy / minimal test fakes implement only part of IGraphStore. + // A store without the edge/node readers yields an empty map rather than + // throwing — the policy engine then treats every path as "no graph owners", + // exactly as before this wiring existed. + if (typeof graph.listEdgesByType !== "function" || typeof graph.listNodesByKind !== "function") { + return out; + } + + const fileNodeIds = files.map((f) => `File:${f}:${f}`); + const edges = await graph.listEdgesByType("OWNED_BY", { fromIds: fileNodeIds }); + if (edges.length === 0) return out; + + const contributors = await graph.listNodesByKind("Contributor"); + const contribById = new Map(); + for (const c of contributors) contribById.set(c.id, c); + + // File node id back to its repo-relative path (id form is `File::`). + const pathByNodeId = new Map(); + for (let i = 0; i < files.length; i += 1) { + const path = files[i]; + const id = fileNodeIds[i]; + if (path !== undefined && id !== undefined) pathByNodeId.set(id, path); + } + + const idsByPath = new Map>(); + for (const edge of edges) { + const path = pathByNodeId.get(edge.from); + if (path === undefined) continue; + const c = contribById.get(edge.to); + if (c === undefined) continue; + const plain = typeof c.emailPlain === "string" ? c.emailPlain : ""; + const identifier = plain.length > 0 ? plain : c.emailHash; + if (identifier.length === 0) continue; + const set = idsByPath.get(path) ?? new Set(); + set.add(identifier); + idsByPath.set(path, set); + } + + for (const [path, set] of idsByPath) { + out.set(path, [...set].sort()); + } + return out; +} diff --git a/packages/analysis/src/scan-enrich.test.ts b/packages/analysis/src/scan-enrich.test.ts new file mode 100644 index 00000000..abb6b95f --- /dev/null +++ b/packages/analysis/src/scan-enrich.test.ts @@ -0,0 +1,110 @@ +/** + * Tests for `buildScanEnrichment` — the per-result graph-signal map the scan + * pipeline feeds `enrichWithProperties`. + * + * Regression context: `enrichWithProperties` had zero production callers, so + * `scan.sarif` shipped with no `opencodehub.*` graph signals. These tests pin + * that the builder maps a result to its File node's signals, keyed by the + * `primaryLocationLineHash` fingerprint (run-structure-independent). + */ + +import { strict as assert } from "node:assert"; +import { test } from "node:test"; +import type { SarifLog } from "@opencodehub/sarif"; +import { buildScanEnrichment } from "./scan-enrich.js"; +import { FakeStore } from "./test-utils.js"; + +/** Minimal SARIF result with a primary-location uri + fingerprint. */ +function result(uri: string, fingerprint: string) { + return { + ruleId: "demo-rule", + level: "warning", + message: { text: "x" }, + locations: [{ physicalLocation: { artifactLocation: { uri } } }], + partialFingerprints: { primaryLocationLineHash: fingerprint }, + }; +} + +function logWith(results: ReadonlyArray>): SarifLog { + return { + version: "2.1.0", + runs: [{ tool: { driver: { name: "demo", rules: [] } }, results }], + } as unknown as SarifLog; +} + +test("buildScanEnrichment maps a result to its File node signals by fingerprint", async () => { + const store = new FakeStore(); + store.addNode({ + id: "File:src/a.ts:src/a.ts", + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + busFactor: 2, + fixFollowFeatDensity: 0.5, + }); + const log = logWith([result("src/a.ts", "fp-a")]); + + const enrichment = await buildScanEnrichment(store, log, "/repo"); + const byFp = enrichment.byResultFingerprint; + assert.ok(byFp !== undefined, "byResultFingerprint must be present"); + assert.deepEqual(byFp?.get("fp-a"), { busFactor: 2, temporalFixDensity: 0.5 }); + // Run-level stamp is deterministic (no clock / run id). + assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); +}); + +test("buildScanEnrichment normalizes an absolute result uri to the repo-relative node id", async () => { + // Scanners (e.g. semgrep) emit absolute uris; the File node id is keyed by + // the repo-relative path, so the builder must strip the repoPath prefix or + // it would never match — the bug this normalization fixes. + const store = new FakeStore(); + store.addNode({ + id: "File:src/a.ts:src/a.ts", + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + busFactor: 4, + }); + const log = logWith([result("/repo/src/a.ts", "fp-abs")]); + + const enrichment = await buildScanEnrichment(store, log, "/repo"); + assert.deepEqual(enrichment.byResultFingerprint?.get("fp-abs"), { busFactor: 4 }); +}); + +test("buildScanEnrichment omits results whose file has no materialized signals", async () => { + const store = new FakeStore(); + store.addNode({ + id: "File:src/bare.ts:src/bare.ts", + kind: "File", + name: "bare.ts", + filePath: "src/bare.ts", + }); + const log = logWith([result("src/bare.ts", "fp-bare")]); + + const enrichment = await buildScanEnrichment(store, log, "/repo"); + // No signals → no per-result map, but the run-level stamp still returns. + assert.equal(enrichment.byResultFingerprint, undefined); + assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); +}); + +test("buildScanEnrichment is byte-stable across two runs (no clock/run id)", async () => { + const store = new FakeStore(); + store.addNode({ + id: "File:src/a.ts:src/a.ts", + kind: "File", + name: "a.ts", + filePath: "src/a.ts", + busFactor: 3, + }); + const log = logWith([result("src/a.ts", "fp-a")]); + + const a = await buildScanEnrichment(store, log, "/repo"); + const b = await buildScanEnrichment(store, log, "/repo"); + assert.deepEqual(a, b); +}); + +test("buildScanEnrichment returns only the run stamp for an empty log", async () => { + const store = new FakeStore(); + const enrichment = await buildScanEnrichment(store, logWith([]), "/repo"); + assert.equal(enrichment.byResultFingerprint, undefined); + assert.deepEqual(enrichment.run, { enrichmentVersion: "1", sources: ["graph"] }); +}); diff --git a/packages/analysis/src/scan-enrich.ts b/packages/analysis/src/scan-enrich.ts new file mode 100644 index 00000000..571b602f --- /dev/null +++ b/packages/analysis/src/scan-enrich.ts @@ -0,0 +1,143 @@ +/** + * `buildScanEnrichment` — derive graph signals for each SARIF scan result so + * `enrichWithProperties` can stamp them under `properties.opencodehub.*`. + * + * Maps every result to the File node for its primary location and reads the + * file-granular signals already materialized on that node by the ingestion + * temporal phases: bus factor and fix-follow-feat density (→ temporalFixDensity). + * + * Scope: only signals that are a direct, cheap read off the File node are + * emitted here. `ownershipDrift`/`cochangeScore` live on the community node / + * temporal table, and symbol-level signals need live computation per finding + * (blastRadius via `runImpact`, community via `MEMBER_OF`, centrality via + * PageRank) — all deliberately omitted rather than approximated. Every + * `ResultEnrichment` field is optional, so omitting them is honest, not lossy. + * + * Determinism: the enrichment is a pure function of the graph + the (already + * deterministic) SARIF; no clock or run id is emitted, so a re-scan of the + * same commit produces byte-identical enriched output. + */ + +import type { GraphNode } from "@opencodehub/core-types"; +import type { EnrichmentInput, ResultEnrichment, SarifLog } from "@opencodehub/sarif"; +import type { IGraphStore } from "@opencodehub/storage"; + +/** + * Pull the primary-location file uri off a SARIF result and normalize it to + * the repo-relative POSIX form the File node id uses (`File::`). + * Scanners emit a mix of absolute and relative uris; the graph keys files by + * the repo-relative path, so an un-normalized absolute uri would never match. + */ +function resultUri(result: unknown, repoPath: string): string | undefined { + const loc = ( + result as { + locations?: ReadonlyArray<{ physicalLocation?: { artifactLocation?: { uri?: unknown } } }>; + } + ).locations?.[0]?.physicalLocation?.artifactLocation?.uri; + if (typeof loc !== "string" || loc.length === 0) return undefined; + return toRepoRelative(loc, repoPath); +} + +/** Strip a leading repoPath (and `file://`) so the uri matches the graph's relative key. */ +function toRepoRelative(uri: string, repoPath: string): string { + // Normalize separators to POSIX first: File node ids are `/`-keyed, and on + // Windows the repoPath/uri carry backslashes, so a raw prefix compare would + // fail to strip and the lookup would never match. + let path = (uri.startsWith("file://") ? uri.slice("file://".length) : uri).split("\\").join("/"); + const repo = repoPath.split("\\").join("/"); + const prefix = repo.endsWith("/") ? repo : `${repo}/`; + if (path.startsWith(prefix)) path = path.slice(prefix.length); + return path; +} + +/** + * Read the `primaryLocationLineHash` partial fingerprint — the same key the + * enricher's `byResultFingerprint` lookup uses. `enrichWithFingerprints` runs + * before this, so every result carries one. Keying by fingerprint (not index) + * is run-structure-independent: scan merges each scanner into its own SARIF + * run, and the enricher indexes per-run, so a global index would misalign. + */ +function resultFingerprint(result: unknown): string | undefined { + const pf = (result as { partialFingerprints?: { primaryLocationLineHash?: unknown } }) + .partialFingerprints?.primaryLocationLineHash; + return typeof pf === "string" && pf.length > 0 ? pf : undefined; +} + +/** Project the file-granular signals off a File node into a ResultEnrichment. */ +function enrichmentForFile(file: GraphNode): ResultEnrichment | undefined { + if (file.kind !== "File") return undefined; + const out: { + busFactor?: number; + temporalFixDensity?: number; + } = {}; + if (typeof file.busFactor === "number") out.busFactor = file.busFactor; + if (typeof file.fixFollowFeatDensity === "number") { + out.temporalFixDensity = file.fixFollowFeatDensity; + } + // `ownershipDrift` and `cochangeScore` are community-level / temporal-table + // signals, not materialized on the File node — omitted here rather than + // approximated. `blastRadius`/`community`/`centrality` need per-finding graph + // computation; a follow-up can add them behind a budget. + return Object.keys(out).length > 0 ? out : undefined; +} + +/** + * Build the {@link EnrichmentInput} for a scan SARIF log. Returns a + * fingerprint-keyed map (`byResultFingerprint`) plus a stable run-level stamp. + * Results whose file has no materialized signals are simply absent from the + * map (the enricher leaves those results untouched). + * + * Defensive: a store without `listNodes` (minimal test fakes) yields only the + * run-level stamp, never a throw. + */ +export async function buildScanEnrichment( + graph: IGraphStore, + sarif: SarifLog, + repoPath: string, +): Promise { + const run: EnrichmentInput["run"] = { + enrichmentVersion: "1", + sources: ["graph"], + }; + if (typeof graph.listNodes !== "function") return { run }; + + // Collect the distinct file uris referenced by results, in run+result order + // so the index map lines up with how the enricher walks the log. + const uris = new Set(); + for (const r of sarif.runs) { + for (const result of r.results ?? []) { + const uri = resultUri(result, repoPath); + if (uri !== undefined) uris.add(uri); + } + } + if (uris.size === 0) return { run }; + + // One batched File-node read keyed by node id (`File::`). + const idByUri = new Map(); + for (const uri of uris) idByUri.set(uri, `File:${uri}:${uri}`); + const fileNodes = await graph.listNodes({ ids: [...idByUri.values()], kinds: ["File"] }); + const enrichmentByUri = new Map(); + for (const node of fileNodes) { + if (node.kind !== "File") continue; + const enrichment = enrichmentForFile(node); + if (enrichment !== undefined) enrichmentByUri.set(node.filePath, enrichment); + } + if (enrichmentByUri.size === 0) return { run }; + + // Key each result's enrichment by its primaryLocationLineHash fingerprint + // (run-structure-independent; see resultFingerprint). Results without a + // fingerprint or whose file has no signals are simply absent. + const byResultFingerprint = new Map(); + for (const r of sarif.runs) { + for (const result of r.results ?? []) { + const fp = resultFingerprint(result); + if (fp === undefined) continue; + const uri = resultUri(result, repoPath); + const enrichment = uri !== undefined ? enrichmentByUri.get(uri) : undefined; + if (enrichment !== undefined) byResultFingerprint.set(fp, enrichment); + } + } + if (byResultFingerprint.size === 0) return { run }; + + return { byResultFingerprint, run }; +} diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index 3d1ec906..e930aedb 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -311,6 +311,7 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi maxSummariesPerRun: resolvedMaxSummaries, ...(opts.summaryModel !== undefined ? { summaryModel: opts.summaryModel } : {}), ...(opts.strictDetectors !== undefined ? { strictDetectors: opts.strictDetectors } : {}), + ...(opts.allowBuildScripts !== undefined ? { allowBuildScripts: opts.allowBuildScripts } : {}), ...(summaryCacheAdapter !== undefined ? { summaryCacheAdapter: summaryCacheAdapter.adapter } : {}), diff --git a/packages/cli/src/commands/scan.ts b/packages/cli/src/commands/scan.ts index 46dec106..ce8b7dfe 100644 --- a/packages/cli/src/commands/scan.ts +++ b/packages/cli/src/commands/scan.ts @@ -27,22 +27,24 @@ import { readFileSync } from "node:fs"; import { mkdir, readFile, writeFile } from "node:fs/promises"; import { join, resolve } from "node:path"; +import { buildScanEnrichment } from "@opencodehub/analysis"; import { pipeline } from "@opencodehub/ingestion"; import { applyBaselineState, applySuppressions, enrichWithFingerprints, + enrichWithProperties, loadSuppressions, type SarifLog, SarifLogSchema, } from "@opencodehub/sarif"; import { ALL_SPECS, + buildScannerFileContext, CHECKOV_SPEC, createDefaultWrappers, type DefaultWrapperContext, filterSpecsByProfile, - HADOLINT_SPEC, P1_SPECS, PIP_AUDIT_SPEC, type ProjectProfileGate, @@ -50,7 +52,6 @@ import { runScanners, type ScannerSpec, type ScannerStatus, - SPECTRAL_SPEC, TY_SPEC, VULTURE_SPEC, } from "@opencodehub/scanners"; @@ -150,7 +151,14 @@ export async function runScan(path: string, opts: ScanOptions = {}): Promise { + try { + const { store } = await openStoreForCommand({ repo: opts.repo ?? repoPath, readOnly: true }); + try { + const graph = "graph" in store ? store.graph : store; + const enrichment = await buildScanEnrichment(graph, log, repoPath); + return enrichWithProperties(log, enrichment); + } finally { + await store.close(); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`codehub scan: graph-signal enrichment skipped: ${msg}`); + return log; + } +} + /** * Read the ProjectProfile node (if present) so we can gate scanners by * detected languages, IaC types, and API contracts. If the graph is @@ -367,18 +404,16 @@ async function buildWrapperContext( specs: readonly ScannerSpec[], ): Promise { const ids = new Set(specs.map((s) => s.id)); + // Spectral contract files + hadolint Dockerfiles come from the shared + // discoverer so the CLI and MCP `scan` tool can never drift on what they + // hand those wrappers (the MCP path previously omitted it entirely). + const fileCtx = await buildScannerFileContext(repoPath, specs); const ctx: { -readonly [K in keyof DefaultWrapperContext]?: DefaultWrapperContext[K]; - } = {}; + } = { ...fileCtx }; if (ids.has(CHECKOV_SPEC.id)) { ctx.checkov = { frameworks: profile.iacTypes ?? [] }; } - if (ids.has(HADOLINT_SPEC.id)) { - ctx.hadolint = { dockerfiles: await findDockerfiles(repoPath) }; - } - if (ids.has(SPECTRAL_SPEC.id)) { - ctx.spectral = { contractFiles: await findOpenApiFiles(repoPath) }; - } if (ids.has(PIP_AUDIT_SPEC.id)) { // The wrapper auto-detects: a real requirements.txt is audited directly; // otherwise it bridges pyproject.toml through `uv export`. The transient @@ -402,79 +437,6 @@ async function buildWrapperContext( return ctx; } -/** - * Walk the repo for Dockerfile* files. Bounded to one breadth-first pass - * with a per-directory file cap so huge repos don't explode; the typical - * case has ≤5 Dockerfiles. - */ -async function findDockerfiles(repoPath: string): Promise { - const { readdir } = await import("node:fs/promises"); - const { join, relative } = await import("node:path"); - type DirEntry = import("node:fs").Dirent; - const MAX_FILES = 256; - const out: string[] = []; - const queue: string[] = [repoPath]; - while (queue.length > 0 && out.length < MAX_FILES) { - const dir = queue.shift(); - if (dir === undefined) break; - let entries: DirEntry[]; - try { - entries = await readdir(dir, { withFileTypes: true }); - } catch { - continue; - } - for (const e of entries) { - if (e.name === "node_modules" || e.name === ".git" || e.name.startsWith(".codehub")) { - continue; - } - const abs = join(dir, e.name); - if (e.isDirectory()) { - queue.push(abs); - } else if (e.isFile() && /^Dockerfile(\..+)?$/.test(e.name)) { - out.push(relative(repoPath, abs)); - } - } - } - return out; -} - -/** - * Locate OpenAPI / Swagger / AsyncAPI / Arazzo contracts. Mirrors the - * ProjectProfile API-contract detector's matching rules but just pulls - * the paths (no content sniff — good enough for Spectral invocation). - */ -async function findOpenApiFiles(repoPath: string): Promise { - const { readdir } = await import("node:fs/promises"); - const { join, relative } = await import("node:path"); - type DirEntry = import("node:fs").Dirent; - const MAX_FILES = 64; - const RE = /^(openapi|swagger|asyncapi|arazzo)\.(ya?ml|json)$/i; - const out: string[] = []; - const queue: string[] = [repoPath]; - while (queue.length > 0 && out.length < MAX_FILES) { - const dir = queue.shift(); - if (dir === undefined) break; - let entries: DirEntry[]; - try { - entries = await readdir(dir, { withFileTypes: true }); - } catch { - continue; - } - for (const e of entries) { - if (e.name === "node_modules" || e.name === ".git" || e.name.startsWith(".codehub")) { - continue; - } - const abs = join(dir, e.name); - if (e.isDirectory()) { - queue.push(abs); - } else if (e.isFile() && RE.test(e.name)) { - out.push(relative(repoPath, abs)); - } - } - } - return out; -} - function resolveSeverityThreshold(input: readonly string[] | undefined): ReadonlySet { if (input === undefined || input.length === 0) return DEFAULT_SEVERITY_THRESHOLD; const out = new Set(); diff --git a/packages/cli/src/commands/verdict.ts b/packages/cli/src/commands/verdict.ts index 864c5e4a..fdfbc76a 100644 --- a/packages/cli/src/commands/verdict.ts +++ b/packages/cli/src/commands/verdict.ts @@ -5,6 +5,7 @@ import { join } from "node:path"; import { classifyDependencies, + collectOwnersByPath, computeVerdict, type DependencyRef, type VerdictConfig, @@ -125,11 +126,19 @@ export async function runVerdict(opts: VerdictCliOptions = {}): Promise { // Only pay the dependency scan when a policy is actually loaded — the // starter (no-file) repo skips it entirely. const licenseViolations = policy !== undefined ? await collectLicenseViolations(store) : []; + // Map each changed path to its graph owners so `ownership_required` rules + // that rely on graph-derived owners (empty `require_approval_from`) can + // actually be satisfied. Only paid when a policy is loaded — the starter + // (no-policy) repo skips the graph walk entirely. + const ownersByPath = + policy !== undefined + ? await collectOwnersByPath(graph, verdict.changedFiles) + : new Map(); const policyDecision = policy !== undefined ? evaluatePolicy( policy, - buildPolicyContext(verdict, opts.approvals ?? [], licenseViolations), + buildPolicyContext(verdict, opts.approvals ?? [], licenseViolations, ownersByPath), ) : undefined; @@ -162,6 +171,7 @@ function buildPolicyContext( verdict: VerdictResponse, approvals: readonly string[], licenseViolations: readonly LicenseViolationInput[], + ownersByPath: ReadonlyMap, ): PolicyContext { return { // License findings come from `classifyDependencies` over the indexed @@ -174,11 +184,13 @@ function buildPolicyContext( // diff: the changed-file list `computeVerdict` already derived from // `detect_changes`, surfaced on the response. touchedPaths: verdict.changedFiles, - // ownersByPath stays empty: mapping each path to an approving owner - // needs a contributor-email -> team/handle reconciliation source that - // does not exist yet (separate design item). `require_approval_from` - // still works because the evaluator unions it with any path owners. - ownersByPath: new Map(), + // Each changed path's graph owners (OWNED_BY contributor emails), so an + // `ownership_required` rule that leans on graph owners — empty + // `require_approval_from` — can be satisfied instead of unconditionally + // blocking with "no owners". Owner identity is the contributor email; + // team/handle reconciliation (mapping an email to a GitHub team) remains + // a separate design item that operators bridge via `require_approval_from`. + ownersByPath, approvals, }; } diff --git a/packages/ingestion/src/pipeline/orchestrator.test.ts b/packages/ingestion/src/pipeline/orchestrator.test.ts index d247a85f..d144c93f 100644 --- a/packages/ingestion/src/pipeline/orchestrator.test.ts +++ b/packages/ingestion/src/pipeline/orchestrator.test.ts @@ -140,6 +140,7 @@ describe("runIngestion option normalization", () => { maxSummariesPerRun: 7, summaryModel: "model-x", strictDetectors: true, + allowBuildScripts: ["proleap"], }; await runIngestion(repo, { ...options, phases: [probe] }); @@ -161,6 +162,7 @@ describe("runIngestion option normalization", () => { assert.equal(seen.embeddingsBatchSize, 16); assert.equal(seen.coverage, true); assert.equal(seen.strictDetectors, true); + assert.deepEqual(seen.allowBuildScripts, ["proleap"]); }); it("drops orchestrator-only keys from ctx.options", async () => { diff --git a/packages/ingestion/src/pipeline/phases/scip-index.ts b/packages/ingestion/src/pipeline/phases/scip-index.ts index 1194c807..2136e556 100644 --- a/packages/ingestion/src/pipeline/phases/scip-index.ts +++ b/packages/ingestion/src/pipeline/phases/scip-index.ts @@ -123,10 +123,30 @@ async function runScipIndex( }; } + // Build-script opt-ins come from the analyze pipeline options; fall back to + // the legacy env var so an explicit `CODEHUB_ALLOW_BUILD_SCRIPTS=1` keeps + // working. `allowedBuildScripts` is the fine-grained whitelist; the boolean + // `allowBuildScripts` is what the rust/java indexers gate on. + const optedBuildScripts = ctx.options.allowBuildScripts ?? []; + const allowBuildScripts = + optedBuildScripts.length > 0 || process.env["CODEHUB_ALLOW_BUILD_SCRIPTS"] === "1"; + const allowedBuildScripts = optedBuildScripts; + const projectLanguages = new Set(profileNode.languages); const candidates = detectLanguages(ctx.repoPath).filter((k: IndexerKind) => projectLanguages.has(scipLangToOchLang(k)), ); + // The detector never infers the proleap kind (it has no on-disk signature + // distinct from regex COBOL). When the operator opts into proleap AND the + // project has COBOL, append the deep-parse indexer to the detected set — + // `runIndexer` still gates it on `allowedBuildScripts.includes("proleap")`. + if ( + allowedBuildScripts.includes("proleap") && + projectLanguages.has(scipLangToOchLang("cobol-proleap")) && + !candidates.includes("cobol-proleap") + ) { + candidates.push("cobol-proleap"); + } if (candidates.length === 0) { return { @@ -141,7 +161,6 @@ async function runScipIndex( const outputDir = join(ctx.repoPath, META_DIR_NAME, "scip"); const offline = Boolean(ctx.options.offline); - const allowBuildScripts = process.env["CODEHUB_ALLOW_BUILD_SCRIPTS"] === "1"; const perLang: ScipIndexPerLanguage[] = []; const nodesByFile = indexNodesByFile(ctx); @@ -176,6 +195,7 @@ async function runScipIndex( projectRoot: ctx.repoPath, outputDir, allowBuildScripts, + allowedBuildScripts, }); } catch (err) { ctx.onProgress?.({ @@ -310,13 +330,15 @@ interface ProfileNodeLike { * extension. C++-specific consumers see `clang` under the indexer name * in provenance reasons. * - * `cobol-proleap` has `provenance: null`: COBOL relations are emitted by - * the in-process tree-sitter/regex bridge during the parse phase, not via - * SCIP derivation, and `detectLanguages` never yields the proleap kind as - * a scip-index candidate — so `result.kind` at the `lookupProvenance` - * call site can never be `cobol-proleap`. The `null` states that honestly - * instead of the prior `scip-typescript` placeholder that only existed to - * satisfy switch exhaustiveness. + * `cobol-proleap` has `provenance: null`: the proleap deep-parse emits its + * relations through its own bridge, not via the generic SCIP-derivation path + * `lookupProvenance` serves. `detectLanguages` never *infers* the proleap kind + * (it has no on-disk signature beyond regex COBOL); it only enters the + * candidate set when the operator opts in via `--allow-build-scripts proleap` + * on a COBOL project, and even then `runIndexer` re-gates it on + * `allowedBuildScripts.includes("proleap")`. The `null` states the absent + * SCIP provenance honestly instead of a placeholder kept only for switch + * exhaustiveness. * * `Record` keeps the same compile-time * exhaustiveness the per-kind switches got from diff --git a/packages/ingestion/src/pipeline/types.ts b/packages/ingestion/src/pipeline/types.ts index f556e682..7b600d59 100644 --- a/packages/ingestion/src/pipeline/types.ts +++ b/packages/ingestion/src/pipeline/types.ts @@ -201,6 +201,15 @@ export interface PipelineOptions { * ts-morph. Exposed by the `codehub analyze --strict-detectors` flag. */ readonly strictDetectors?: boolean; + /** + * Opt-ins that enable build-script-driven SCIP indexers. Current surface: + * `"proleap"` wakes the JVM COBOL deep-parse bridge (`@opencodehub/cobol-proleap`). + * Unset → the COBOL hot path stays regex-only and the proleap indexer is + * never appended to the detected set. Surfaced by `codehub analyze + * --allow-build-scripts`; the `scip-index` phase reads this and falls back + * to the `CODEHUB_ALLOW_BUILD_SCRIPTS=1` env var when unset. + */ + readonly allowBuildScripts?: readonly "proleap"[]; } /** Lightweight progress event emitted during pipeline execution. */ diff --git a/packages/mcp/src/tools/scan.ts b/packages/mcp/src/tools/scan.ts index 81a33573..39517b4a 100644 --- a/packages/mcp/src/tools/scan.ts +++ b/packages/mcp/src/tools/scan.ts @@ -20,7 +20,10 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import type { SarifLog } from "@opencodehub/sarif"; import { ALL_SPECS, + buildScannerFileContext, + CHECKOV_SPEC, createDefaultWrappers, + type DefaultWrapperContext, filterSpecsByProfile, type ProjectProfileGate, runScanners, @@ -85,7 +88,12 @@ export async function runScan(ctx: ToolContext, args: ScanArgs): Promise { + const fileCtx = await buildScannerFileContext(repoPath, specs); + const ctx: DefaultWrapperContext = { ...fileCtx }; + if (specs.some((s) => s.id === CHECKOV_SPEC.id)) { + const profile = await readProfile(graph); + return { ...ctx, checkov: { frameworks: profile.iacTypes ?? [] } }; + } + return ctx; +} + async function readProfile( graph: import("@opencodehub/storage").IGraphStore, ): Promise { diff --git a/packages/scanners/src/file-context.test.ts b/packages/scanners/src/file-context.test.ts new file mode 100644 index 00000000..a13daa93 --- /dev/null +++ b/packages/scanners/src/file-context.test.ts @@ -0,0 +1,68 @@ +/** + * Tests for the shared scanner file-context discoverer. + * + * This is the single source both `codehub scan` (CLI) and the MCP `scan` tool + * use to hand Spectral its contract files and hadolint its Dockerfiles. The + * MCP path previously omitted this entirely, so Spectral silently linted + * nothing on OpenAPI repos — these tests pin the wiring. + */ + +import { strict as assert } from "node:assert"; +import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, test } from "node:test"; +import { CHECKOV_SPEC, HADOLINT_SPEC, SPECTRAL_SPEC } from "./catalog.js"; +import { buildScannerFileContext, findDockerfiles, findOpenApiFiles } from "./file-context.js"; + +let repo: string; + +beforeEach(async () => { + repo = await mkdtemp(join(tmpdir(), "codehub-filectx-")); +}); +afterEach(async () => { + await rm(repo, { recursive: true, force: true }); +}); + +test("findOpenApiFiles discovers contracts and skips node_modules/.git/.codehub", async () => { + await writeFile(join(repo, "openapi.yaml"), "openapi: 3.0.0\n"); + await mkdir(join(repo, "api"), { recursive: true }); + await writeFile(join(repo, "api", "swagger.json"), "{}"); + await mkdir(join(repo, "node_modules", "pkg"), { recursive: true }); + await writeFile(join(repo, "node_modules", "pkg", "openapi.yaml"), "ignored"); + await mkdir(join(repo, ".codehub"), { recursive: true }); + await writeFile(join(repo, ".codehub", "openapi.json"), "ignored"); + + const found = [...(await findOpenApiFiles(repo))].sort(); + assert.deepEqual(found, ["api/swagger.json", "openapi.yaml"]); +}); + +test("findDockerfiles matches Dockerfile and Dockerfile.", async () => { + await writeFile(join(repo, "Dockerfile"), "FROM scratch\n"); + await writeFile(join(repo, "Dockerfile.prod"), "FROM scratch\n"); + await writeFile(join(repo, "notadockerfile.txt"), "x"); + const found = [...(await findDockerfiles(repo))].sort(); + assert.deepEqual(found, ["Dockerfile", "Dockerfile.prod"]); +}); + +test("buildScannerFileContext populates spectral.contractFiles when SPECTRAL is selected", async () => { + await writeFile(join(repo, "openapi.yaml"), "openapi: 3.0.0\n"); + const ctx = await buildScannerFileContext(repo, [SPECTRAL_SPEC]); + assert.ok(ctx.spectral !== undefined, "spectral context must be populated"); + assert.deepEqual(ctx.spectral?.contractFiles, ["openapi.yaml"]); +}); + +test("buildScannerFileContext populates hadolint.dockerfiles when HADOLINT is selected", async () => { + await writeFile(join(repo, "Dockerfile"), "FROM scratch\n"); + const ctx = await buildScannerFileContext(repo, [HADOLINT_SPEC]); + assert.deepEqual(ctx.hadolint?.dockerfiles, ["Dockerfile"]); +}); + +test("buildScannerFileContext skips discovery for unselected specs (no wasted walk)", async () => { + await writeFile(join(repo, "openapi.yaml"), "openapi: 3.0.0\n"); + await writeFile(join(repo, "Dockerfile"), "FROM scratch\n"); + // Only checkov selected → neither spectral nor hadolint file lists built. + const ctx = await buildScannerFileContext(repo, [CHECKOV_SPEC]); + assert.equal(ctx.spectral, undefined); + assert.equal(ctx.hadolint, undefined); +}); diff --git a/packages/scanners/src/file-context.ts b/packages/scanners/src/file-context.ts new file mode 100644 index 00000000..41c30cff --- /dev/null +++ b/packages/scanners/src/file-context.ts @@ -0,0 +1,105 @@ +/** + * Shared file-discovery for the scanner wrapper context. + * + * Spectral (OpenAPI lint) and hadolint (Dockerfile lint) do not recurse a + * directory themselves — each needs an explicit file list, or it lints + * nothing. Both the CLI (`codehub scan`) and the MCP `scan` tool must supply + * that list; centralizing the walk here keeps the two surfaces from drifting + * (the MCP tool previously omitted it, so Spectral silently scanned nothing). + * + * Pure `node:fs` breadth-first walks, bounded by a file cap so a huge repo + * never explodes. No package dependencies beyond the spec ids. + */ + +import { HADOLINT_SPEC, SPECTRAL_SPEC } from "./catalog.js"; +import type { ScannerSpec } from "./spec.js"; + +/** Directory names skipped by every discovery walk. */ +const SKIP_DIRS = new Set(["node_modules", ".git"]); + +/** Matches Dockerfile, Dockerfile.prod, etc. */ +const DOCKERFILE_RE = /^Dockerfile(\..+)?$/; + +/** Matches openapi/swagger/asyncapi/arazzo .yaml|.yml|.json (case-insensitive). */ +const CONTRACT_RE = /^(openapi|swagger|asyncapi|arazzo)\.(ya?ml|json)$/i; + +interface WalkOpts { + readonly match: RegExp; + readonly maxFiles: number; +} + +/** + * Breadth-first walk returning repo-relative POSIX paths of files whose + * basename matches `opts.match`, capped at `opts.maxFiles`. Unreadable + * directories are skipped, not fatal. + */ +async function walkForFiles(repoPath: string, opts: WalkOpts): Promise { + const { readdir } = await import("node:fs/promises"); + const { join, relative, sep } = await import("node:path"); + type DirEntry = import("node:fs").Dirent; + const out: string[] = []; + const queue: string[] = [repoPath]; + while (queue.length > 0 && out.length < opts.maxFiles) { + const dir = queue.shift(); + if (dir === undefined) break; + let entries: DirEntry[]; + try { + entries = await readdir(dir, { withFileTypes: true }); + } catch { + continue; + } + for (const e of entries) { + if (SKIP_DIRS.has(e.name) || e.name.startsWith(".codehub")) continue; + const abs = join(dir, e.name); + if (e.isDirectory()) { + queue.push(abs); + } else if (e.isFile() && opts.match.test(e.name)) { + // Normalize to POSIX separators: the graph keys files by `/`-joined + // repo-relative paths, and the scanner wrappers expect the same. On + // Windows `path.relative` yields backslashes, which would never match. + out.push(relative(repoPath, abs).split(sep).join("/")); + } + } + } + return out; +} + +/** Locate OpenAPI / Swagger / AsyncAPI / Arazzo contract files (paths only). */ +export function findOpenApiFiles(repoPath: string): Promise { + return walkForFiles(repoPath, { match: CONTRACT_RE, maxFiles: 64 }); +} + +/** Locate Dockerfile* files for hadolint. */ +export function findDockerfiles(repoPath: string): Promise { + return walkForFiles(repoPath, { match: DOCKERFILE_RE, maxFiles: 256 }); +} + +/** + * File-discovery-derived slice of the wrapper context: the spectral contract + * list and the hadolint Dockerfile list, populated only for specs actually + * selected (so an OpenAPI-less repo pays no walk). Both CLI and MCP build on + * this; callers layer surface-specific fields (checkov frameworks, python + * ignore dirs) on top. + */ +export async function buildScannerFileContext( + repoPath: string, + specs: readonly ScannerSpec[], +): Promise<{ + spectral?: { contractFiles: readonly string[] }; + hadolint?: { dockerfiles: readonly string[] }; +}> { + const ids = new Set(specs.map((s) => s.id)); + const ctx: { + spectral?: { contractFiles: readonly string[] }; + hadolint?: { dockerfiles: readonly string[] }; + } = {}; + if (ids.has(SPECTRAL_SPEC.id)) { + ctx.spectral = { contractFiles: await findOpenApiFiles(repoPath) }; + } + if (ids.has(HADOLINT_SPEC.id)) { + ctx.hadolint = { dockerfiles: await findDockerfiles(repoPath) }; + } + // checkov's `frameworks` come from the ProjectProfile, not a filesystem + // walk, so each surface supplies that field from its own profile read. + return ctx; +} diff --git a/packages/scanners/src/index.ts b/packages/scanners/src/index.ts index 58b9556b..520cc69e 100644 --- a/packages/scanners/src/index.ts +++ b/packages/scanners/src/index.ts @@ -54,6 +54,11 @@ export { npmAuditJsonToSarif } from "./converters/npm-audit-to-sarif.js"; export type { PipAuditConvertOptions } from "./converters/pip-audit-to-sarif.js"; export { pipAuditJsonToSarif } from "./converters/pip-audit-to-sarif.js"; export { runBinary, tryParseJson, which } from "./exec.js"; +export { + buildScannerFileContext, + findDockerfiles, + findOpenApiFiles, +} from "./file-context.js"; export type { RunScannersOptions, RunScannersResult,