From ffa7e7313fcf466b7467f9ff21331d37fc9e0c32 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 18:06:03 +0200 Subject: [PATCH 01/11] test(eval): add ContextBench harness core --- scripts/contextbench-retrieval-gate.mjs | 1249 ++++++ scripts/contextbench-runner.mjs | 3586 +++++++++++++++++ src/eval/contextbench-answer.ts | 229 ++ src/eval/contextbench-artifacts.ts | 184 + src/eval/contextbench-evidence-gate.ts | 422 ++ src/eval/contextbench-scoring.ts | 107 + src/eval/contextbench-trajectory.ts | 77 + src/eval/contextbench-types.ts | 434 ++ tests/contextbench-baseline-runner.test.ts | 1095 +++++ .../contextbench-baseline-schema-gate.test.ts | 944 +++++ tests/contextbench-baseline-snapshot.test.ts | 133 + tests/contextbench-lane-setup.test.ts | 156 + ...contextbench-phase42-evidence-gate.test.ts | 372 ++ tests/contextbench-runner-contract.test.ts | 321 ++ tests/contextbench-scoring.test.ts | 97 + tests/contextbench-trajectory.test.ts | 68 + ...tbench-codebase-context-baseline-arms.json | 49 + .../contextbench-lane-setup-evidence.json | 147 + .../contextbench-lane-tool-cards.json | 203 + 19 files changed, 9873 insertions(+) create mode 100644 scripts/contextbench-retrieval-gate.mjs create mode 100644 scripts/contextbench-runner.mjs create mode 100644 src/eval/contextbench-answer.ts create mode 100644 src/eval/contextbench-artifacts.ts create mode 100644 src/eval/contextbench-evidence-gate.ts create mode 100644 src/eval/contextbench-scoring.ts create mode 100644 src/eval/contextbench-trajectory.ts create mode 100644 src/eval/contextbench-types.ts create mode 100644 tests/contextbench-baseline-runner.test.ts create mode 100644 tests/contextbench-baseline-schema-gate.test.ts create mode 100644 tests/contextbench-baseline-snapshot.test.ts create mode 100644 tests/contextbench-lane-setup.test.ts create mode 100644 tests/contextbench-phase42-evidence-gate.test.ts create mode 100644 tests/contextbench-runner-contract.test.ts create mode 100644 
tests/contextbench-scoring.test.ts create mode 100644 tests/contextbench-trajectory.test.ts create mode 100644 tests/fixtures/contextbench-codebase-context-baseline-arms.json create mode 100644 tests/fixtures/contextbench-lane-setup-evidence.json create mode 100644 tests/fixtures/contextbench-lane-tool-cards.json diff --git a/scripts/contextbench-retrieval-gate.mjs b/scripts/contextbench-retrieval-gate.mjs new file mode 100644 index 0000000..d81b9e5 --- /dev/null +++ b/scripts/contextbench-retrieval-gate.mjs @@ -0,0 +1,1249 @@ +#!/usr/bin/env node +import { createHash } from 'node:crypto'; +import { spawn, spawnSync } from 'node:child_process'; +import { + appendFileSync, + existsSync, + mkdirSync, + readdirSync, + readFileSync, + statSync, + writeFileSync +} from 'node:fs'; +import { dirname, extname, isAbsolute, join, relative, resolve } from 'node:path'; + +const DEFAULT_PAYLOADS = + 'benchmark-runs/contextbench/phase40/task-payloads/contextbench-phase40-task-payloads.json'; +const DEFAULT_TASK_ID = 'Multi-SWE-Bench__c__maintenance__bugfix__5e659108'; +const DEFAULT_GOLD = + 'benchmark-runs/contextbench/phase40/scoring-inputs/Multi-SWE-Bench__c__maintenance__bugfix__5e659108-gold.json'; +const DEFAULT_LANES = ['raw-native', 'codebase-context']; +const TEXT_EXTENSIONS = new Set([ + '.c', + '.cc', + '.cpp', + '.cxx', + '.h', + '.hpp', + '.hh', + '.go', + '.java', + '.js', + '.jsx', + '.ts', + '.tsx', + '.py', + '.rb', + '.rs', + '.php', + '.swift', + '.kt', + '.scala', + '.cs', + '.m', + '.mm', + '.pony', + '.md', + '.txt', + '.json', + '.yaml', + '.yml', + '.toml', + '.xml', + '.html', + '.css', + '.scss', + '.sql', + '.sh', + '.bat', + '.ps1' +]); +const EXCLUDED_DIRS = new Set([ + '.git', + '.hg', + '.svn', + '.codebase-context', + 'node_modules', + 'vendor', + 'dist', + 'build', + 'target', + '__pycache__', + '.pytest_cache', + '.mypy_cache', + '.gradle', + '.idea', + '.vscode' +]); + +function help() { + console.log(`ContextBench retrieval-only diagnostic 
/**
 * Parse the retrieval-gate command line into an options object.
 *
 * Defaults: the Phase 40 payload/task/gold constants, the default lane list,
 * limit 6, window 40, repeat 1, index timeout 300000 ms, query timeout
 * 180000 ms, scoring disabled.
 *
 * Numeric flags are converted with `Number()` only; no range or integer
 * validation happens here, so callers must validate the returned values.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Parsed options (`out`, `taskPayloads`, `taskId`, `gold`,
 *   `lanes`, `limit`, `window`, `repeat`, `indexTimeoutMs`, `queryTimeoutMs`,
 *   `evaluatorCwd`, `score`, and `help` when requested).
 * @throws {Error} When an unrecognized argument is encountered.
 */
function parseArgs(argv) {
  const args = {
    taskPayloads: DEFAULT_PAYLOADS,
    taskId: DEFAULT_TASK_ID,
    gold: DEFAULT_GOLD,
    lanes: DEFAULT_LANES,
    limit: 6,
    window: 40,
    repeat: 1,
    indexTimeoutMs: 300_000,
    queryTimeoutMs: 180_000,
    score: false
  };
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg === '--help' || arg === '-h') args.help = true;
    else if (arg === '--out') args.out = argv[++i] ?? '';
    else if (arg === '--task-payloads') args.taskPayloads = argv[++i] ?? '';
    else if (arg === '--task-id') args.taskId = argv[++i] ?? '';
    else if (arg === '--gold') args.gold = argv[++i] ?? '';
    else if (arg === '--lanes')
      // Trim each entry so "a, b" does not produce the unknown lane " b".
      args.lanes = String(argv[++i] ?? '')
        .split(',')
        .map((lane) => lane.trim())
        .filter(Boolean);
    else if (arg === '--limit') args.limit = Number(argv[++i] ?? '6');
    else if (arg === '--window') args.window = Number(argv[++i] ?? '40');
    else if (arg === '--repeat') args.repeat = Number(argv[++i] ?? '1');
    else if (arg === '--index-timeout-ms') args.indexTimeoutMs = Number(argv[++i] ?? '300000');
    else if (arg === '--query-timeout-ms') args.queryTimeoutMs = Number(argv[++i] ?? '180000');
    else if (arg === '--evaluator-cwd') args.evaluatorCwd = argv[++i] ?? '';
    else if (arg === '--score') args.score = true;
    else throw new Error(`Unknown argument: ${arg}`);
  }
  return args;
}
/**
 * Serialize a value to JSON text with object keys in a fixed order, so that
 * structurally equal values always produce the same string.
 *
 * Keys are ordered by UTF-16 code unit (the default `Array.prototype.sort`
 * order) rather than by locale collation, which keeps the output identical
 * across machines. Members that `JSON.stringify` cannot represent
 * (`undefined`, functions, symbols) are dropped from objects and written as
 * `null` inside arrays, matching `JSON.stringify` semantics.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} JSON text, or `undefined` when the top-level
 *   value itself has no JSON representation.
 */
function stableStringify(value) {
  if (value === null || typeof value !== 'object') return JSON.stringify(value);
  if (Array.isArray(value)) {
    // Unrepresentable array slots become null so the result stays valid JSON.
    return `[${value.map((item) => stableStringify(item) ?? 'null').join(',')}]`;
  }
  const members = Object.keys(value)
    .sort()
    .flatMap((key) => {
      const encoded = stableStringify(value[key]);
      return encoded === undefined ? [] : [`${JSON.stringify(key)}:${encoded}`];
    });
  return `{${members.join(',')}}`;
}
/**
 * Validate the session output directory and create it.
 *
 * The directory must resolve to a path that contains
 * `/benchmark-runs/contextbench/phase40/` and must not contain an `outputs`
 * path segment. The check runs on the resolved absolute path with its leading
 * separator intact, so directories directly under the filesystem root are
 * matched the same way as nested ones.
 *
 * @param {string} outDir - Value of `--out`.
 * @returns {string} The resolved absolute directory (created if missing).
 * @throws {Error} When `outDir` is empty or violates either location rule.
 */
function ensurePhase40Out(outDir) {
  if (!outDir) throw new Error('--out is required');
  const resolved = resolve(outDir);
  // Only unify separators; stripping the leading "/" would hide root-level
  // segments from the "/segment/" substring checks below.
  const normalized = resolved.replace(/\\/g, '/');
  if (!normalized.includes('/benchmark-runs/contextbench/phase40/')) {
    throw new Error(
      'retrieval gate output must be under benchmark-runs/contextbench/phase40/'
    );
  }
  if (normalized.includes('/outputs/'))
    throw new Error('retrieval gate output must not be under outputs/');
  mkdirSync(resolved, { recursive: true });
  return resolved;
}
/**
 * Count non-overlapping occurrences of `token` in `text`, scanning left to
 * right and resuming after the end of each match.
 *
 * @param {string} text - Haystack to search.
 * @param {string} token - Needle; an empty token counts as zero matches.
 * @returns {number} Number of non-overlapping matches.
 */
function countOccurrences(text, token) {
  // An empty needle matches at every position without advancing the cursor,
  // which would otherwise never terminate.
  if (!token) return 0;
  let count = 0;
  let cursor = text.indexOf(token);
  while (cursor !== -1) {
    count += 1;
    cursor = text.indexOf(token, cursor + token.length);
  }
  return count;
}
/**
 * Sort line spans and coalesce ranges that overlap or touch.
 *
 * Spans are ordered by `start`, then by `end` (a `null` end sorts last). A
 * span is folded into the previous one only when both are bounded,
 * non-`full_file` ranges and it begins no later than one line after the
 * previous end. Whole-file or open-ended spans are kept as separate entries.
 *
 * The input array and its span objects are left untouched; the result is
 * built from copies, so callers can keep using the spans they passed in.
 *
 * @param {{start: number, end: number|null, full_file: boolean}[]} spans
 * @returns {{start: number, end: number|null, full_file: boolean}[]} New,
 *   sorted, merged span list.
 */
function mergeSpans(spans) {
  const ordered = spans
    .map((span) => ({ ...span }))
    .sort(
      (a, b) =>
        a.start - b.start ||
        (a.end ?? Number.MAX_SAFE_INTEGER) - (b.end ?? Number.MAX_SAFE_INTEGER)
    );
  const merged = [];
  for (const span of ordered) {
    const previous = merged[merged.length - 1];
    const mergeable =
      previous !== undefined &&
      !previous.full_file &&
      !span.full_file &&
      previous.end !== null &&
      span.end !== null &&
      span.start <= previous.end + 1;
    if (mergeable) {
      // `previous` is one of our own copies, so extending it is safe.
      previous.end = Math.max(previous.end, span.end);
    } else {
      merged.push(span);
    }
  }
  return merged;
}
span.start }, + reason: item.reason + })) + ), + filesReferenced: retrieval.items.map((item) => + normalizeRepoPath(task.repo_checkout_path, item.file) + ), + symbolsReferenced: [], + unsupportedClaims: ['retrieval_only_diagnostic_not_task_success'], + readyToEdit: false + }; +} + +function runRawNative(task, options) { + const tokens = tokenize(task.problem_statement); + const files = collectFiles(task.repo_checkout_path); + const scored = []; + const startedAt = Date.now(); + for (const filePath of files) { + let content; + try { + content = readFileSync(filePath, 'utf8'); + } catch { + continue; + } + const relativePath = normalizePath(relative(task.repo_checkout_path, filePath)); + const lowerPath = relativePath.toLowerCase(); + const lines = content.split(/\r?\n/); + let score = 0; + for (const token of tokens) { + score += countOccurrences(lowerPath, token) * 8; + } + let bestLine = 1; + let bestLineScore = 0; + for (let index = 0; index < lines.length; index += 1) { + const lowerLine = lines[index].toLowerCase(); + let lineScore = 0; + for (const token of tokens) lineScore += countOccurrences(lowerLine, token); + if (lineScore > bestLineScore) { + bestLineScore = lineScore; + bestLine = index + 1; + } + score += lineScore; + } + if (score > 0) { + scored.push({ + file: relativePath, + score, + bestLine, + totalLines: lines.length, + reason: `lexical token match: ${tokens.join(', ')}` + }); + } + } + scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file)); + const items = scored.slice(0, options.limit).map((item) => ({ + file: item.file, + score: item.score, + spans: [spanAround(item.bestLine, item.totalLines, options.window)], + reason: item.reason + })); + return { + laneId: 'raw-native', + method: 'deterministic lexical repository scan over problem statement tokens', + status: items.length > 0 ? 
/**
 * Run a command synchronously with empty stdin and capture its outcome.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} options
 * @param {string} [options.cwd] - Working directory for the child.
 * @param {object} [options.env] - Environment for the child.
 * @param {number} [options.timeoutMs=120000] - Kill the child after this long.
 * @param {number} [options.maxBuffer=67108864] - Maximum bytes captured per
 *   output stream before the child is terminated.
 * @returns {{command: string, args: string[], cwd: (string|undefined),
 *   status: (number|null), signal: (string|null), error: (string|null),
 *   stdout: string, stderr: string, durationMs: number}} `status` is `null`
 *   when the child did not exit with a numeric code; `error` carries the spawn
 *   error message, if any.
 */
function runCommand(command, args, options) {
  const startedAt = Date.now();
  const result = spawnSync(command, args, {
    cwd: options.cwd,
    env: options.env,
    encoding: 'utf8',
    input: '',
    timeout: options.timeoutMs ?? 120_000,
    // spawnSync's default cap is 1 MiB per stream; JSON emitted by indexing
    // and search tools can exceed that, which would terminate the child and
    // surface as a spurious failure.
    maxBuffer: options.maxBuffer ?? 64 * 1024 * 1024
  });
  return {
    command,
    args,
    cwd: options.cwd,
    status: typeof result.status === 'number' ? result.status : null,
    signal: result.signal ?? null,
    error: result.error?.message ?? null,
    stdout: result.stdout ?? '',
    stderr: result.stderr ?? '',
    durationMs: Date.now() - startedAt
  };
}
/**
 * Start `python -m jcodemunch_mcp.server serve --transport stdio`, perform the
 * MCP initialize handshake, issue one `tools/call` request per entry in
 * `calls`, and resolve once every call has a response, the child exits, the
 * child fails to start, or the timeout elapses.
 *
 * Request ids: `initialize` uses id 1 and call `i` uses id `i + 2`. Responses
 * are newline-delimited JSON read from the child's stdout.
 *
 * The returned promise always resolves (never rejects) for runtime failures;
 * inspect `status` and `error` on the result.
 *
 * @param {{name: string, arguments: object}[]} calls - Tool calls to issue.
 * @param {number} timeoutMs - Overall deadline for the whole exchange.
 * @returns {Promise<{command: string, args: string[], cwd: string,
 *   status: (number|null), error: (string|null), stdout: string,
 *   stderr: string, durationMs: number, messages: Array<object|null>}>}
 *   `messages[i]` is the raw JSON-RPC response for `calls[i]`, or `null` when
 *   none arrived.
 */
function runJCodeMunchMcpCalls(calls, timeoutMs) {
  // The executor parameter is named `settle` so it does not shadow the
  // `resolve` imported from node:path.
  return new Promise((settle) => {
    const startedAt = Date.now();
    const command = 'python';
    const args = [
      '-m',
      'jcodemunch_mcp.server',
      'serve',
      '--transport',
      'stdio',
      '--log-level',
      'ERROR'
    ];
    const child = spawn(command, args, {
      cwd: process.cwd(),
      env: { ...process.env, JCODEMUNCH_USE_AI_SUMMARIES: 'false' },
      stdio: ['pipe', 'pipe', 'pipe']
    });
    const messages = new Map();
    let stdout = '';
    let stderr = '';
    let lineBuffer = '';
    let settled = false;
    let timer = null;
    const finish = (status, error = null) => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      if (!child.killed) child.kill();
      settle({
        command,
        args,
        cwd: process.cwd(),
        status,
        error,
        stdout,
        stderr,
        durationMs: Date.now() - startedAt,
        messages: calls.map((call, index) => messages.get(index + 2) ?? null)
      });
    };
    const send = (message) => {
      // Skip writes once finished or once the pipe is gone.
      if (settled || !child.stdin?.writable) return;
      child.stdin.write(`${JSON.stringify(message)}\n`);
    };
    const maybeComplete = () => {
      if (calls.every((_, index) => messages.has(index + 2))) finish(0);
    };
    const handleMessage = (message) => {
      if (message.id === 1) {
        // Handshake acknowledged: confirm initialization, then send all calls.
        send({ jsonrpc: '2.0', method: 'notifications/initialized', params: {} });
        calls.forEach((call, index) => {
          send({
            jsonrpc: '2.0',
            id: index + 2,
            method: 'tools/call',
            params: { name: call.name, arguments: call.arguments }
          });
        });
        return;
      }
      if (typeof message.id === 'number' && message.id >= 2) {
        messages.set(message.id, message);
        maybeComplete();
      }
    };
    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted.
    child.stdout?.setEncoding('utf8');
    child.stderr?.setEncoding('utf8');
    // A child that exits early makes stdin writes fail (e.g. EPIPE). Without a
    // listener that error is thrown as an uncaught exception; record it and
    // let the close/error/timeout handlers decide the outcome.
    child.stdin?.on('error', (error) => {
      stderr += `\nfailed to write MCP stdin: ${error.message}`;
    });
    child.stdout?.on('data', (chunk) => {
      const text = chunk.toString();
      stdout += text;
      lineBuffer += text;
      const lines = lineBuffer.split(/\r?\n/);
      // The last segment may be an incomplete line; keep it for the next chunk.
      lineBuffer = lines.pop() ?? '';
      for (const line of lines) {
        if (!line.trim()) continue;
        try {
          handleMessage(JSON.parse(line));
        } catch (error) {
          stderr += `\nfailed to parse MCP stdout line: ${error instanceof Error ? error.message : String(error)}`;
        }
      }
    });
    child.stderr?.on('data', (chunk) => {
      stderr += chunk.toString();
    });
    child.on('error', (error) => finish(null, error.message));
    child.on('close', (status) => {
      if (!settled) finish(status);
    });
    timer = setTimeout(
      () => finish(null, `jCodeMunch MCP timed out after ${timeoutMs}ms`),
      timeoutMs
    );
    send({
      jsonrpc: '2.0',
      id: 1,
      method: 'initialize',
      params: {
        protocolVersion: '2024-11-05',
        capabilities: {},
        clientInfo: { name: 'contextbench-retrieval-gate', version: '0.0.0' }
      }
    });
  });
}
/**
 * Clamp a bounded line span so it does not extend past the end of a file.
 *
 * The file's line count is taken as the number of segments produced by
 * splitting its UTF-8 text on LF/CRLF. Both `end` and `start` are limited to
 * that count, so a span reported entirely beyond the end of the file collapses
 * onto the last line instead of coming back with `start > end`.
 *
 * Whole-file spans, open-ended spans (`end === null`), and spans for files
 * that cannot be read are returned unchanged.
 *
 * @param {string} repoRoot - Repository checkout directory.
 * @param {string} file - Path of the file relative to `repoRoot`.
 * @param {{start: number, end: number|null, full_file: boolean}} span
 * @returns {{start: number, end: number|null, full_file: boolean}} The input
 *   span itself when no clamping applies, otherwise a clamped copy.
 */
function capSpanToFile(repoRoot, file, span) {
  if (span.full_file || span.end === null) return span;
  try {
    const lineCount = readFileSync(join(repoRoot, file), 'utf8').split(/\r?\n/).length;
    return {
      ...span,
      start: Math.min(lineCount, span.start),
      end: Math.min(lineCount, span.end)
    };
  } catch {
    // Best effort: an unreadable or missing file leaves the span as reported.
    return span;
  }
}
(reindex.status !== 0) { + return { + laneId: 'codebase-context', + method: 'codebase-context CLI reindex/search JSON output', + status: 'index_failed', + setup: { + setupCommand: base.source, + indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`, + setupStatus: 'completed', + indexStatus: 'index_failed', + setupDurationMs: 0, + indexDurationMs: reindex.durationMs + }, + trace: { commandSource: base.source, reindex, search: null, parseError: null }, + items: [] + }; + } + const searchArgs = [ + ...base.prefixArgs, + 'search', + '--query', + task.problem_statement, + '--intent', + 'explore', + '--limit', + String(options.limit), + '--json' + ]; + const search = runCommand(base.command, searchArgs, { + cwd: process.cwd(), + env, + timeoutMs: options.queryTimeoutMs + }); + const parsed = parseJsonOutput(search); + if (search.status !== 0 || parsed.error) { + return { + laneId: 'codebase-context', + method: 'codebase-context CLI reindex/search JSON output', + status: search.status === 0 ? 'invalid_schema' : 'tool_error', + setup: { + setupCommand: base.source, + indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 0, + indexDurationMs: reindex.durationMs + }, + trace: { commandSource: base.source, reindex, search, parseError: parsed.error }, + items: [] + }; + } + const rawResults = Array.isArray(parsed.value?.results) ? parsed.value.results : []; + const itemsByFile = new Map(); + for (const result of rawResults.slice(0, options.limit)) { + const parsedFile = parseSearchFile(result.file); + if (!parsedFile.file) continue; + const existing = itemsByFile.get(parsedFile.file) ?? { + file: parsedFile.file, + score: Number(result.score ?? 0), + spans: [], + reason: result.relevanceReason || result.summary || 'codebase-context search result' + }; + existing.score = Math.max(existing.score, Number(result.score ?? 
0)); + existing.spans.push( + capSpanToFile( + task.repo_checkout_path, + parsedFile.file, + expandSpan(parsedFile.span, options.window) + ) + ); + itemsByFile.set(parsedFile.file, existing); + } + const items = [...itemsByFile.values()].sort( + (a, b) => b.score - a.score || a.file.localeCompare(b.file) + ); + return { + laneId: 'codebase-context', + method: 'codebase-context CLI reindex/search JSON output', + status: items.length > 0 ? 'completed' : 'no_answer', + setup: { + setupCommand: base.source, + indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 0, + indexDurationMs: reindex.durationMs + }, + trace: { + commandSource: base.source, + reindex, + search, + parseError: null, + searchQuality: parsed.value?.searchQuality ?? null, + totalResults: parsed.value?.totalResults ?? rawResults.length, + rawResultFiles: rawResults.map((result) => result.file) + }, + items + }; +} + +function parseJCodeMunchToolJson(message) { + const text = message?.result?.content?.find?.((part) => part?.type === 'text')?.text; + if (!text) return { value: null, error: 'missing_text_content' }; + try { + return { value: JSON.parse(text), error: null }; + } catch (error) { + return { value: null, error: error instanceof Error ? 
error.message : String(error) }; + } +} + +async function runJCodeMunch(task, options) { + const indexCall = { + name: 'index_folder', + arguments: { + path: task.repo_checkout_path, + use_ai_summaries: false, + incremental: true, + follow_symlinks: false, + extra_ignore_patterns: ['.codebase-context/**'] + } + }; + const index = await runJCodeMunchMcpCalls([indexCall], options.indexTimeoutMs); + const indexMessage = index.messages[0]; + const indexParsed = parseJCodeMunchToolJson(indexMessage); + const repo = indexParsed.value?.repo; + if (index.status !== 0 || index.error || !repo) { + return { + laneId: 'jcodemunch-repomapper', + method: 'jCodeMunch MCP index_folder plus search_symbols over deterministic problem tokens', + status: 'index_failed', + setup: { + setupCommand: 'python -m jcodemunch_mcp.server --version', + indexCommand: 'MCP index_folder', + setupStatus: 'completed', + indexStatus: 'index_failed', + setupDurationMs: 0, + indexDurationMs: index.durationMs + }, + trace: { index, indexParseError: indexParsed.error, repo: repo ?? null, searches: [] }, + items: [] + }; + } + + const searchCalls = codeGraphContextQueries(task.problem_statement).map((query) => ({ + name: 'search_symbols', + arguments: { + repo, + query, + max_results: options.limit, + detail_level: 'compact', + semantic: false + } + })); + const search = await runJCodeMunchMcpCalls(searchCalls, options.queryTimeoutMs); + const itemsByFile = new Map(); + const searches = search.messages.map((message, index) => { + const parsed = parseJCodeMunchToolJson(message); + const query = searchCalls[index]?.arguments?.query ?? ''; + const results = Array.isArray(parsed.value?.results) ? parsed.value.results : []; + for (const [resultIndex, result] of results.entries()) { + const file = normalizeRepoPath(task.repo_checkout_path, String(result.file ?? '')); + const line = Number(result.line ?? 
1); + if (!file || file.startsWith('..') || !Number.isFinite(line)) continue; + const existing = itemsByFile.get(file) ?? { + file, + score: 0, + spans: [], + reason: `jCodeMunch search_symbols match for problem-derived query "${query}"` + }; + existing.score += 1 / (index + 1 + resultIndex / 100); + existing.spans.push(spanAround(line, Number.MAX_SAFE_INTEGER, options.window)); + itemsByFile.set(file, existing); + } + return { query, message, parsed, resultCount: results.length }; + }); + const items = [...itemsByFile.values()] + .map((item) => ({ + ...item, + spans: item.spans.map((span) => capSpanToFile(task.repo_checkout_path, item.file, span)) + })) + .sort((a, b) => b.score - a.score || a.file.localeCompare(b.file)) + .slice(0, options.limit); + return { + laneId: 'jcodemunch-repomapper', + method: 'jCodeMunch MCP index_folder plus search_symbols over deterministic problem tokens', + status: items.length > 0 ? 'completed' : 'no_answer', + setup: { + setupCommand: 'python -m jcodemunch_mcp.server --version', + indexCommand: 'MCP index_folder', + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 0, + indexDurationMs: index.durationMs + }, + trace: { + repo, + index, + indexSummary: indexParsed.value, + search, + queryCount: searches.length, + searches, + rawResultFiles: items.map((item) => item.file) + }, + items + }; +} + +function parseCodeGraphContextTable(stdout) { + const rows = []; + let current = null; + for (const line of String(stdout ?? '').split(/\r?\n/)) { + if (!line.includes('│')) continue; + const parts = line.split('│').slice(1, -1); + if (parts.length < 3) continue; + const name = parts[0].trim(); + const type = parts[1].trim(); + const locationPart = parts[2].trim(); + if (name === 'Name' || type === 'Type' || locationPart === 'Location') continue; + if (name) { + if (current) rows.push(current); + current = { name, type, locationParts: locationPart ? 
[locationPart] : [] }; + } else if (current && locationPart) { + current.locationParts.push(locationPart); + } + } + if (current) rows.push(current); + return rows + .map((row) => { + const location = row.locationParts.join(''); + const match = location.match(/^(.*):(\d+)$/); + if (!match) return null; + return { + name: row.name, + type: row.type, + file: normalizePath(match[1]), + line: Number(match[2]) + }; + }) + .filter(Boolean); +} + +function codeGraphContextQueries(problemStatement) { + const tokens = tokenize(problemStatement).filter((token) => !token.startsWith('#')); + const rankedTokens = [...tokens].sort((a, b) => b.length - a.length || a.localeCompare(b)); + return [problemStatement.replace(/\s+/g, ' ').trim(), ...rankedTokens].filter(Boolean); +} + +function runCodeGraphContext(task, options) { + const env = { + ...process.env, + PYTHONUTF8: '1', + PYTHONIOENCODING: 'utf-8' + }; + const index = runCommand('python', ['-m', 'codegraphcontext', 'index', task.repo_checkout_path], { + cwd: process.cwd(), + env, + timeoutMs: options.indexTimeoutMs + }); + if (index.status !== 0) { + return { + laneId: 'codegraphcontext', + method: 'CodeGraphContext CLI index plus find content over deterministic problem tokens', + status: 'index_failed', + setup: { + setupCommand: 'python -m codegraphcontext --version', + indexCommand: `python -m codegraphcontext index ${task.repo_checkout_path}`, + setupStatus: 'completed', + indexStatus: 'index_failed', + setupDurationMs: 0, + indexDurationMs: index.durationMs + }, + trace: { index, queries: [], parseError: null }, + items: [] + }; + } + + const queries = []; + const itemsByFile = new Map(); + for (const query of codeGraphContextQueries(task.problem_statement)) { + if (itemsByFile.size >= options.limit) break; + const queryResult = runCommand('python', ['-m', 'codegraphcontext', 'find', 'content', query], { + cwd: process.cwd(), + env, + timeoutMs: options.queryTimeoutMs + }); + const parsedRows = + queryResult.status === 
0 + ? parseCodeGraphContextTable(`${queryResult.stdout}\n${queryResult.stderr}`) + : []; + queries.push({ query, result: queryResult, parsedRows }); + for (const [indexInResult, row] of parsedRows.entries()) { + const file = normalizeRepoPath(task.repo_checkout_path, row.file); + if (!file || file.startsWith('..')) continue; + const existing = itemsByFile.get(file) ?? { + file, + score: 0, + spans: [], + reason: `CodeGraphContext content match for problem-derived query "${query}"` + }; + existing.score += 1 / (queries.length + indexInResult / 100); + existing.spans.push(spanAround(row.line, Number.MAX_SAFE_INTEGER, options.window)); + itemsByFile.set(file, existing); + if (itemsByFile.size >= options.limit) break; + } + } + + const items = [...itemsByFile.values()] + .map((item) => ({ + ...item, + spans: item.spans.map((span) => capSpanToFile(task.repo_checkout_path, item.file, span)) + })) + .sort((a, b) => b.score - a.score || a.file.localeCompare(b.file)) + .slice(0, options.limit); + return { + laneId: 'codegraphcontext', + method: 'CodeGraphContext CLI index plus find content over deterministic problem tokens', + status: items.length > 0 ? 
'completed' : 'no_answer', + setup: { + setupCommand: 'python -m codegraphcontext --version', + indexCommand: `python -m codegraphcontext index ${task.repo_checkout_path}`, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 0, + indexDurationMs: index.durationMs + }, + trace: { + index, + queryCount: queries.length, + queries, + rawResultFiles: items.map((item) => item.file) + }, + items + }; +} + +async function runLane(lane, task, options) { + if (lane === 'raw-native') return runRawNative(task, options); + if (lane === 'codebase-context') return runCodebaseContext(task, options); + if (lane === 'jcodemunch-repomapper') return runJCodeMunch(task, options); + if (lane === 'codegraphcontext') return runCodeGraphContext(task, options); + throw new Error(`unsupported retrieval lane: ${lane}`); +} + +function hasOfficialEvaluator(cwd) { + return existsSync(join(cwd, 'contextbench', 'evaluate.py')); +} + +function resolveEvaluatorCwd(args) { + if (args.evaluatorCwd) { + const resolved = resolve(args.evaluatorCwd); + if (!hasOfficialEvaluator(resolved)) { + throw new Error(`--evaluator-cwd does not contain contextbench/evaluate.py: ${resolved}`); + } + return resolved; + } + const moduleCheck = runCommand('python', ['-m', 'contextbench.evaluate', '--help'], { + cwd: process.cwd(), + env: process.env, + timeoutMs: 30_000 + }); + if (moduleCheck.status === 0) return process.cwd(); + const candidates = [ + 'benchmark-runs/contextbench/phase40/evaluator-probe-20260427/ContextBench-official' + ]; + for (const candidate of candidates) { + const resolved = resolve(candidate); + if (hasOfficialEvaluator(resolved)) return resolved; + } + throw new Error( + 'official evaluator unavailable; pass --evaluator-cwd or run the evaluator probe first' + ); +} + +function scoreTrajectory(goldPath, trajectoryPath, outputPath, evaluatorCwd, cachePath) { + const args = [ + '-m', + 'contextbench.evaluate', + '--gold', + goldPath, + '--pred', + trajectoryPath, + 
/**
 * Computes the artifact paths for one run under `<sessionRoot>/runs/<runId>`.
 *
 * @param {string} sessionRoot - Session output directory.
 * @param {string} runId - Run identifier used as the directory name.
 * @returns {object} The run directory plus one path per artifact file.
 */
function buildRunPaths(sessionRoot, runId) {
  const runDir = join(sessionRoot, 'runs', runId);
  const inRunDir = (fileName) => join(runDir, fileName);
  return {
    runDir,
    prompt: inRunDir('retrieval-query.txt'),
    setupIndex: inRunDir('setup-index.json'),
    rawTrace: inRunDir('raw-trace.json'),
    structuredAnswer: inRunDir('structured-answer.json'),
    trajectory: inRunDir('trajectory.json'),
    score: inRunDir('score.json'),
    officialResults: inRunDir('official-results.jsonl')
  };
}

/**
 * Appends one row, serialized as a single JSON line, to
 * `<sessionRoot>/run-manifest.jsonl`.
 *
 * @param {string} sessionRoot - Session output directory.
 * @param {object} row - Manifest row to serialize.
 */
function appendManifest(sessionRoot, row) {
  const manifestPath = join(sessionRoot, 'run-manifest.jsonl');
  const serializedRow = `${JSON.stringify(row)}\n`;
  appendFileSync(manifestPath, serializedRow, 'utf8');
}
/**
 * Writes `RETRIEVAL-GATE-SCRATCHPAD.json` into the session root, recording the
 * task identity, the CLI arguments in effect, and the fixed claim limits of
 * this retrieval-only diagnostic.
 *
 * @param {string} sessionRoot - Session output directory.
 * @param {object} task - Task payload (identity fields are copied out).
 * @param {object} args - Parsed CLI arguments.
 */
function writeSessionScratchpad(sessionRoot, task, args) {
  const scratchpadPath = join(sessionRoot, 'RETRIEVAL-GATE-SCRATCHPAD.json');
  const claimLimits = [
    'No agent patch was produced.',
    'No tests were run in the target repository.',
    'Official evaluator scores measure retrieved context overlap only.',
    'Scorer-only gold is used after trajectories are materialized, never during retrieval.'
  ];
  const scratchpad = {
    createdAt: new Date().toISOString(),
    claimBearing: false,
    evidenceType: 'retrieval_only_diagnostic',
    claimLimits,
    task: {
      instance_id: task.instance_id,
      repo_url: task.repo_url,
      base_commit: task.base_commit,
      repo_checkout_path: task.repo_checkout_path,
      problem_statement_hash: task.problem_statement_hash
    },
    args: {
      lanes: args.lanes,
      limit: args.limit,
      window: args.window,
      repeat: args.repeat,
      score: args.score,
      // The gold path is only recorded when scoring was requested.
      gold: args.score ? resolve(args.gold) : null
    }
  };
  writeJson(scratchpadPath, scratchpad);
}

/**
 * CLI entry point for the retrieval gate.
 *
 * Validates numeric flags, loads the task, then for every requested lane:
 * runs retrieval, writes the prompt/setup/trace/answer/trajectory artifacts,
 * optionally scores the trajectory with the official evaluator, and appends a
 * manifest row. Finishes by writing `RETRIEVAL-GATE-SUMMARY.json`.
 *
 * @returns {Promise<void>}
 * @throws {Error} On invalid flags, a missing gold file when scoring, or a
 *   run directory that already exists.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help) {
    help();
    return;
  }

  // Checked in this order so the first offending flag is the one reported.
  const positiveIntegerFlags = [
    ['limit', '--limit'],
    ['window', '--window'],
    ['repeat', '--repeat'],
    ['indexTimeoutMs', '--index-timeout-ms'],
    ['queryTimeoutMs', '--query-timeout-ms']
  ];
  for (const [argName, flag] of positiveIntegerFlags) {
    const flagValue = args[argName];
    if (!Number.isInteger(flagValue) || flagValue < 1) {
      throw new Error(`${flag} must be a positive integer`);
    }
  }

  const sessionRoot = ensurePhase40Out(args.out);
  const task = loadTask(args.taskPayloads, args.taskId);
  const goldPath = resolve(args.gold);
  if (args.score && !existsSync(goldPath)) throw new Error(`gold file missing: ${goldPath}`);
  let evaluatorCwd = null;
  if (args.score) evaluatorCwd = resolveEvaluatorCwd(args);
  writeSessionScratchpad(sessionRoot, task, args);

  const summaryPath = join(sessionRoot, 'RETRIEVAL-GATE-SUMMARY.json');
  const rows = [];
  for (const lane of args.lanes) {
    const runId = sanitize(`${lane}-${task.instance_id}-${args.repeat}-retrieval`);
    const paths = buildRunPaths(sessionRoot, runId);
    if (existsSync(paths.runDir)) {
      throw new Error(`run directory already exists; refusing overwrite: ${paths.runDir}`);
    }

    const startedAt = new Date().toISOString();
    const retrieval = await runLane(lane, task, {
      limit: args.limit,
      window: args.window,
      indexTimeoutMs: args.indexTimeoutMs,
      queryTimeoutMs: args.queryTimeoutMs
    });
    const trajectory = buildTrajectory(task, retrieval);
    const answer = buildStructuredAnswer(task, retrieval);

    writeText(paths.prompt, task.problem_statement);
    writeJson(paths.setupIndex, retrieval.setup);
    const rawTrace = {
      laneId: lane,
      claimBearing: false,
      retrievalOnly: true,
      notAgentTaskSuccess: true,
      workingDirectory: task.repo_checkout_path,
      task: {
        instance_id: task.instance_id,
        repo_url: task.repo_url,
        base_commit: task.base_commit,
        problem_statement_hash: task.problem_statement_hash
      },
      method: retrieval.method,
      status: retrieval.status,
      trace: retrieval.trace,
      retrievedItems: retrieval.items,
      scriptedAgentDecisions: true,
      scorerGoldReadDuringRetrieval: false
    };
    writeJson(paths.rawTrace, rawTrace);
    writeJson(paths.structuredAnswer, answer);
    writeJson(paths.trajectory, trajectory);

    let score;
    if (args.score) {
      score = scoreTrajectory(
        goldPath,
        paths.trajectory,
        paths.officialResults,
        evaluatorCwd,
        join(sessionRoot, 'score-cache')
      );
    } else {
      score = {
        status: 'not_scored',
        mode: 'not_requested',
        claimBearing: false,
        retrievalOnly: true,
        fallbackReason: 'run_without_score_flag'
      };
    }
    writeJson(paths.score, score);

    const completedAt = new Date().toISOString();
    // A successful score defers to the retrieval status; otherwise the
    // scoring status is what the manifest row reports.
    const rowStatus = score.status === 'completed' ? retrieval.status : score.status;
    const row = {
      run_id: runId,
      lane_id: lane,
      task_id: task.instance_id,
      repeat_index: args.repeat,
      status: rowStatus,
      started_at: startedAt,
      completed_at: completedAt,
      raw_trace_path: paths.rawTrace,
      structured_answer_path: paths.structuredAnswer,
      trajectory_path: paths.trajectory,
      score_path: paths.score,
      setup_index_path: paths.setupIndex,
      prompt_path: paths.prompt,
      setupIndex: retrieval.setup,
      taskExecution: {
        executor: 'retrieval-script',
        retrievalOnly: true,
        taskWallTimeMs: Date.parse(completedAt) - Date.parse(startedAt)
      },
      scoring: {
        officialEvaluatorFirst: Boolean(args.score),
        claimBearing: false,
        retrievalOnly: true,
        officialResultsPath: args.score ? paths.officialResults : null
      },
      hashes: {
        prompt: sha256Text(task.problem_statement),
        rawTrace: artifactHashIfPresent(paths.rawTrace),
        structuredAnswer: artifactHashIfPresent(paths.structuredAnswer),
        trajectory: artifactHashIfPresent(paths.trajectory),
        score: artifactHashIfPresent(paths.score),
        officialResults: artifactHashIfPresent(paths.officialResults)
      }
    };
    appendManifest(sessionRoot, row);
    rows.push(row);
  }

  writeJson(summaryPath, {
    completedAt: new Date().toISOString(),
    claimBearing: false,
    retrievalOnly: true,
    taskId: task.instance_id,
    rows
  });
  console.log(`retrieval gate wrote ${summaryPath}`);
}
error.message : String(error)); + process.exit(1); +} diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs new file mode 100644 index 0000000..11285f4 --- /dev/null +++ b/scripts/contextbench-runner.mjs @@ -0,0 +1,3586 @@ +#!/usr/bin/env node +import { createHash } from 'node:crypto'; +import { execFileSync, spawnSync } from 'node:child_process'; +import { + appendFileSync, + existsSync, + mkdirSync, + readdirSync, + readFileSync, + statSync, + writeFileSync +} from 'node:fs'; +import { dirname, isAbsolute, join, relative, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import os from 'node:os'; + +const RUNNER_SOURCE_PATH = fileURLToPath(import.meta.url); + +const FIXTURES = { + protocol: 'tests/fixtures/contextbench-benchmark-protocol.json', + lanes: 'tests/fixtures/contextbench-lanes.json', + corrections: 'tests/fixtures/contextbench-corrections.json', + manifest: 'tests/fixtures/contextbench-task-manifest.json', + laneToolCards: 'tests/fixtures/contextbench-lane-tool-cards.json', + laneSetupEvidence: 'tests/fixtures/contextbench-lane-setup-evidence.json', + codebaseContextBaselineArms: 'tests/fixtures/contextbench-codebase-context-baseline-arms.json' +}; + +const TERMINAL_LANE_SETUP_STATUSES = new Set([ + 'ready_for_phase40', + 'setup_failed', + 'index_failed', + 'tool_error', + 'invasive_setup_blocked' +]); + +const CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS = [ + 'answer', + 'confidence', + 'evidence', + 'filesReferenced', + 'symbolsReferenced', + 'unsupportedClaims', + 'readyToEdit' +]; + +const CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA = { + type: 'object', + additionalProperties: false, + required: CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS, + properties: { + answer: { type: ['object', 'array', 'string', 'number', 'boolean', 'null'] }, + confidence: { type: 'string', enum: ['low', 'medium', 'high'] }, + evidence: { + type: 'array', + items: { + type: 'object', + additionalProperties: false, + 
const EVIDENCE_REFERENCE_FIELDS = ['file', 'lineRange', 'reason'];
const LINE_RANGE_FIELDS = ['start', 'end'];

/**
 * Builds the scoring record used when the official evaluator was not run and
 * a diagnostic fallback is reported instead.
 *
 * @param {object} fixtures - Loaded fixtures; `protocol.benchmarkTarget.officialEvaluatorCommand` is read.
 * @param {string} fallbackReason - Why the fallback was taken.
 * @param {object} [extra={}] - Extra fields merged last, so they override the defaults.
 * @returns {object} Non-claim-bearing scoring metadata.
 */
function diagnosticFallbackScoring(fixtures, fallbackReason, extra = {}) {
  return {
    officialEvaluatorFirst: false,
    officialEvaluatorAttempted: false,
    officialEvaluatorInvoked: false,
    command: fixtures.protocol.benchmarkTarget.officialEvaluatorCommand,
    claimBearing: false,
    fallbackReason,
    ...extra
  };
}

/**
 * Resolves the executable and leading arguments for the official evaluator.
 *
 * Defaults to `python -m contextbench.evaluate`. When the environment variable
 * `CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND` is set to a non-empty value, it
 * must hold a JSON array of strings whose first element is the executable.
 *
 * @returns {{ command: string, prefixArgs: string[] }}
 * @throws {Error} When the override is not valid JSON (the parse error is
 *   attached as `cause`), is not a non-empty array of strings, or names a
 *   blank executable.
 */
function officialEvaluatorCommandParts() {
  const override = process.env.CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND;
  if (!override) return { command: 'python', prefixArgs: ['-m', 'contextbench.evaluate'] };

  let parts;
  try {
    parts = JSON.parse(override);
  } catch (error) {
    // Keep the parser's diagnostic reachable instead of discarding it.
    throw new Error('CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND must be a JSON array', {
      cause: error
    });
  }

  const isStringArray = Array.isArray(parts) && parts.every((part) => typeof part === 'string');
  // A blank executable can never be spawned, so reject it here with a clear
  // configuration error rather than failing later at spawn time.
  if (!isStringArray || parts.length === 0 || parts[0].trim() === '') {
    throw new Error('CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND must be a non-empty JSON string array');
  }

  const [command, ...prefixArgs] = parts;
  return { command, prefixArgs };
}

const BLOCKED_LANE_SETUP_STATUSES = new Set([
  'setup_failed',
  'index_failed',
  'tool_error',
  'invasive_setup_blocked'
]);
+ node scripts/contextbench-runner.mjs --validate-lane-setup + node scripts/contextbench-runner.mjs --baseline-snapshot --out benchmark-runs/contextbench/phase40/ + node scripts/contextbench-runner.mjs --baseline-snapshot --out benchmark-runs/contextbench/phase41/ + node scripts/contextbench-runner.mjs --baseline-run --session benchmark-runs/contextbench/phase40/ --executor fake --lane --task-id --repeat + node scripts/contextbench-runner.mjs --baseline-run --session benchmark-runs/contextbench/phase41/ --executor claude --task-payloads --lane --task-id --repeat + node scripts/contextbench-runner.mjs --baseline-refresh --session benchmark-runs/contextbench/phase41/ + node scripts/contextbench-runner.mjs --baseline-validate --session benchmark-runs/contextbench/phase41/ + node scripts/contextbench-runner.mjs --baseline-seal --session benchmark-runs/contextbench/phase41/ + node scripts/contextbench-runner.mjs --phase42-verify --session benchmark-runs/contextbench/phase41/ [--out report.json] [--quiet] + node scripts/contextbench-runner.mjs --setup-index-measure --session benchmark-runs/contextbench/phase41/ --lane raw-native + node scripts/contextbench-runner.mjs --setup-index-import --session benchmark-runs/contextbench/phase41/ --lane --input setup-index.json + node scripts/contextbench-runner.mjs --baseline-validate-arms tests/fixtures/contextbench-codebase-context-baseline-arms.json + node scripts/contextbench-runner.mjs --print-claude-args --model haiku + node scripts/contextbench-runner.mjs --print-answer-schema + node scripts/contextbench-runner.mjs --dry-run --executor fake --lane --task-id --repeat --out + node scripts/contextbench-runner.mjs --score-probe --out + +Modes: + --validate-fixtures Validate frozen protocol, task manifest, lane governance, and lane tool cards. + --validate-lane-setup Validate Phase 39 setup/index readiness or terminal blocker evidence only. + --baseline-snapshot Capture dirty-worktree state before any Phase 40 baseline attempt. 
+ --baseline-run Write a baseline attempt row and artifacts. Fake executor is test-only; live executors require task payloads and materialized checkouts. + --baseline-refresh Re-hash an interrupted Phase 40/41 session without running live agents. + --baseline-validate Validate a Phase 40/41 session root, hashes, reservations, rows, and artifact paths. + --baseline-seal Seal only after terminal evidence and the Phase 42 evidence gate both pass. + --phase42-verify Read-only Phase 42 evidence gate over a Phase 40/41 session; exits non-zero unless claim-pass. + --quiet With --phase42-verify, write only the concise pass/fail line to stdout. + --setup-index-measure Capture safe setup/index measurement artifacts before task execution. + --setup-index-import Import pre-captured setup/index evidence without running setup commands. + --baseline-validate-arms Validate diagnostic codebase-context baseline arm metadata. + --print-claude-args Print the Claude CLI args used for schema-gated live attempts. + --print-answer-schema Print the structured answer JSON Schema used by live attempts. + --dry-run Write non-claim-bearing fake-executor smoke artifacts and one append-only manifest row. + --score-probe Write a synthetic non-claim-bearing diagnostic fallback artifact without live Claude. + +Phase 39 boundary: + Lane setup validation and probes are readiness/blocker evidence only, always claimBearing=false. + Phase 40 owns dirty-worktree baseline capture, task x repeat execution, and non-claim-bearing baseline artifacts while claimAllowed=false. + +Anti-scripting boundary: + This runner standardizes prompt, lane card, budgets, traces, structured answer JSON, trajectory, and score artifacts. + It must not script agent decisions, file selection, query rewrites, answer content, or evidence selection. 
+`); +} + +function parseArgs(argv) { + const args = { repeat: 1 }; + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + if (arg === '--help' || arg === '-h') args.help = true; + else if (arg === '--validate-fixtures') args.validateFixtures = true; + else if (arg === '--validate-lane-setup') args.validateLaneSetup = true; + else if (arg === '--baseline-snapshot') args.baselineSnapshot = true; + else if (arg === '--baseline-run') args.baselineRun = true; + else if (arg === '--baseline-refresh') args.baselineRefresh = true; + else if (arg === '--baseline-validate') args.baselineValidate = true; + else if (arg === '--baseline-seal') args.baselineSeal = true; + else if (arg === '--phase42-verify') args.phase42Verify = true; + else if (arg === '--setup-index-measure') args.setupIndexMeasure = true; + else if (arg === '--setup-index-import') args.setupIndexImport = true; + else if (arg === '--quiet') args.quiet = true; + else if (arg === '--baseline-validate-arms') args.baselineValidateArms = argv[++i] ?? ''; + else if (arg === '--baseline-run-codebase-context-arms') + args.baselineRunCodebaseContextArms = true; + else if (arg === '--print-claude-args') args.printClaudeArgs = true; + else if (arg === '--print-answer-schema') args.printAnswerSchema = true; + else if (arg === '--dry-run') args.dryRun = true; + else if (arg === '--score-probe') args.scoreProbe = true; + else if (arg === '--executor') args.executor = argv[++i] ?? ''; + else if (arg === '--model') args.model = argv[++i] ?? ''; + else if (arg === '--lane') args.lane = argv[++i] ?? ''; + else if (arg === '--task-id') args.taskId = argv[++i] ?? ''; + else if (arg === '--repeat') args.repeat = Number(argv[++i] ?? '1'); + else if (arg === '--repeats') args.repeats = Number(argv[++i] ?? '1'); + else if (arg === '--max-attempts') args.maxAttempts = Number(argv[++i] ?? '0'); + else if (arg === '--timeout-ms') args.timeoutMs = Number(argv[++i] ?? 
/**
 * Serializes a value to JSON-like text with object keys sorted, so that equal
 * structures yield equal strings regardless of key insertion order.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Serialized text (primitives go through `JSON.stringify`).
 */
function stableStringify(value) {
  if (value === null || typeof value !== 'object') return JSON.stringify(value);
  if (Array.isArray(value)) {
    const renderedItems = value.map((element) => stableStringify(element));
    return `[${renderedItems.join(',')}]`;
  }
  const renderedMembers = Object.entries(value)
    .sort(([leftKey], [rightKey]) => leftKey.localeCompare(rightKey))
    .map(
      ([memberKey, memberValue]) => `${JSON.stringify(memberKey)}:${stableStringify(memberValue)}`
    );
  return `{${renderedMembers.join(',')}}`;
}

/**
 * Hashes a string as UTF-8 and returns the digest prefixed with `sha256:`.
 *
 * @param {string} value - Text to hash.
 * @returns {string} `sha256:<hex digest>`.
 */
function sha256(value) {
  const digest = createHash('sha256').update(value, 'utf8').digest('hex');
  return `sha256:${digest}`;
}

/**
 * Reduces a dataset field to canonical text before hashing.
 *
 * `undefined` and `null` become their literal names; CR/CRLF line endings are
 * converted to LF; strings that look like a JSON object or array are parsed
 * and re-serialized with sorted keys, falling back to the LF-normalized text
 * when parsing fails.
 *
 * @param {*} value - Raw dataset field.
 * @returns {string} Canonical text.
 */
function canonicalizeDatasetField(value) {
  if (value === undefined) return 'undefined';
  if (value === null) return 'null';

  const toUnixNewlines = (text) => text.replace(/\r\n?/g, '\n');
  if (typeof value !== 'string') return toUnixNewlines(stableStringify(value));

  const unixText = toUnixNewlines(value);
  const candidate = unixText.trim();
  const looksLikeJson = candidate.startsWith('{') || candidate.startsWith('[');
  if (!looksLikeJson) return unixText;
  try {
    return stableStringify(JSON.parse(candidate));
  } catch {
    // Not actually JSON: keep the text as-is apart from newline normalization.
    return unixText;
  }
}

/**
 * Hashes raw bytes and returns the digest prefixed with `sha256:`.
 *
 * @param {Buffer|Uint8Array|string} value - Data passed directly to the hash.
 * @returns {string} `sha256:<hex digest>`.
 */
function sha256Buffer(value) {
  const digest = createHash('sha256').update(value).digest('hex');
  return `sha256:${digest}`;
}

/**
 * Hashes the contents of a file.
 *
 * @param {string} filePath - File to read.
 * @returns {string} `sha256:<hex digest>` of the file bytes.
 */
function hashFile(filePath) {
  const contents = readFileSync(filePath);
  return sha256Buffer(contents);
}

/**
 * Hashes this runner script's own source file.
 *
 * @returns {string} `sha256:<hex digest>` of the file at `RUNNER_SOURCE_PATH`.
 */
function runnerSourceHash() {
  return hashFile(RUNNER_SOURCE_PATH);
}

/**
 * Hashes a value via its key-sorted serialization.
 *
 * @param {*} value - Value to hash.
 * @returns {string} `sha256:<hex digest>`.
 */
function hashObject(value) {
  const canonicalText = stableStringify(value);
  return sha256(canonicalText);
}

/**
 * Reads and parses a UTF-8 JSON file.
 *
 * @param {string} filePath - File to read.
 * @returns {*} Parsed JSON value.
 */
function readJson(filePath) {
  const text = readFileSync(filePath, 'utf8');
  return JSON.parse(text);
}
/**
 * Reports whether the command for a live executor has been overridden through
 * its `CONTEXTBENCH_<EXECUTOR>_COMMAND` environment variable.
 *
 * @param {string} executor - Executor name.
 * @returns {boolean} True when the matching variable is set to a non-empty
 *   value; false for executors without such a variable.
 */
function executorCommandIsOverridden(executor) {
  switch (executor) {
    case 'claude':
      return Boolean(process.env.CONTEXTBENCH_CLAUDE_COMMAND);
    case 'codex':
      return Boolean(process.env.CONTEXTBENCH_CODEX_COMMAND);
    case 'gemini':
      return Boolean(process.env.CONTEXTBENCH_GEMINI_COMMAND);
    case 'opencode':
      return Boolean(process.env.CONTEXTBENCH_OPENCODE_COMMAND);
    default:
      return false;
  }
}

/**
 * Runs a git command in `cwd` and returns its trimmed stdout.
 *
 * `core.longpaths=true` and `core.autocrlf=false` are forced for the call,
 * stdin is fed an empty string, and stderr is discarded.
 *
 * @param {string} cwd - Directory in which git runs.
 * @param {string[]} args - Arguments appended after the `-c` overrides.
 * @returns {string|null} Trimmed stdout, or null when the invocation throws.
 */
function gitOutput(cwd, args) {
  const gitArgs = ['-c', 'core.longpaths=true', '-c', 'core.autocrlf=false', ...args];
  const execOptions = {
    cwd,
    encoding: 'utf8',
    input: '',
    stdio: ['pipe', 'pipe', 'ignore']
  };
  try {
    const stdout = execFileSync('git', gitArgs, execOptions);
    return stdout.trim();
  } catch {
    // Best-effort probe: any failure is reported as "no output".
    return null;
  }
}
/**
 * Writes a value as pretty-printed JSON (2-space indent, trailing newline),
 * creating the parent directory first when needed.
 *
 * @param {string} filePath - Destination file.
 * @param {*} value - Value to serialize.
 */
function writeJson(filePath, value) {
  const parentDir = dirname(filePath);
  mkdirSync(parentDir, { recursive: true });
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  writeFileSync(filePath, serialized, 'utf8');
}

/**
 * Describes an artifact file for inclusion in a manifest.
 *
 * @param {string} filePath - Artifact file on disk.
 * @param {string} rootDir - Directory the recorded path is made relative to.
 * @returns {{ path: string, hash: string, bytes: number }} Normalized relative
 *   path, content hash, and size in bytes.
 */
function artifactEntry(filePath, rootDir) {
  const { size } = statSync(filePath);
  const relativePath = relative(rootDir, filePath);
  return {
    path: normalizePath(relativePath),
    hash: hashFile(filePath),
    bytes: size
  };
}
/**
 * Turns an arbitrary string into a token containing only letters, digits,
 * `.`, `_` and `-`, without leading or trailing dashes, truncated to 160
 * characters.
 *
 * @param {string} value - Text to sanitize.
 * @returns {string} Sanitized token.
 */
function sanitize(value) {
  const dashed = value.replace(/[^a-zA-Z0-9._-]+/g, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed.slice(0, 160);
}

/**
 * Loads the fixture files this runner validates against.
 *
 * @returns {object} Parsed fixtures keyed by name.
 */
function loadFixtures() {
  const fixtureNames = [
    'protocol',
    'lanes',
    'corrections',
    'manifest',
    'laneToolCards',
    'laneSetupEvidence'
  ];
  return Object.fromEntries(fixtureNames.map((name) => [name, readJson(FIXTURES[name])]));
}

/**
 * Detects lane tool cards whose commands still carry the
 * "pending phase 39" placeholder text (matched case-insensitively).
 *
 * @param {object} card - Lane tool card.
 * @returns {boolean} True when any of the four command fields has the placeholder.
 */
function hasPendingPhase39Placeholder(card) {
  const commandTexts = [
    card.setupCommand,
    card.indexCommand,
    card.queryCommand,
    card.versionCommand
  ];
  for (const commandText of commandTexts) {
    if (String(commandText).toLowerCase().includes('pending phase 39')) return true;
  }
  return false;
}

/**
 * Checks the command evidence attached to a lane setup record and appends a
 * message to `errors` for each problem found.
 *
 * Every record needs setup, index, query and version commands; each command
 * needs `command`, `cwd` and `status`; `durationMs` must be null or a finite
 * non-negative number.
 *
 * @param {object} record - Lane setup evidence record.
 * @param {string[]} errors - Collector that receives error messages (mutated).
 */
function validateCommandEvidence(record, errors) {
  const seenKinds = new Set(record.commands?.map((entry) => entry.kind));
  const requiredKinds = ['setup', 'index', 'query', 'version'];
  for (const requiredKind of requiredKinds) {
    if (seenKinds.has(requiredKind)) continue;
    errors.push(`lane ${record.laneId} missing ${requiredKind} command evidence`);
  }

  for (const entry of record.commands ?? []) {
    const isComplete = Boolean(entry.command && entry.cwd && entry.status);
    if (!isComplete) {
      errors.push(
        `lane ${record.laneId} has incomplete ${entry.kind ?? 'unknown'} command evidence`
      );
    }
    const duration = entry.durationMs;
    const durationIsValid = duration === null || (Number.isFinite(duration) && duration >= 0);
    if (!durationIsValid) {
      errors.push(`lane ${record.laneId} has invalid ${entry.kind} duration`);
    }
  }
}
/**
 * Validates the lane setup evidence fixture against the lane list and the
 * lane tool cards.
 *
 * All problems are collected and reported together. Per lane it checks that a
 * card and an evidence record exist, that the status is terminal and matches
 * the card, that no pending Phase 39 placeholder remains, that the evidence
 * is non-claim-bearing, the tool restrictions for non-`raw-native` lanes,
 * separate setup/index cost reporting, duration sanity, and command evidence.
 *
 * @param {object} [fixtures=loadFixtures()] - Loaded fixtures.
 * @returns {object} The same `fixtures` object when validation passes.
 * @throws {Error} Listing every violation when any check fails.
 */
function validateLaneSetupEvidence(fixtures = loadFixtures()) {
  const errors = [];
  const evidence = fixtures.laneSetupEvidence;

  if (evidence.claimBearing !== false) {
    errors.push('lane setup evidence must be non-claim-bearing');
  }
  const outputsPolicy = String(evidence.generatedOutputsPolicy ?? '');
  if (!outputsPolicy.includes('not Phase 40 baseline artifacts')) {
    errors.push(
      'lane setup evidence must keep generated outputs outside Phase 40 baseline artifacts'
    );
  }

  const cardsByLane = new Map(fixtures.laneToolCards.cards.map((card) => [card.laneId, card]));
  const evidenceByLane = new Map(evidence.records.map((record) => [record.laneId, record]));
  const isInvalidDuration = (duration) =>
    duration !== null && (!Number.isFinite(duration) || duration < 0);
  const nativeTools = ['native-read', 'native-search', 'native-shell-readonly'];

  for (const lane of fixtures.lanes.lanes) {
    const laneId = lane.laneId;
    const card = cardsByLane.get(laneId);
    const record = evidenceByLane.get(laneId);
    if (!card) {
      errors.push(`missing lane tool card for ${laneId}`);
      continue;
    }
    if (!record) {
      errors.push(`missing lane setup evidence for ${laneId}`);
      continue;
    }

    const status = record.readinessStatus;
    const isBlocked = BLOCKED_LANE_SETUP_STATUSES.has(status);
    if (status === 'pending') errors.push(`lane ${laneId} remains pending`);
    if (!TERMINAL_LANE_SETUP_STATUSES.has(status)) {
      errors.push(`lane ${laneId} has non-terminal setup status ${status}`);
    }
    if (card.phase39Status !== status) {
      errors.push(`lane ${laneId} card/evidence status mismatch`);
    }

    if (hasPendingPhase39Placeholder(card)) {
      // Placeholder text is always an error; without a terminal blocker it is
      // additionally reported as unresolved.
      if (!isBlocked) {
        errors.push(
          `lane ${laneId} has unresolved pending Phase 39 command without terminal blocker evidence`
        );
      }
      errors.push(`lane ${laneId} still has pending Phase 39 command text`);
    }

    if (record.claimBearing !== false) {
      errors.push(`lane ${laneId} setup evidence must be non-claim-bearing`);
    }

    if (laneId !== 'raw-native') {
      if (card.contextTools.length !== 1 || card.allowedTools.length !== 1) {
        errors.push(`lane ${laneId} must expose exactly one context tool`);
      }
      for (const nativeTool of nativeTools) {
        if (!card.disallowedTools.includes(nativeTool)) {
          errors.push(`lane ${laneId} must disallow ${nativeTool}`);
        }
      }
    }

    if (card.setupCostReportedSeparately !== true || card.indexCostReportedSeparately !== true) {
      errors.push(`lane ${laneId} must separate setup/index cost`);
    }
    if ('taskWallTimeMs' in record) {
      errors.push(`lane ${laneId} setup evidence must not include task wall time`);
    }
    if (isInvalidDuration(record.setupDurationMs)) {
      errors.push(`lane ${laneId} has invalid setup duration`);
    }
    if (isInvalidDuration(record.indexDurationMs)) {
      errors.push(`lane ${laneId} has invalid index duration`);
    }

    validateCommandEvidence(record, errors);
    if (isBlocked) validateTerminalBlockedEvidence(record, errors);
  }

  if (errors.length > 0) throw new Error(`lane setup validation failed:\n- ${errors.join('\n- ')}`);
  return fixtures;
}
/**
 * Converts a path to a forward-slash form with no leading `./` and no leading
 * slashes.
 *
 * @param {string} filePath - Path to normalize.
 * @returns {string} Normalized path.
 */
function normalizePath(filePath) {
  const forwardSlashed = filePath.replace(/\\/g, '/');
  const withoutDotPrefix = forwardSlashed.replace(/^\.\//, '');
  return withoutDotPrefix.replace(/^\/+/, '');
}
/**
 * Reports whether `candidatePath` is `parentPath` itself or lies beneath it.
 *
 * The decision is made on the first segment of the relative path: only a
 * segment that is exactly `..` means the candidate escapes the parent. A
 * plain prefix test would wrongly reject children whose name merely begins
 * with two dots (for example `..hidden/file`).
 *
 * @param {string} parentPath - Directory that should contain the candidate.
 * @param {string} candidatePath - Path to test.
 * @returns {boolean} True when the candidate is the parent or inside it.
 */
function isPathInside(parentPath, candidatePath) {
  const relativePath = relative(parentPath, candidatePath);
  if (relativePath === '') return true;
  // An absolute result means there is no relative route between the two paths.
  if (isAbsolute(relativePath)) return false;
  // On Windows either slash separates segments; elsewhere only "/" does.
  const separator = process.platform === 'win32' ? /[\\/]/ : '/';
  const [firstSegment] = relativePath.split(separator);
  return firstSegment !== '..';
}

/**
 * Converts a structured answer into the trajectory shape handed to the
 * evaluator.
 *
 * Each evidence entry contributes a line span for its file. Files listed in
 * `filesReferenced` that have no evidence span are recorded as full-file spans.
 *
 * @param {object} task - Task with `instance_id`, `repo_url` and `base_commit`.
 * @param {object} answer - Structured answer with `evidence` and `filesReferenced`.
 * @returns {object} Trajectory with sorted `pred_files` and per-file `pred_spans`.
 */
function buildTrajectory(task, answer) {
  const spans = {};
  const files = new Set();
  for (const evidence of answer.evidence) {
    const file = normalizePath(evidence.file);
    files.add(file);
    spans[file] = [
      ...(spans[file] ?? []),
      { start: evidence.lineRange.start, end: evidence.lineRange.end, full_file: false }
    ];
  }
  for (const fileRef of answer.filesReferenced) {
    const file = normalizePath(fileRef);
    files.add(file);
    // Only fall back to a whole-file span when no evidence span exists.
    if (!spans[file]) spans[file] = [{ start: 1, end: null, full_file: true }];
  }
  const predFiles = [...files].sort();
  return {
    instance_id: task.instance_id,
    repo_url: task.repo_url,
    commit: task.base_commit,
    traj_data: {
      pred_steps: [{ files: predFiles, spans }],
      pred_files: predFiles,
      pred_spans: spans
    },
    model_patch: ''
  };
}

/**
 * Extracts the phase number (40 or 41) from a baseline session root.
 *
 * @param {string} sessionRoot - Session directory; resolved before matching.
 * @returns {number} 40 or 41.
 * @throws {Error} When the path is not under
 *   `benchmark-runs/contextbench/phase40/` or `.../phase41/`, or when it
 *   contains an `outputs/` segment.
 */
function baselineSessionPhase(sessionRoot) {
  const resolved = resolve(sessionRoot);
  const normalized = normalizePath(resolved);
  const match = normalized.match(/\/benchmark-runs\/contextbench\/phase(40|41)\//);
  if (!match) {
    throw new Error(
      'Phase 40/41 baseline artifacts must be written under benchmark-runs/contextbench/phase40/ or benchmark-runs/contextbench/phase41/'
    );
  }
  if (normalized.includes('/outputs/')) {
    throw new Error('Phase 40/41 baseline artifacts must not be written under outputs/');
  }
  return Number(match[1]);
}

/**
 * Resolves a session root and verifies that it is a valid Phase 40/41
 * baseline location.
 *
 * @param {string} sessionRoot - Session directory.
 * @returns {string} The resolved absolute path.
 * @throws {Error} When `baselineSessionPhase` rejects the path.
 */
function ensureBaselineSessionRoot(sessionRoot) {
  const resolved = resolve(sessionRoot);
  baselineSessionPhase(resolved);
  return resolved;
}

/**
 * Joins a command and its arguments with single spaces for display.
 *
 * @param {string} command - Executable name.
 * @param {string[]} [args=[]] - Arguments.
 * @returns {string} Space-separated label (no shell quoting is applied).
 */
function commandLabel(command, args = []) {
  return [command, ...args].join(' ');
}
stderr = + error && typeof error === 'object' && 'stderr' in error ? String(error.stderr) : ''; + return stderr.trim() || 'unavailable'; + } +} + +function captureCommand(command, args, cwd, logsDir, label) { + const startedAt = Date.now(); + const result = spawnSync(command, args, { cwd, encoding: 'utf8', input: '' }); + const durationMs = Date.now() - startedAt; + const stdoutPath = join(logsDir, `${label}.stdout.log`); + const stderrPath = join(logsDir, `${label}.stderr.log`); + writeTextArtifact(stdoutPath, result.stdout ?? ''); + writeTextArtifact(stderrPath, result.stderr ?? ''); + return { + command: commandLabel(command, args), + cwd, + exitCode: typeof result.status === 'number' ? result.status : null, + durationMs, + stdoutPath, + stderrPath, + outputHash: sha256(`${result.stdout ?? ''}\n${result.stderr ?? ''}`) + }; +} + +function fixtureHashes() { + return Object.fromEntries( + Object.entries(FIXTURES) + .filter(([, filePath]) => existsSync(filePath)) + .map(([name, filePath]) => [name, hashFile(filePath)]) + ); +} + +function redactedEnvVarNames() { + return Object.keys(process.env) + .filter((name) => /TOKEN|KEY|SECRET|PASSWORD|AUTH|OPENAI|ANTHROPIC|CLAUDE/i.test(name)) + .sort(); +} + +function versionSnapshot() { + return { + os: `${os.platform()} ${os.release()}`, + arch: os.arch(), + shell: process.env.SHELL ?? process.env.ComSpec ?? 
'unknown', + node: process.version, + npm: safeExec('npm', ['--version']), + pnpm: safeExec('pnpm', ['--version']), + git: safeExec('git', ['--version']), + python: safeExec('python', ['--version']), + uv: safeExec('uv', ['--version']), + claude: safeExec('claude', ['--version']) + }; +} + +function listFilesRecursive(rootDir) { + if (!existsSync(rootDir)) return []; + const entries = []; + for (const name of readdirSync(rootDir)) { + const filePath = join(rootDir, name); + const stats = statSync(filePath); + if (stats.isDirectory()) entries.push(...listFilesRecursive(filePath)); + else entries.push(filePath); + } + return entries; +} + +function shouldExcludeUntracked(filePath, bytes) { + const normalized = normalizePath(filePath); + if (normalized.startsWith('benchmark-runs/')) return 'generated_phase40_or_benchmark_output'; + if (normalized.startsWith('outputs/')) return 'generated_output_path'; + if (normalized.startsWith('node_modules/') || normalized.includes('/node_modules/')) + return 'dependency_cache'; + if (normalized.startsWith('.pnpm-store/') || normalized.includes('/.pnpm-store/')) + return 'dependency_cache'; + if (normalized.startsWith('.git/') || normalized.includes('/.git/')) return 'git_internal'; + if (normalized.startsWith('.playwright-mcp/') || normalized.includes('/.playwright-mcp/')) + return 'tool_cache'; + if (bytes > 256 * 1024) return 'large_untracked_file'; + return null; +} + +function parseUntrackedFromStatus(statusText) { + return statusText + .split('\n') + .filter((line) => line.startsWith('? 
')) + .map((line) => line.slice(2).trim()) + .filter(Boolean); +} + +function captureUntrackedEntries(statusText, repoRoot) { + return parseUntrackedFromStatus(statusText).map((filePath) => { + const absolutePath = resolve(repoRoot, filePath); + if (!existsSync(absolutePath)) { + return { + path: normalizePath(filePath), + bytes: null, + mtimeMs: null, + hash: null, + disposition: 'excluded', + exclusionReason: 'missing_at_snapshot_time' + }; + } + const stats = statSync(absolutePath); + if (!stats.isFile()) { + return { + path: normalizePath(filePath), + bytes: stats.size, + mtimeMs: stats.mtimeMs, + hash: null, + disposition: 'excluded', + exclusionReason: 'not_regular_file' + }; + } + const exclusionReason = shouldExcludeUntracked(filePath, stats.size); + if (exclusionReason) { + return { + path: normalizePath(filePath), + bytes: stats.size, + mtimeMs: stats.mtimeMs, + hash: null, + disposition: 'excluded', + exclusionReason + }; + } + return { + path: normalizePath(filePath), + bytes: stats.size, + mtimeMs: stats.mtimeMs, + hash: hashFile(absolutePath), + disposition: 'hashed', + exclusionReason: null + }; + }); +} + +function lockfileArtifacts(repoRoot, sessionRoot) { + const lockfiles = ['pnpm-lock.yaml', 'package-lock.json', 'yarn.lock', 'bun.lockb']; + return lockfiles + .map((name) => resolve(repoRoot, name)) + .filter((filePath) => existsSync(filePath)) + .map((filePath) => ({ + path: normalizePath(relative(sessionRoot, filePath)), + hash: hashFile(filePath), + bytes: statSync(filePath).size + })); +} + +function runGitCapture(args, repoRoot, logsDir, label) { + const captured = captureCommand('git', args, repoRoot, logsDir, label); + const stdout = readFileSync(captured.stdoutPath, 'utf8'); + return { ...captured, stdout }; +} + +function createReservations(fixtures) { + const repeats = + fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? + fixtures.protocol.thresholds?.claimBearingRunsPerTaskLane ?? 
+ 3; + const evidenceByLane = new Map( + fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record]) + ); + const reservations = []; + for (const task of fixtures.manifest.tasks) { + for (const laneId of fixtures.lanes.broadClaimLaneSet) { + const evidence = evidenceByLane.get(laneId); + const blocked = evidence && BLOCKED_LANE_SETUP_STATUSES.has(evidence.readinessStatus); + for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) { + reservations.push({ + laneId, + taskId: task.instance_id, + repeatIndex, + status: blocked ? 'terminal_missing_evidence' : 'reserved', + terminalStatus: blocked ? 'setup_failed' : null, + reason: blocked ? evidence.readinessStatus : null + }); + } + } + } + return reservations; +} + +function buildRunPaths(sessionRoot, runId) { + const runDir = join(sessionRoot, 'runs', runId); + return { + runDir, + prompt: join(runDir, 'prompt.txt'), + laneCard: join(runDir, 'lane-card.json'), + setupIndex: join(runDir, 'setup-index.json'), + rawTrace: join(runDir, 'raw-trace.json'), + structuredAnswer: join(runDir, 'structured-answer.json'), + trajectory: join(runDir, 'trajectory.json'), + score: join(runDir, 'score.json'), + manifest: join(sessionRoot, 'run-manifest.jsonl') + }; +} + +function buildSetupIndexMeasurementPaths(sessionRoot, laneId) { + const root = join(sessionRoot, 'setup-index', laneId); + const logs = join(root, 'logs'); + return { + root, + logs, + artifact: join(root, 'setup-index.json'), + setupStdout: join(logs, 'setup.stdout.log'), + setupStderr: join(logs, 'setup.stderr.log'), + indexStdout: join(logs, 'index.stdout.log'), + indexStderr: join(logs, 'index.stderr.log') + }; +} + +function artifactHashesForPaths(paths) { + return { + prompt: hashFile(paths.prompt), + laneToolCard: hashFile(paths.laneCard), + setupIndex: hashFile(paths.setupIndex), + rawTrace: hashFile(paths.rawTrace), + structuredAnswer: hashFile(paths.structuredAnswer), + trajectory: hashFile(paths.trajectory), + score: 
hashFile(paths.score), + runnerSourceHash: runnerSourceHash() + }; +} + +function optionalHashFile(filePath) { + return existsSync(filePath) ? hashFile(filePath) : null; +} + +function commandEvidenceForKind(record, kind) { + return (record?.commands ?? []).find((command) => command.kind === kind) ?? null; +} + +function outputHashForLogs(stdoutPath, stderrPath) { + return sha256(`${readFileSync(stdoutPath, 'utf8')}\n${readFileSync(stderrPath, 'utf8')}`); +} + +function normalizeMeasurementLogPath(sessionRoot, filePath) { + if (!filePath) return null; + return isAbsolute(filePath) ? filePath : join(sessionRoot, filePath); +} + +function validateMeasuredSetupIndex(sessionRoot, laneCard, measurement) { + const errors = []; + if (!measurement || typeof measurement !== 'object') errors.push('measurement must be an object'); + if (measurement?.laneId !== laneCard.laneId) errors.push('measurement laneId mismatch'); + if (measurement?.claimBearing !== false) errors.push('measurement must be non-claim-bearing'); + const setupStatus = measurement?.setupStatus; + const indexStatus = measurement?.indexStatus; + if (!['completed', 'not_required', 'setup_failed'].includes(setupStatus)) + errors.push('invalid setupStatus'); + if (!['completed', 'not_required', 'index_failed'].includes(indexStatus)) + errors.push('invalid indexStatus'); + for (const [field, status] of [ + ['setupDurationMs', setupStatus], + ['indexDurationMs', indexStatus] + ]) { + const duration = measurement?.[field]; + if (typeof duration !== 'number' || !Number.isFinite(duration) || duration < 0) { + errors.push(`${field} must be a finite non-negative number`); + } + if (status === 'completed' && duration <= 0) errors.push(`${field} must be positive when completed`); + } + for (const field of ['setupLogPath', 'indexLogPath']) { + const logPath = normalizeMeasurementLogPath(sessionRoot, measurement?.[field]); + if (!logPath) { + errors.push(`${field} missing`); + continue; + } + if (!isPathInside(sessionRoot, 
logPath)) { + errors.push(`${field} must stay inside session root`); + } else if (!existsSync(logPath)) { + errors.push(`${field} missing artifact`); + } + } + return errors; +} + +function rowSetupIndexFromMeasurement(measurement) { + return { + setupCommand: measurement.setupCommand, + indexCommand: measurement.indexCommand, + setupDurationMs: measurement.setupDurationMs, + indexDurationMs: measurement.indexDurationMs, + setupLogPath: measurement.setupLogPath, + indexLogPath: measurement.indexLogPath, + setupStatus: measurement.setupStatus, + indexStatus: measurement.indexStatus + }; +} + +function readMeasuredSetupIndex(sessionRoot, laneCard) { + const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId); + if (!existsSync(paths.artifact)) return null; + const measurement = readJson(paths.artifact); + const errors = validateMeasuredSetupIndex(sessionRoot, laneCard, measurement); + if (errors.length > 0) throw new Error(`setup/index measurement invalid for ${laneCard.laneId}:\n- ${errors.join('\n- ')}`); + return rowSetupIndexFromMeasurement(measurement); +} + +function defaultRawNativeSetupIndex(sessionRoot, laneCard) { + const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId); + mkdirSync(paths.logs, { recursive: true }); + writeTextArtifact(paths.setupStdout, 'raw-native setup not required\n'); + writeTextArtifact(paths.setupStderr, ''); + writeTextArtifact(paths.indexStdout, 'raw-native index not required\n'); + writeTextArtifact(paths.indexStderr, ''); + return { + laneId: laneCard.laneId, + claimBearing: false, + measuredAt: new Date().toISOString(), + measurementMode: 'not_required', + setupCommand: laneCard.setupCommand, + indexCommand: laneCard.indexCommand, + setupDurationMs: 0, + indexDurationMs: 0, + setupLogPath: paths.setupStdout, + indexLogPath: paths.indexStdout, + setupStatus: 'not_required', + indexStatus: 'not_required', + commands: [ + { + kind: 'setup', + command: laneCard.setupCommand, + executed: false, 
+ exitCode: 0, + durationMs: 0, + stdoutLogPath: paths.setupStdout, + stderrLogPath: paths.setupStderr, + outputHash: outputHashForLogs(paths.setupStdout, paths.setupStderr) + }, + { + kind: 'index', + command: laneCard.indexCommand, + executed: false, + exitCode: 0, + durationMs: 0, + stdoutLogPath: paths.indexStdout, + stderrLogPath: paths.indexStderr, + outputHash: outputHashForLogs(paths.indexStdout, paths.indexStderr) + } + ] + }; +} + +function laneTelemetryOverrides() { + const raw = process.env.CONTEXTBENCH_LANE_TELEMETRY_JSON; + if (!raw) return {}; + try { + const parsed = JSON.parse(raw); + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}; + } catch { + throw new Error('CONTEXTBENCH_LANE_TELEMETRY_JSON must be a JSON object'); + } +} + +function buildLaneIsolationEvidence(laneCard) { + const telemetry = laneTelemetryOverrides()[laneCard.laneId]; + const observedTools = Array.isArray(telemetry?.observedTools) + ? telemetry.observedTools.filter((tool) => typeof tool === 'string') + : []; + const disallowedObserved = observedTools.filter((tool) => laneCard.disallowedTools.includes(tool)); + const unknownObserved = observedTools.filter((tool) => !laneCard.allowedTools.includes(tool)); + const expectedContextTool = laneCard.contextTools[0] ?? laneCard.laneId; + const rawNative = laneCard.laneId === 'raw-native'; + const expectedObserved = rawNative + ? observedTools.length > 0 && unknownObserved.length === 0 + : observedTools.length === 1 && observedTools[0] === expectedContextTool; + const violations = [...disallowedObserved, ...unknownObserved].map((tool) => `unexpected_tool_${tool}`); + const proven = Boolean(telemetry?.proofSource) && expectedObserved && violations.length === 0; + return { + laneId: laneCard.laneId, + proven, + sourceKind: telemetry?.proofSource ? 'env_override' : 'not_captured', + proofSource: typeof telemetry?.proofSource === 'string' ? 
telemetry.proofSource : 'not_captured', + expectedContextTool, + allowedTools: laneCard.allowedTools, + disallowedTools: laneCard.disallowedTools, + observedTools, + violations + }; +} + +function runOfficialEvaluatorForAttempt(fixtures, paths, task, executor, status) { + if (executor === 'fake') { + return { + status, + mode: 'diagnostic_fallback', + ...diagnosticFallbackScoring(fixtures, 'fake_executor_smoke_only') + }; + } + if (status !== 'completed') { + return { + status, + mode: 'diagnostic_fallback', + ...diagnosticFallbackScoring(fixtures, 'agent_attempt_not_completed') + }; + } + if (executorCommandIsOverridden(executor) && !process.env.CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND) { + return { + status, + mode: 'diagnostic_fallback', + ...diagnosticFallbackScoring(fixtures, 'overridden_executor_smoke_no_official_evaluator') + }; + } + + const officialGoldPath = join(paths.runDir, 'official-gold-input.json'); + const officialOutputPath = join(paths.runDir, 'official-results.jsonl'); + const stdoutPath = join(paths.runDir, 'official-evaluator.stdout.log'); + const stderrPath = join(paths.runDir, 'official-evaluator.stderr.log'); + writeJson(officialGoldPath, { + instance_id: task.instance_id, + gold_context_ref: task.gold_context_ref, + gold_context_hash: task.gold_context_hash, + hash_canonicalization_version: task.hash_canonicalization_version + }); + + const evaluator = officialEvaluatorCommandParts(); + const evaluatorArgs = [ + ...evaluator.prefixArgs, + '--gold', + officialGoldPath, + '--pred', + paths.trajectory, + '--out', + officialOutputPath + ]; + const result = spawnSync(evaluator.command, evaluatorArgs, { + encoding: 'utf8', + cwd: paths.runDir, + timeout: fixtures.protocol.budgets.defaults.timeoutSeconds * 1000 + }); + const stdout = result.stdout ?? ''; + const stderr = result.stderr ?? ''; + writeTextArtifact(stdoutPath, stdout); + writeTextArtifact(stderrPath, stderr); + const exitStatus = typeof result.status === 'number' ? 
result.status : null; + const command = `${evaluator.command} ${evaluatorArgs.join(' ')}`; + const outputValidation = validateOfficialEvaluatorOutputEnvelope(officialOutputPath, task); + if (exitStatus === 0 && outputValidation.valid) { + return { + status: 'completed', + mode: 'official_evaluator', + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + command, + claimBearing: fixtures.protocol.claimAllowed === true, + stdoutPath, + stderrPath, + outputPath: officialOutputPath, + outputHash: hashFile(officialOutputPath), + stdoutHash: hashFile(stdoutPath), + stderrHash: hashFile(stderrPath), + exitCode: exitStatus, + exitStatus + }; + } + return { + status: 'judge_failed', + mode: 'diagnostic_fallback', + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + command, + claimBearing: false, + fallbackReason: outputValidation.reason ?? 'official_evaluator_failed', + stdoutPath, + stderrPath, + outputPath: officialOutputPath, + outputHash: optionalHashFile(officialOutputPath), + stdoutHash: hashFile(stdoutPath), + stderrHash: hashFile(stderrPath), + exitCode: exitStatus, + exitStatus, + spawnError: result.error?.message ?? 
null + }; +} + +function validateOfficialEvaluatorOutputEnvelope(outputPath, task) { + if (!existsSync(outputPath)) return { valid: false, reason: 'official_evaluator_missing_output' }; + const content = readFileSync(outputPath, 'utf8'); + if (!content.trim()) return { valid: false, reason: 'official_evaluator_empty_output' }; + const lines = content.split(/\r?\n/).filter((line) => line.trim().length > 0); + const expectedTaskIds = new Set([task.instance_id, task.original_inst_id].filter(Boolean)); + for (const line of lines) { + let parsed; + try { + parsed = JSON.parse(line); + } catch { + return { valid: false, reason: 'official_evaluator_malformed_jsonl' }; + } + if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) { + return { valid: false, reason: 'official_evaluator_non_object_jsonl' }; + } + const declaredTaskId = parsed.instance_id ?? parsed.task_id ?? parsed.taskId ?? parsed.id; + if (typeof declaredTaskId === 'string' && expectedTaskIds.size > 0 && !expectedTaskIds.has(declaredTaskId)) { + return { valid: false, reason: 'official_evaluator_task_mismatch' }; + } + } + return { valid: true, reason: null }; +} + +function appendRunManifestRow(sessionRoot, row) { + appendFileSync(join(sessionRoot, 'run-manifest.jsonl'), `${JSON.stringify(row)}\n`, 'utf8'); +} + +function buildManifestRowForArtifacts(params) { + return { + run_id: params.runId, + protocol_version: params.fixtures.protocol.protocolVersion, + protocol_hash: hashObject(params.fixtures.protocol), + task_manifest_hash: params.fixtures.manifest.manifest_hash, + lane_id: params.laneCard.laneId, + task_id: params.task.instance_id, + repeat_index: params.repeatIndex, + status: params.status, + started_at: params.startedAt, + completed_at: params.completedAt, + raw_trace_path: params.paths.rawTrace, + structured_answer_path: params.paths.structuredAnswer, + trajectory_path: params.paths.trajectory, + score_path: params.paths.score, + setup_index_path: params.paths.setupIndex, + 
prompt_path: params.paths.prompt, + lane_tool_card_path: params.paths.laneCard, + setupIndex: params.setupIndex, + taskExecution: { + model: params.model, + timeoutSeconds: params.fixtures.protocol.budgets.defaults.timeoutSeconds, + maxContextTokens: params.fixtures.protocol.budgets.defaults.maxContextTokens, + maxAnswerTokens: params.fixtures.protocol.budgets.defaults.maxAnswerTokens, + startedAt: params.startedAt, + completedAt: params.completedAt, + taskWallTimeMs: new Date(params.completedAt).getTime() - new Date(params.startedAt).getTime(), + executor: params.executor + }, + scoring: params.scoring, + hashes: artifactHashesForPaths(params.paths) + }; +} + +function writeBlockedRunRows(sessionRoot, fixtures, reservations) { + const cardsByLane = new Map(fixtures.laneToolCards.cards.map((card) => [card.laneId, card])); + const evidenceByLane = new Map( + fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record]) + ); + const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task])); + for (const reservation of reservations.filter( + (slot) => slot.status === 'terminal_missing_evidence' + )) { + const laneCard = cardsByLane.get(reservation.laneId); + const task = tasksById.get(reservation.taskId); + const evidence = evidenceByLane.get(reservation.laneId); + if (!laneCard || !task || !evidence) continue; + const runId = sanitize( + `${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence` + ); + const paths = buildRunPaths(sessionRoot, runId); + const startedAt = new Date().toISOString(); + const completedAt = startedAt; + const setupIndex = { + setupCommand: laneCard.setupCommand, + indexCommand: laneCard.indexCommand, + setupDurationMs: evidence.setupDurationMs ?? 0, + indexDurationMs: evidence.indexDurationMs ?? 0, + setupLogPath: evidence.logReference ?? paths.setupIndex, + indexLogPath: evidence.logReference ?? 
paths.setupIndex, + setupStatus: 'setup_failed', + indexStatus: evidence.readinessStatus === 'index_failed' ? 'index_failed' : 'not_required' + }; + const prompt = `Terminal missing evidence for ${task.instance_id} in ${laneCard.laneId}; no agent task prompt executed.`; + writeTextArtifact(paths.prompt, prompt); + writeJson(paths.laneCard, laneCard); + writeJson(paths.setupIndex, { ...setupIndex, evidence }); + writeJson(paths.rawTrace, { + executor: 'none', + runnerHash: runnerSourceHash(), + claimBearing: false, + status: 'setup_failed', + laneReadinessStatus: evidence.readinessStatus, + reason: reservation.reason, + laneIsolation: buildLaneIsolationEvidence(laneCard), + scriptedAgentDecisions: false + }); + writeJson(paths.structuredAnswer, { + status: 'not_attempted_missing_evidence', + claimBearing: false + }); + writeJson(paths.trajectory, { status: 'not_attempted_missing_evidence', pred_files: [] }); + writeJson(paths.score, { + status: 'setup_failed', + mode: 'missing_evidence', + claimBearing: false, + reason: reservation.reason + }); + appendRunManifestRow( + sessionRoot, + buildManifestRowForArtifacts({ + runId, + fixtures, + laneCard, + task, + repeatIndex: reservation.repeatIndex, + status: 'setup_failed', + startedAt, + completedAt, + paths, + setupIndex, + executor: 'fake', + model: 'not-run-missing-evidence', + scoring: diagnosticFallbackScoring( + fixtures, + `terminal_missing_evidence:${reservation.reason}` + ) + }) + ); + } +} + +function computeSessionHash(session) { + return hashObject({ ...session, sessionHash: '' }); +} + +function writeSession(sessionRoot, session) { + const nextSession = { ...session, updatedAt: new Date().toISOString() }; + nextSession.sessionHash = computeSessionHash(nextSession); + writeJson(join(sessionRoot, 'BASELINE-SESSION.json'), nextSession); + return nextSession; +} + +function readSession(sessionRoot) { + return readJson(join(sessionRoot, 'BASELINE-SESSION.json')); +} + +function refreshArtifactIndex(sessionRoot) 
{ + return listFilesRecursive(sessionRoot) + .filter((filePath) => !filePath.endsWith('BASELINE-SESSION.json')) + .map((filePath) => artifactEntry(filePath, sessionRoot)) + .sort((a, b) => a.path.localeCompare(b.path)); +} + +function createBaselineSnapshot(args) { + if (!args.out) throw new Error('--baseline-snapshot requires --out '); + const fixtures = validateFixtures(); + const repoRoot = process.cwd(); + const sessionRoot = ensureBaselineSessionRoot(args.out); + const phase = baselineSessionPhase(sessionRoot); + const sessionId = sessionRoot.split(/[\\/]/).filter(Boolean).at(-1) ?? 'phase40-session'; + const snapshotDir = join(sessionRoot, 'snapshot'); + const gitDir = join(snapshotDir, 'git'); + const logsDir = join(snapshotDir, 'commands'); + mkdirSync(gitDir, { recursive: true }); + mkdirSync(logsDir, { recursive: true }); + + const status = runGitCapture( + ['status', '--porcelain=v2', '--branch', '--untracked-files=all'], + repoRoot, + logsDir, + 'git-status' + ); + const trackedDiff = runGitCapture(['diff', '--no-ext-diff'], repoRoot, logsDir, 'git-diff'); + const stagedDiff = runGitCapture( + ['diff', '--cached', '--no-ext-diff'], + repoRoot, + logsDir, + 'git-diff-staged' + ); + const diffStat = runGitCapture(['diff', '--stat'], repoRoot, logsDir, 'git-diff-stat'); + const statusPath = join(gitDir, 'status-porcelain-v2.txt'); + const trackedDiffPath = join(gitDir, 'tracked.diff'); + const stagedDiffPath = join(gitDir, 'staged.diff'); + const diffStatPath = join(gitDir, 'diff-stat.txt'); + writeTextArtifact(statusPath, status.stdout); + writeTextArtifact(trackedDiffPath, trackedDiff.stdout); + writeTextArtifact(stagedDiffPath, stagedDiff.stdout); + writeTextArtifact(diffStatPath, diffStat.stdout); + + const reservations = createReservations(fixtures); + const reservationsPath = join(sessionRoot, 'slot-reservations.json'); + writeJson(reservationsPath, { claimBearing: false, reservations }); + writeBlockedRunRows(sessionRoot, fixtures, reservations); + 
+ const snapshotWithoutHash = { + branch: safeExec('git', ['rev-parse', '--abbrev-ref', 'HEAD']), + head: safeExec('git', ['rev-parse', 'HEAD']), + divergence: { + status: 'unavailable', + reason: + 'Phase 40 plan records main as unavailable locally; divergence is captured as unavailable instead of inferred.' + }, + gitStatusPath: normalizePath(relative(sessionRoot, statusPath)), + trackedDiffPath: normalizePath(relative(sessionRoot, trackedDiffPath)), + stagedDiffPath: normalizePath(relative(sessionRoot, stagedDiffPath)), + diffStatPath: normalizePath(relative(sessionRoot, diffStatPath)), + untracked: captureUntrackedEntries(status.stdout, repoRoot), + lockfiles: lockfileArtifacts(repoRoot, sessionRoot), + redactedEnvVarNames: redactedEnvVarNames(), + versions: versionSnapshot(), + fixtureHashes: fixtureHashes(), + commandTranscript: [status, trackedDiff, stagedDiff, diffStat].map((entry) => ({ + command: entry.command, + cwd: entry.cwd, + exitCode: entry.exitCode, + stdoutPath: normalizePath(relative(sessionRoot, entry.stdoutPath)), + stderrPath: normalizePath(relative(sessionRoot, entry.stderrPath)), + outputHash: entry.outputHash + })) + }; + const snapshot = { ...snapshotWithoutHash, snapshotHash: hashObject(snapshotWithoutHash) }; + let session = { + sessionId, + phase, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + sessionRoot: normalizePath(sessionRoot), + claimBearing: false, + sealed: false, + snapshot, + reservationsPath: normalizePath(relative(sessionRoot, reservationsPath)), + runManifestPath: 'run-manifest.jsonl', + artifactIndex: [], + sessionHash: '' + }; + session.artifactIndex = refreshArtifactIndex(sessionRoot); + session = writeSession(sessionRoot, session); + console.log(`baseline snapshot wrote ${join(sessionRoot, 'BASELINE-SESSION.json')}`); +} + +function validateBaselineArms(filePath) { + if (!filePath) throw new Error('--baseline-validate-arms requires a fixture path'); + const fixture = readJson(filePath); 
+ const errors = []; + if (fixture.phase !== 40) errors.push('baseline arms fixture must be Phase 40 metadata'); + if (fixture.claimBearing !== false) errors.push('baseline arms must be non-claim-bearing'); + if (!String(fixture.denominatorPolicy ?? '').includes('separate')) { + errors.push('baseline arms must stay separate from required competitor denominators'); + } + const seen = new Set(); + for (const arm of fixture.arms ?? []) { + if (!arm.baselineArmId || seen.has(arm.baselineArmId)) + errors.push(`invalid duplicate baseline arm ${arm.baselineArmId}`); + seen.add(arm.baselineArmId); + if (arm.laneId !== 'codebase-context') + errors.push(`arm ${arm.baselineArmId} must stay under codebase-context`); + if (arm.claimBearing !== false) + errors.push(`arm ${arm.baselineArmId} must be non-claim-bearing`); + if (!Array.isArray(arm.allowedToolSurfaces) || arm.allowedToolSurfaces.length === 0) { + errors.push(`arm ${arm.baselineArmId} needs existing tool surfaces`); + } + if (arm.failurePolicy !== 'record_terminal_diagnostic_failure') { + errors.push(`arm ${arm.baselineArmId} must record failures instead of patching products`); + } + } + if (errors.length > 0) + throw new Error(`baseline arm validation failed:\n- ${errors.join('\n- ')}`); + console.log('baseline arm validation passed'); +} + +function runSetupIndexMeasure(args) { + if (!args.session) throw new Error('--setup-index-measure requires --session '); + const sessionRoot = ensureBaselineSessionRoot(args.session); + if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json'))) + throw new Error('baseline session snapshot missing'); + const fixtures = validateFixtures(); + const cardsByLane = new Map(fixtures.laneToolCards.cards.map((card) => [card.laneId, card])); + const lanes = args.allReadyLanes ? 
fixtures.lanes.broadClaimLaneSet : [args.lane]; + let measured = 0; + for (const laneId of lanes) { + const laneCard = cardsByLane.get(laneId); + if (!laneCard) throw new Error(`unknown lane: ${laneId}`); + if (laneCard.laneId !== 'raw-native') { + if (args.allReadyLanes) continue; + throw new Error( + `setup/index measurement for ${laneCard.laneId} requires --setup-index-import until safe isolated command execution is implemented` + ); + } + const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId); + const measurement = defaultRawNativeSetupIndex(sessionRoot, laneCard); + writeJson(paths.artifact, measurement); + measured += 1; + } + const session = readSession(sessionRoot); + session.artifactIndex = refreshArtifactIndex(sessionRoot); + writeSession(sessionRoot, session); + console.log(`setup/index measurement wrote ${measured} lane artifact(s)`); +} + +function runSetupIndexImport(args) { + if (!args.session || !args.lane || !args.input) + throw new Error('--setup-index-import requires --session, --lane, and --input'); + const sessionRoot = ensureBaselineSessionRoot(args.session); + if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json'))) + throw new Error('baseline session snapshot missing'); + const fixtures = validateFixtures(); + const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === args.lane); + if (!laneCard) throw new Error(`unknown lane: ${args.lane}`); + const inputPath = isAbsolute(args.input) ? args.input : resolve(process.cwd(), args.input); + const imported = readJson(inputPath); + if (imported.laneId !== laneCard.laneId) + throw new Error(`setup/index import laneId mismatch: expected ${laneCard.laneId}`); + if (imported.claimBearing !== false) + throw new Error('setup/index import must be non-claim-bearing'); + const measurement = { + ...imported, + setupCommand: imported.setupCommand ?? laneCard.setupCommand, + indexCommand: imported.indexCommand ?? 
laneCard.indexCommand, + setupLogPath: normalizeMeasurementLogPath(sessionRoot, imported.setupLogPath), + indexLogPath: normalizeMeasurementLogPath(sessionRoot, imported.indexLogPath), + importedFrom: normalizePath(inputPath), + importedAt: new Date().toISOString() + }; + const errors = validateMeasuredSetupIndex(sessionRoot, laneCard, measurement); + if (errors.length > 0) + throw new Error(`setup/index import invalid for ${laneCard.laneId}:\n- ${errors.join('\n- ')}`); + const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId); + writeJson(paths.artifact, measurement); + const session = readSession(sessionRoot); + session.artifactIndex = refreshArtifactIndex(sessionRoot); + writeSession(sessionRoot, session); + console.log(`setup/index import wrote ${paths.artifact}`); +} + +function makePrompt(task, laneCard, taskContext = null) { + const lines = [ + `ContextBench task: ${task.instance_id}`, + `Repository: ${task.repo_url}`, + `Base commit: ${task.base_commit}`, + taskContext?.problemStatement + ? `Problem statement hash: ${task.problem_statement_hash}` + : `Problem statement reference: ${task.problem_statement_ref}`, + `Gold context reference is hidden from the solver; do not infer from fixture answers.`, + `Lane: ${laneCard.laneId}`, + `Allowed context tools: ${laneCard.allowedTools.join(', ')}`, + `Disallowed context tools: ${laneCard.disallowedTools.join(', ')}`, + 'Return only JSON with fields: answer, confidence, evidence, filesReferenced, symbolsReferenced, unsupportedClaims, readyToEdit.', + 'Do not use tools outside the lane card. Do not fabricate files or line spans.' + ]; + if (taskContext?.repoCheckoutPath) { + lines.splice(3, 0, `Local checkout: ${taskContext.repoCheckoutPath}`); + } + if (taskContext?.problemStatement) { + lines.push('', 'Problem statement:', taskContext.problemStatement); + } + return lines.join('\n'); +} + +function parseAnswerForBaseline(stdout) { + const trimmed = String(stdout ?? 
'').trim(); + if (!trimmed) return { answer: null, errors: ['missing_json'] }; + try { + const parsed = JSON.parse(trimmed); + return validateStructuredAnswerObject(parsed); + } catch { + return { answer: null, errors: ['invalid_json'] }; + } +} + +function classifyClaudeCliDiagnostic(stdout, stderr) { + const text = `${stdout ?? ''}\n${stderr ?? ''}`.toLowerCase(); + if (text.includes("you've hit your limit") || text.includes('rate limit')) + return 'claude_rate_limit'; + if (text.includes('not authenticated') || text.includes('please run') || text.includes('login')) { + return 'claude_auth_required'; + } + return null; +} + +function parseClaudeAnswerForBaseline(stdout, stderr) { + const trimmed = String(stdout ?? '').trim(); + const diagnostic = classifyClaudeCliDiagnostic(stdout, stderr); + if (!trimmed) { + return { + answer: null, + errors: diagnostic ? ['missing_json', diagnostic] : ['missing_json'], + toolError: diagnostic !== null + }; + } + + let parsed; + try { + parsed = JSON.parse(trimmed); + } catch { + return { + answer: null, + errors: diagnostic ? ['invalid_json', diagnostic] : ['invalid_json'], + toolError: diagnostic !== null + }; + } + + if (!isRecord(parsed) || parsed.type !== 'result') { + return { ...validateStructuredAnswerObject(parsed), toolError: false }; + } + + if (parsed.is_error === true) { + return { + answer: null, + errors: [`claude_error_${parsed.api_error_status ?? 
/**
 * True for plain object-like values: non-null, typeof 'object', not an array.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isRecord(value) {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}

/**
 * True when value is an array whose entries are all strings (an empty array
 * qualifies).
 *
 * @param {*} value
 * @returns {boolean}
 */
function isStringArray(value) {
  return Array.isArray(value) && value.every((entry) => typeof entry === 'string');
}

/**
 * List error codes for keys of `value` that are not in `allowedFields`.
 *
 * @param {object} value - Object whose own enumerable keys are checked.
 * @param {string[]} allowedFields - Permitted key names.
 * @param {string} prefix - Inserted into each code: `additional_<prefix>_<key>`.
 * @returns {string[]} One code per unexpected key, in key order.
 */
function findAdditionalFields(value, allowedFields, prefix) {
  return Object.keys(value)
    .filter((field) => !allowedFields.includes(field))
    .map((field) => `additional_${prefix}_${field}`);
}

/**
 * Recursively check that a value is representable as JSON: null, string,
 * boolean, finite number, or arrays/records composed of those.
 *
 * NaN and +/-Infinity are rejected: JSON has no literal for them and
 * JSON.stringify would silently turn them into null.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isJsonValue(value) {
  if (value === null) return true;
  if (typeof value === 'number') return Number.isFinite(value);
  if (typeof value === 'string' || typeof value === 'boolean') return true;
  if (Array.isArray(value)) return value.every((entry) => isJsonValue(entry));
  if (!isRecord(value)) return false;
  return Object.values(value).every((entry) => isJsonValue(entry));
}

/**
 * Validate a single evidence entry of a structured answer.
 *
 * Requires a record with no keys outside EVIDENCE_REFERENCE_FIELDS, a
 * `lineRange` record with no keys outside LINE_RANGE_FIELDS, non-blank string
 * `file` and `reason`, and integer `start`/`end` with 1 <= start <= end.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isValidEvidenceReference(value) {
  if (!isRecord(value)) return false;
  if (findAdditionalFields(value, EVIDENCE_REFERENCE_FIELDS, 'evidence_field').length > 0)
    return false;
  if (!isRecord(value.lineRange)) return false;
  if (findAdditionalFields(value.lineRange, LINE_RANGE_FIELDS, 'line_range_field').length > 0)
    return false;
  const { start, end } = value.lineRange;
  return (
    typeof value.file === 'string' &&
    value.file.trim().length > 0 &&
    typeof value.reason === 'string' &&
    value.reason.trim().length > 0 &&
    Number.isInteger(start) &&
    Number.isInteger(end) &&
    start > 0 &&
    end >= start
  );
}
+ if (!isRecord(value)) return { answer: null, errors: ['answer_root_not_object'] }; + for (const field of CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS) { + if (!(field in value)) errors.push(`missing_${field}`); + } + errors.push( + ...findAdditionalFields( + value, + CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS, + 'root_field' + ) + ); + if (!isJsonValue(value.answer)) errors.push('answer_not_json_value'); + if ( + typeof value.confidence !== 'string' || + !['low', 'medium', 'high'].includes(value.confidence) + ) { + errors.push('invalid_confidence'); + } + if (!Array.isArray(value.evidence)) errors.push('evidence_not_array'); + if (!isStringArray(value.filesReferenced)) errors.push('files_referenced_not_string_array'); + if (!isStringArray(value.symbolsReferenced)) errors.push('symbols_referenced_not_string_array'); + if (!isStringArray(value.unsupportedClaims)) errors.push('unsupported_claims_not_string_array'); + if (typeof value.readyToEdit !== 'boolean') errors.push('ready_to_edit_not_boolean'); + const evidence = Array.isArray(value.evidence) ? 
/**
 * Resolve the command and leading arguments used to launch an executor CLI.
 *
 * Each supported executor has a default binary name and an environment
 * variable that may override it. The override must be a JSON array of
 * strings: the first entry is the command, the rest are prefix arguments
 * placed before the runner's own arguments.
 *
 * The lookup uses a Map so that inherited object keys such as "toString" or
 * "constructor" are rejected as unsupported executors instead of slipping
 * past the guard.
 *
 * @param {string} executor - One of 'claude', 'codex', 'gemini', 'opencode'.
 * @returns {{command: string, prefixArgs: string[]}}
 * @throws {Error} When the executor is unsupported, or the override is not a
 *   non-empty JSON array of strings.
 */
function commandPartsForExecutor(executor) {
  const executors = new Map([
    ['claude', { envVar: 'CONTEXTBENCH_CLAUDE_COMMAND', command: 'claude' }],
    ['codex', { envVar: 'CONTEXTBENCH_CODEX_COMMAND', command: 'codex' }],
    ['gemini', { envVar: 'CONTEXTBENCH_GEMINI_COMMAND', command: 'gemini' }],
    ['opencode', { envVar: 'CONTEXTBENCH_OPENCODE_COMMAND', command: 'opencode' }]
  ]);
  const entry = executors.get(executor);
  if (!entry) throw new Error(`unsupported executor: ${executor}`);
  const { envVar, command } = entry;
  const override = process.env[envVar];
  // Unset or empty override: fall back to the default binary on PATH.
  if (!override) return { command, prefixArgs: [] };
  let parts;
  try {
    parts = JSON.parse(override);
  } catch (cause) {
    // Keep the parser's message reachable for whoever debugs the env var.
    throw new Error(`${envVar} must be a JSON array`, { cause });
  }
  if (
    !Array.isArray(parts) ||
    parts.length === 0 ||
    parts.some((part) => typeof part !== 'string')
  ) {
    throw new Error(`${envVar} must be a non-empty JSON string array`);
  }
  return { command: parts[0], prefixArgs: parts.slice(1) };
}
/**
 * Turn the raw output of an external executor CLI into a structured answer.
 *
 * Dispatch:
 * - claude: delegated entirely to parseClaudeAnswerForBaseline.
 * - codex with an existing answer file: the file is parsed; if it holds no
 *   valid answer, the JSONL event stream on stdout is checked for an error
 *   event, which marks the result as a tool error.
 * - everything else: stdout is parsed with the executor-specific parser
 *   (gemini envelope, JSONL events for opencode/codex, plain JSON otherwise).
 *
 * The keyword-based CLI diagnostic (auth / rate-limit markers) is consulted
 * only after parsing has failed to produce an answer. It scans stdout, which
 * is model output, so applying it first would discard a valid answer merely
 * because its text mentions e.g. "login" or "quota".
 *
 * @param {string} executor - Executor name.
 * @param {string} stdout - Captured standard output.
 * @param {string} stderr - Captured standard error.
 * @param {string|null} answerPath - File the executor was asked to write its
 *   final message to, or null when the executor has no such file.
 * @returns {{answer: object|null, errors: string[], toolError: boolean}}
 */
function parseExternalAnswerForBaseline(executor, stdout, stderr, answerPath) {
  if (executor === 'claude') return parseClaudeAnswerForBaseline(stdout, stderr);
  if (executor === 'codex' && answerPath && existsSync(answerPath)) {
    const parsed = parseAnswerForBaseline(readFileSync(answerPath, 'utf8'));
    if (parsed.answer) return { ...parsed, toolError: false };
    const eventDiagnostic = classifyJsonEventDiagnostic(executor, stdout);
    if (eventDiagnostic)
      return { answer: null, errors: [...parsed.errors, eventDiagnostic], toolError: true };
    return { ...parsed, toolError: false };
  }

  let parsed;
  if (executor === 'gemini') {
    parsed = parseGeminiAnswer(stdout);
  } else if (executor === 'opencode' || executor === 'codex') {
    parsed = parseJsonEventAnswer(stdout);
  } else {
    parsed = parseAnswerForBaseline(stdout);
  }
  // Normalize so every return path carries a boolean toolError.
  parsed = { ...parsed, toolError: parsed.toolError === true };
  if (parsed.answer) return parsed;

  // No usable answer: a recognizable auth/rate-limit message explains why and
  // takes precedence over the generic parse errors.
  const diagnostic = classifyExternalCliDiagnostic(executor, stdout, stderr);
  if (diagnostic) return { answer: null, errors: [diagnostic], toolError: true };
  return parsed;
}
/**
 * Extract a structured answer from a JSONL event stream.
 *
 * Events are scanned from last to first. For each event that parses to a
 * record, the string payloads under `content`, `message`, `text`, `response`
 * and `part.text` are tried as answer JSON, then the event object itself is
 * validated as an answer. The first candidate that yields a valid answer is
 * returned.
 *
 * A payload that does not parse as an answer no longer ends the scan: a
 * trailing status event with a plain-text `message` must not hide an earlier
 * event that carries the answer. When nothing validates, the failure from the
 * most recent string payload is reported; if no event carried a string
 * payload, the whole stream is parsed as a single JSON document.
 *
 * @param {*} stdout - Raw standard output of the executor.
 * @returns {{answer: object|null, errors: string[], toolError: boolean}}
 *   Every return path carries a boolean `toolError`.
 */
function parseJsonEventAnswer(stdout) {
  const trimmed = String(stdout ?? '').trim();
  if (!trimmed) return { answer: null, errors: ['missing_json'], toolError: false };
  const diagnostic = classifyJsonEventDiagnostic('json_event', trimmed);
  if (diagnostic) return { answer: null, errors: [diagnostic], toolError: true };

  const newestFirst = trimmed.split(/\r?\n/).filter(Boolean).reverse();
  let latestFailure = null;
  for (const line of newestFirst) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Non-JSON line: keep scanning earlier events before declaring the
      // stream invalid.
      continue;
    }
    if (!isRecord(event)) continue;

    const payloads = ['content', 'message', 'text', 'response']
      .map((key) => event[key])
      .filter((payload) => typeof payload === 'string');
    if (isRecord(event.part) && typeof event.part.text === 'string') {
      payloads.push(event.part.text);
    }
    for (const payload of payloads) {
      const candidate = parseAnswerForBaseline(payload);
      if (candidate.answer) return { ...candidate, toolError: false };
      if (latestFailure === null) latestFailure = { ...candidate, toolError: false };
    }

    const direct = validateStructuredAnswerObject(event);
    if (direct.answer) return { ...direct, toolError: false };
  }
  if (latestFailure !== null) return latestFailure;
  return { ...parseAnswerForBaseline(trimmed), toolError: false };
}
/**
 * Execute one baseline attempt (lane x task x repeat) and persist its
 * artifacts plus a run-manifest row.
 *
 * Flow: build the prompt; obtain stdout either from the fake executor or by
 * spawning the external CLI synchronously in the task checkout; parse it into
 * a structured answer (substituting a low-confidence placeholder when none is
 * valid); write prompt, lane card, setup/index record, raw trace, answer,
 * trajectory and score; append the manifest row.
 *
 * Status values set here: 'completed', 'invalid_schema', 'no_answer',
 * 'timeout', 'tool_error'. The manifest row reports 'judge_failed' when the
 * attempt completed but the evaluator result has that status.
 *
 * The raw trace records the requested model for every external executor. It
 * previously recorded 'fake-executor' for codex, gemini and opencode runs,
 * contradicting the manifest row written for the same attempt.
 *
 * @param {string} sessionRoot - Baseline session directory.
 * @param {object} fixtures - Validated fixtures; reads
 *   protocol.minimalRunnerBehavior.mustNotScript here.
 * @param {object} laneCard - Lane tool card for this attempt.
 * @param {object} task - Manifest task.
 * @param {number} repeatIndex - Repeat number of this attempt.
 * @param {string} executor - 'fake', 'claude', 'codex', 'gemini' or 'opencode'.
 * @param {string} model - Model name passed to the executor.
 * @param {number} timeoutMs - Timeout for the spawned CLI.
 * @param {string} fakeAnswerMode - Mode forwarded to the fake executor.
 * @param {object|null} [taskContext=null] - Materialized task context.
 * @param {object|null} [setupIndexOverride=null] - Measured setup/index record
 *   to store instead of the zero-duration default.
 * @throws {Error} When the executor name is not supported.
 */
function runOneBaselineAttempt(
  sessionRoot,
  fixtures,
  laneCard,
  task,
  repeatIndex,
  executor,
  model,
  timeoutMs,
  fakeAnswerMode,
  taskContext = null,
  setupIndexOverride = null
) {
  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeatIndex}-${executor}`);
  const paths = buildRunPaths(sessionRoot, runId);
  const startedAt = new Date().toISOString();
  const prompt = makePrompt(task, laneCard, taskContext);
  // Single source of truth for the model label in both trace and manifest.
  const recordedModel = executor === 'fake' ? 'fake-executor' : model;
  let stdout = '';
  let stderr = '';
  let answer;
  let parseErrors = [];
  let processMetadata = { exitStatus: null, signal: null, spawnError: null };
  let externalInvocation = null;
  let status = 'completed';
  if (executor === 'fake') {
    stdout = fakeStdoutForMode(fakeAnswerMode, task);
    const parsed = parseAnswerForBaseline(stdout);
    answer = parsed.answer;
    parseErrors = parsed.errors;
    if (!answer) status = stdout.trim() ? 'invalid_schema' : 'no_answer';
  } else if (['claude', 'codex', 'gemini', 'opencode'].includes(executor)) {
    externalInvocation = externalExecutorInvocation(executor, model, prompt, paths);
    const result = spawnSync(
      externalInvocation.command,
      [...externalInvocation.prefixArgs, ...externalInvocation.args],
      {
        input: externalInvocation.input,
        encoding: 'utf8',
        timeout: timeoutMs,
        cwd: taskContext?.repoCheckoutPath ?? undefined
      }
    );
    stdout = result.stdout ?? '';
    stderr = result.stderr ?? '';
    processMetadata = {
      exitStatus: typeof result.status === 'number' ? result.status : null,
      signal: result.signal ?? null,
      spawnError: result.error?.message ?? null
    };
    if (result.error && result.error.message.includes('ETIMEDOUT')) {
      status = 'timeout';
      stderr = `${stderr}\n${executor} invocation timed out after ${timeoutMs}ms`.trim();
    } else if (result.status !== 0 && !stdout.trim()) {
      // Non-zero exit with nothing on stdout: the CLI itself failed.
      status = 'tool_error';
    }
    const parsed = parseExternalAnswerForBaseline(
      executor,
      stdout,
      stderr,
      externalInvocation.answerPath
    );
    answer = parsed.answer;
    parseErrors = parsed.errors;
    if (parsed.toolError) status = 'tool_error';
    if (!answer && status === 'completed') status = stdout.trim() ? 'invalid_schema' : 'no_answer';
  } else {
    throw new Error('--baseline-run executor must be fake, claude, codex, gemini, or opencode');
  }
  if (!answer) {
    // Placeholder so downstream artifacts always have the answer shape.
    answer = {
      answer: null,
      confidence: 'low',
      evidence: [],
      filesReferenced: [],
      symbolsReferenced: [],
      unsupportedClaims: ['missing_or_invalid_structured_answer'],
      readyToEdit: false
    };
  }
  const trajectory = buildTrajectory(task, answer);
  const setupIndex = setupIndexOverride ?? {
    setupCommand: laneCard.setupCommand,
    indexCommand: laneCard.indexCommand,
    setupDurationMs: 0,
    indexDurationMs: 0,
    setupLogPath: paths.setupIndex,
    indexLogPath: paths.setupIndex,
    setupStatus: laneCard.setupCommand === 'none' ? 'not_required' : 'completed',
    indexStatus: laneCard.indexCommand === 'none' ? 'not_required' : 'completed'
  };
  const rawTrace = {
    executor,
    model: recordedModel,
    runnerHash: runnerSourceHash(),
    claimBearing: false,
    stdout,
    stderr,
    timeoutMs,
    workingDirectory: taskContext?.repoCheckoutPath ?? process.cwd(),
    taskContext: taskContext
      ? {
          materialized: taskContext.materialized,
          errors: taskContext.errors,
          repoCheckoutPath: taskContext.repoCheckoutPath,
          actualHead: taskContext.actualHead,
          statusShort: taskContext.statusShort,
          baseCommitVerified: taskContext.baseCommitVerified,
          remoteUrl: taskContext.remoteUrl,
          problemStatementHash: taskContext.problemStatementHash,
          problemStatementHashVerified: taskContext.problemStatementHashVerified,
          verificationStrict: taskContext.verificationStrict
        }
      : null,
    exitStatus: processMetadata.exitStatus,
    signal: processMetadata.signal,
    spawnError: processMetadata.spawnError,
    claudeDiagnostic: executor === 'claude' ? classifyClaudeCliDiagnostic(stdout, stderr) : null,
    executorDiagnostic:
      executor !== 'fake' ? classifyExternalCliDiagnostic(executor, stdout, stderr) : null,
    executorArgs: externalInvocation?.args ?? [],
    executorCommand: externalInvocation?.command ?? null,
    executorSchemaMode: externalInvocation?.schemaMode ?? null,
    executorOutputMode: externalInvocation?.outputMode ?? null,
    executorSchemaPath: externalInvocation?.schemaPath ?? null,
    executorAnswerPath: externalInvocation?.answerPath ?? null,
    toolCalls: [],
    laneIsolation: buildLaneIsolationEvidence(laneCard),
    claudeArgs:
      executor === 'claude' ? (externalInvocation?.args ?? claudeArgsForModel(model)) : [],
    claudeCommand:
      executor === 'claude' ? (externalInvocation?.command ?? claudeCommandParts().command) : null,
    structuredAnswerParseErrors: parseErrors,
    scriptedAgentDecisions: false,
    antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript
  };
  writeTextArtifact(paths.prompt, prompt);
  writeJson(paths.laneCard, laneCard);
  writeJson(paths.setupIndex, setupIndex);
  writeJson(paths.rawTrace, rawTrace);
  writeJson(paths.structuredAnswer, answer);
  writeJson(paths.trajectory, trajectory);
  // The evaluator runs after the answer and trajectory artifacts are on disk.
  const score = runOfficialEvaluatorForAttempt(fixtures, paths, task, executor, status);
  writeJson(paths.score, score);
  const completedAt = new Date().toISOString();
  appendRunManifestRow(
    sessionRoot,
    buildManifestRowForArtifacts({
      runId,
      fixtures,
      laneCard,
      task,
      repeatIndex,
      status: status === 'completed' && score.status === 'judge_failed' ? 'judge_failed' : status,
      startedAt,
      completedAt,
      paths,
      setupIndex,
      executor,
      model: recordedModel,
      scoring: {
        officialEvaluatorFirst: score.officialEvaluatorFirst,
        officialEvaluatorAttempted: score.officialEvaluatorAttempted,
        officialEvaluatorInvoked: score.officialEvaluatorInvoked,
        command: score.command,
        claimBearing: score.claimBearing,
        ...(score.fallbackReason ? { fallbackReason: score.fallbackReason } : {}),
        ...(score.stdoutPath ? { stdoutPath: score.stdoutPath } : {}),
        ...(score.stderrPath ? { stderrPath: score.stderrPath } : {})
      }
    })
  );
}
'fake-executor' : model, + runnerHash: runnerSourceHash(), + claimBearing: false, + status: 'task_setup_failed', + timeoutMs, + workingDirectory: process.cwd(), + taskContext: { + materialized: false, + errors: taskContext.errors, + repoCheckoutPath: taskContext.repoCheckoutPath, + actualHead: taskContext.actualHead, + statusShort: taskContext.statusShort, + baseCommitVerified: taskContext.baseCommitVerified, + remoteUrl: taskContext.remoteUrl, + problemStatementHash: taskContext.problemStatementHash, + problemStatementHashVerified: taskContext.problemStatementHashVerified, + verificationStrict: taskContext.verificationStrict + }, + stdout: '', + stderr: `task context materialization failed: ${taskContext.errors.join(', ')}`, + exitStatus: null, + signal: null, + spawnError: null, + structuredAnswerParseErrors: ['invalid_task_context'], + toolCalls: [], + laneIsolation: buildLaneIsolationEvidence(laneCard), + scriptedAgentDecisions: false, + antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript + }); + writeJson(paths.structuredAnswer, fallbackAnswer); + writeJson(paths.trajectory, buildTrajectory(task, fallbackAnswer)); + writeJson(paths.score, { + status: 'task_setup_failed', + mode: 'materialization_gate', + ...diagnosticFallbackScoring( + fixtures, + `invalid_task_context:${taskContext.errors.join(',')}` + ) + }); + appendRunManifestRow( + sessionRoot, + buildManifestRowForArtifacts({ + runId, + fixtures, + laneCard, + task, + repeatIndex, + status: 'task_setup_failed', + startedAt, + completedAt, + paths, + setupIndex, + executor, + model: executor === 'fake' ? 
/**
 * Resolve the setup/index record to attach to a baseline attempt.
 *
 * - If a measured record exists for the lane, it is returned only when both
 *   its setupStatus and indexStatus are 'completed' or 'not_required';
 *   otherwise the result is null.
 * - With no measured record, the 'raw-native' lane gets a default measurement
 *   that is written to the lane's measurement artifact and returned in
 *   manifest-row form.
 * - Any other lane without a measurement yields null.
 *
 * @param {string} sessionRoot - Baseline session directory.
 * @param {object} laneCard - Lane tool card; reads laneId.
 * @returns {object|null} The setup/index record, or null when the attempt
 *   must not proceed.
 */
function setupIndexForBaselineAttempt(sessionRoot, laneCard) {
  const isUsable = (stepStatus) => stepStatus === 'completed' || stepStatus === 'not_required';

  const measured = readMeasuredSetupIndex(sessionRoot, laneCard);
  if (measured) {
    const bothUsable = isUsable(measured.setupStatus) && isUsable(measured.indexStatus);
    return bothUsable ? measured : null;
  }

  if (laneCard.laneId !== 'raw-native') return null;

  const measurementPaths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
  const synthesized = defaultRawNativeSetupIndex(sessionRoot, laneCard);
  writeJson(measurementPaths.artifact, synthesized);
  return rowSetupIndexFromMeasurement(synthesized);
}
/**
 * Entry point for `--baseline-run`: iterate lane x task x repeat and record
 * one attempt for every combination not already present in the run manifest.
 *
 * Reads from `args`: session (required), allReadyLanes / lane, taskId,
 * repeats / repeat, maxAttempts, timeoutMs, executor, model, fakeAnswerMode,
 * taskPayloads.
 *
 * Each new attempt takes one of three paths: a "setup/index missing" record,
 * a "task setup failed" record, or a real executor run. All three count
 * toward `maxAttempts`. After the loops the session's artifact index is
 * refreshed and the session is written back.
 *
 * @param {object} args - Parsed CLI arguments.
 * @throws {Error} When --session is missing, the session snapshot file does
 *   not exist, or a lane / task id is unknown.
 */
function runBaseline(args) {
  if (!args.session) throw new Error('--baseline-run requires --session ');
  const sessionRoot = ensureBaselineSessionRoot(args.session);
  if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json')))
    throw new Error('baseline session snapshot missing');
  const fixtures = validateFixtures();
  // Lookup tables keyed by lane id / task instance id.
  const cardsByLane = new Map(fixtures.laneToolCards.cards.map((card) => [card.laneId, card]));
  const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task]));
  const evidenceByLane = new Map(
    fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record])
  );
  const repeats = args.repeats ?? args.repeat ?? 1;
  const taskPayloads = readTaskPayloads(args.taskPayloads);
  // Anything other than a positive integer means "no cap".
  const maxAttempts =
    Number.isInteger(args.maxAttempts) && args.maxAttempts > 0 ? args.maxAttempts : Infinity;
  // Fall back to the protocol's default budget (seconds converted to ms).
  const timeoutMs =
    Number.isInteger(args.timeoutMs) && args.timeoutMs > 0
      ? args.timeoutMs
      : fixtures.protocol.budgets.defaults.timeoutSeconds * 1000;
  const existing = existingRunKeys(sessionRoot);
  let attempted = 0;
  // Either every broad-claim lane whose evidence says it is ready, or the
  // single lane named on the command line.
  const lanes = args.allReadyLanes
    ? fixtures.lanes.broadClaimLaneSet.filter(
        (laneId) => evidenceByLane.get(laneId)?.readinessStatus === 'ready_for_phase40'
      )
    : [args.lane];
  const tasks = args.taskId
    ? [args.taskId]
    : fixtures.manifest.tasks.map((task) => task.instance_id);
  for (const laneId of lanes) {
    const laneCard = cardsByLane.get(laneId);
    if (!laneCard) throw new Error(`unknown lane: ${laneId}`);
    // Lanes whose setup evidence is in a blocked state are skipped silently.
    if (BLOCKED_LANE_SETUP_STATUSES.has(evidenceByLane.get(laneId)?.readinessStatus)) continue;
    for (const taskId of tasks) {
      const task = tasksById.get(taskId);
      if (!task) throw new Error(`unknown task-id: ${taskId}`);
      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
        // Resume support: combinations already in the manifest are not re-run
        // and do not count toward maxAttempts.
        if (existing.has(runKey(laneCard.laneId, task.instance_id, repeatIndex))) continue;
        if (attempted >= maxAttempts) break;
        const executor = args.executor ?? 'fake';
        const measuredSetupIndex = setupIndexForBaselineAttempt(sessionRoot, laneCard);
        if (!measuredSetupIndex) {
          // No usable setup/index measurement: record the failure instead of
          // running the executor.
          writeSetupIndexMissingAttempt(
            sessionRoot,
            fixtures,
            laneCard,
            task,
            repeatIndex,
            executor,
            args.model ?? 'unspecified',
            timeoutMs,
            `${laneCard.laneId} requires --setup-index-import before task execution`
          );
          attempted += 1;
          continue;
        }
        const taskContext = resolveTaskContext(task, taskPayloads, executor);
        // Only non-fake executors require a materialized task context.
        if (executor !== 'fake' && !taskContext.materialized) {
          writeTaskSetupFailedAttempt(
            sessionRoot,
            fixtures,
            laneCard,
            task,
            repeatIndex,
            executor,
            args.model ?? 'unspecified',
            timeoutMs,
            taskContext
          );
          attempted += 1;
          continue;
        }
        runOneBaselineAttempt(
          sessionRoot,
          fixtures,
          laneCard,
          task,
          repeatIndex,
          executor,
          args.model ?? 'unspecified',
          timeoutMs,
          args.fakeAnswerMode ?? 'valid',
          taskContext,
          measuredSetupIndex
        );
        attempted += 1;
      }
      // Propagate the attempt cap out of the nested loops.
      if (attempted >= maxAttempts) break;
    }
    if (attempted >= maxAttempts) break;
  }
  const session = readSession(sessionRoot);
  session.artifactIndex = refreshArtifactIndex(sessionRoot);
  writeSession(sessionRoot, session);
  console.log(
    `baseline run updated ${join(sessionRoot, 'run-manifest.jsonl')} (${attempted} new attempts)`
  );
}
+ ].join('\n'); + let stdout = ''; + let stderr = ''; + let answer; + let parseErrors = []; + let processMetadata = { exitStatus: null, signal: null, spawnError: null }; + let status = 'completed'; + if (executor !== 'fake' && taskContext && !taskContext.materialized) { + status = 'task_setup_failed'; + stderr = `task context materialization failed: ${taskContext.errors.join(', ')}`; + parseErrors = ['invalid_task_context']; + } else if (executor === 'fake') { + stdout = fakeStdoutForMode(fakeAnswerMode, task); + const parsed = parseAnswerForBaseline(stdout); + answer = parsed.answer; + parseErrors = parsed.errors; + if (!answer) status = stdout.trim() ? 'invalid_schema' : 'no_answer'; + } else if (executor === 'claude') { + const claudeArgs = claudeArgsForModel(model); + const claudeCommand = claudeCommandParts(); + const result = spawnSync(claudeCommand.command, [...claudeCommand.prefixArgs, ...claudeArgs], { + input: prompt, + encoding: 'utf8', + timeout: timeoutMs, + cwd: taskContext?.repoCheckoutPath ?? undefined + }); + stdout = result.stdout ?? ''; + stderr = result.stderr ?? ''; + processMetadata = { + exitStatus: typeof result.status === 'number' ? result.status : null, + signal: result.signal ?? null, + spawnError: result.error?.message ?? null + }; + if (result.error && result.error.message.includes('ETIMEDOUT')) { + status = 'timeout'; + stderr = `${stderr}\nClaude invocation timed out after ${timeoutMs}ms`.trim(); + } else if (result.status !== 0 && !stdout.trim()) { + status = 'tool_error'; + } + const parsed = parseClaudeAnswerForBaseline(stdout, stderr); + answer = parsed.answer; + parseErrors = parsed.errors; + if (parsed.toolError) status = 'tool_error'; + if (!answer && status === 'completed') status = stdout.trim() ? 
'invalid_schema' : 'no_answer'; + } else { + throw new Error('--baseline-run-codebase-context-arms executor must be fake or claude'); + } + if (!answer) { + answer = { + answer: null, + confidence: 'low', + evidence: [], + filesReferenced: [], + symbolsReferenced: [], + unsupportedClaims: ['missing_or_invalid_structured_answer'], + readyToEdit: false + }; + } + const setupIndex = { + setupCommand: arm.setupCommand, + indexCommand: laneCard.indexCommand, + setupDurationMs: 0, + indexDurationMs: 0, + setupLogPath: paths.setupIndex, + indexLogPath: paths.setupIndex, + setupStatus: 'completed', + indexStatus: 'completed' + }; + const trajectory = buildTrajectory(task, answer); + const score = { + status: status === 'completed' ? 'judge_failed' : status, + mode: 'diagnostic_fallback', + ...diagnosticFallbackScoring( + fixtures, + executor === 'fake' + ? 'fake_executor_diagnostic_arm_smoke_only' + : 'official_evaluator_not_invoked_by_runner_smoke', + { baselineArmId: arm.baselineArmId } + ) + }; + writeTextArtifact(paths.prompt, prompt); + writeJson(paths.laneCard, { ...laneCard, diagnosticBaselineArm: arm }); + writeJson(paths.setupIndex, { ...setupIndex, diagnosticBaselineArm: arm }); + writeJson(paths.rawTrace, { + executor, + model: executor === 'claude' ? model : 'fake-executor', + runnerHash: runnerSourceHash(), + claimBearing: false, + baselineArmId: arm.baselineArmId, + stdout, + stderr, + timeoutMs, + workingDirectory: taskContext?.repoCheckoutPath ?? process.cwd(), + taskContext: taskContext + ? 
{ + materialized: taskContext.materialized, + errors: taskContext.errors, + repoCheckoutPath: taskContext.repoCheckoutPath, + actualHead: taskContext.actualHead, + statusShort: taskContext.statusShort, + baseCommitVerified: taskContext.baseCommitVerified, + remoteUrl: taskContext.remoteUrl, + problemStatementHash: taskContext.problemStatementHash, + problemStatementHashVerified: taskContext.problemStatementHashVerified, + verificationStrict: taskContext.verificationStrict + } + : null, + exitStatus: processMetadata.exitStatus, + signal: processMetadata.signal, + spawnError: processMetadata.spawnError, + claudeDiagnostic: executor === 'claude' ? classifyClaudeCliDiagnostic(stdout, stderr) : null, + toolCalls: [], + laneIsolation: buildLaneIsolationEvidence(laneCard), + claudeArgs: executor === 'claude' ? claudeArgsForModel(model) : [], + claudeCommand: executor === 'claude' ? claudeCommandParts().command : null, + structuredAnswerParseErrors: parseErrors, + scriptedAgentDecisions: false, + antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript + }); + writeJson(paths.structuredAnswer, answer); + writeJson(paths.trajectory, trajectory); + writeJson(paths.score, score); + const completedAt = new Date().toISOString(); + appendRunManifestRow( + sessionRoot, + buildManifestRowForArtifacts({ + runId, + fixtures, + laneCard, + task, + repeatIndex, + status, + startedAt, + completedAt, + paths, + setupIndex, + executor, + model: executor === 'fake' ? 
/**
 * Runs the diagnostic codebase-context baseline arms for a baseline session.
 *
 * For every arm in the arms fixture, every selected task and every repeat
 * index, one attempt is executed through runOneCodebaseContextArmAttempt
 * unless the session already holds a run with the same key. Afterwards the
 * session's artifact index is refreshed and the session is written back.
 *
 * @param {object} args Parsed CLI arguments. Reads `session` (required),
 *   `taskId`, `repeats`/`repeat`, `maxAttempts`, `timeoutMs`, `executor`,
 *   `model`, `fakeAnswerMode` and `taskPayloads`.
 * @throws {Error} When `session` is missing, the session snapshot file does
 *   not exist, the codebase-context lane card is absent, or `taskId` matches
 *   no task in the manifest.
 */
function runBaselineCodebaseContextArms(args) {
  if (!args.session)
    throw new Error('--baseline-run-codebase-context-arms requires --session ');
  const sessionRoot = ensureBaselineSessionRoot(args.session);
  if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json')))
    throw new Error('baseline session snapshot missing');
  validateBaselineArms(FIXTURES.codebaseContextBaselineArms);
  const fixtures = validateFixtures();
  const arms = readJson(FIXTURES.codebaseContextBaselineArms).arms ?? [];
  const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === 'codebase-context');
  if (!laneCard) throw new Error('codebase-context lane card missing');
  const tasks = args.taskId
    ? fixtures.manifest.tasks.filter((task) => task.instance_id === args.taskId)
    : fixtures.manifest.tasks;
  if (args.taskId && tasks.length === 0) throw new Error(`unknown task-id: ${args.taskId}`);
  const repeats = args.repeats ?? args.repeat ?? 1;
  // A missing or non-positive limit means "no cap on new attempts".
  const maxAttempts =
    Number.isInteger(args.maxAttempts) && args.maxAttempts > 0 ? args.maxAttempts : Infinity;
  const timeoutMs =
    Number.isInteger(args.timeoutMs) && args.timeoutMs > 0
      ? args.timeoutMs
      : fixtures.protocol.budgets.defaults.timeoutSeconds * 1000;
  const existing = existingRunKeys(sessionRoot);
  const taskPayloads = readTaskPayloads(args.taskPayloads);
  // Per-invocation settings do not change between attempts, so resolve them once.
  const executor = args.executor ?? 'fake';
  const model = args.model ?? 'unspecified';
  const fakeAnswerMode = args.fakeAnswerMode ?? 'valid';
  let attempted = 0;
  for (const arm of arms) {
    for (const task of tasks) {
      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
        // Runs already recorded in the session do not count against the cap.
        if (
          existing.has(runKey('codebase-context', task.instance_id, repeatIndex, arm.baselineArmId))
        )
          continue;
        if (attempted >= maxAttempts) break;
        const taskContext = resolveTaskContext(task, taskPayloads, executor);
        runOneCodebaseContextArmAttempt(
          sessionRoot,
          fixtures,
          laneCard,
          task,
          arm,
          repeatIndex,
          executor,
          model,
          timeoutMs,
          fakeAnswerMode,
          taskContext
        );
        attempted += 1;
      }
      if (attempted >= maxAttempts) break;
    }
    if (attempted >= maxAttempts) break;
  }
  const session = readSession(sessionRoot);
  session.artifactIndex = refreshArtifactIndex(sessionRoot);
  writeSession(sessionRoot, session);
  console.log(
    `baseline codebase-context diagnostic arms updated ${join(sessionRoot, 'run-manifest.jsonl')} (${attempted} new attempts)`
  );
}

/**
 * Reads `run-manifest.jsonl` from a session directory.
 *
 * Blank lines (including a trailing newline or CRLF residue) are skipped so a
 * hand-edited or partially appended manifest does not abort the whole read.
 *
 * @param {string} sessionRoot Directory that may contain `run-manifest.jsonl`.
 * @returns {object[]} One parsed object per non-blank line; `[]` when the file
 *   is missing or empty.
 * @throws {SyntaxError} When a non-blank line is not valid JSON. The message
 *   names the file and 1-based line number; the parser error is the `cause`.
 */
function readManifestRowsIfPresent(sessionRoot) {
  const manifestPath = join(sessionRoot, 'run-manifest.jsonl');
  if (!existsSync(manifestPath)) return [];
  const rows = [];
  const lines = readFileSync(manifestPath, 'utf8').split('\n');
  for (const [index, rawLine] of lines.entries()) {
    const line = rawLine.trim();
    if (!line) continue;
    try {
      rows.push(JSON.parse(line));
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`${manifestPath}: invalid JSON on line ${index + 1}: ${reason}`, {
        cause: error
      });
    }
  }
  return rows;
}

/**
 * Checks the artifact paths and scoring flags of manifest rows, appending a
 * human-readable message to `errors` for every problem found.
 *
 * @param {string} sessionRoot Session directory every artifact must live in.
 * @param {object[]} rows Manifest rows to check.
 * @param {string[]} errors Accumulator that receives the messages (mutated).
 */
function validateSessionPaths(sessionRoot, rows, errors) {
  const artifactPathKeys = [
    'raw_trace_path',
    'structured_answer_path',
    'trajectory_path',
    'score_path',
    'setup_index_path',
    'prompt_path',
    'lane_tool_card_path'
  ];
  for (const row of rows) {
    for (const key of artifactPathKeys) {
      const value = row[key];
      // isAbsolute() throws on non-strings; report those as invalid paths instead.
      if (typeof value !== 'string' || !value || !isAbsolute(value))
        errors.push(`row ${row.run_id} ${key} must be absolute`);
      else if (!isPathInside(sessionRoot, value))
        errors.push(`row ${row.run_id} ${key} is outside session root`);
      else if (!existsSync(value)) errors.push(`row ${row.run_id} ${key} missing artifact`);
    }
    if (row.setupIndex && 'taskWallTimeMs' in row.setupIndex)
      errors.push(`row ${row.run_id} mixes task time into setupIndex`);
    if (row.scoring?.claimBearing !== false)
      errors.push(
        `row ${row.run_id} scoring must be non-claim-bearing while protocol claimAllowed=false`
      );
  }
}

/**
 * Builds the lane/task/repeat identity key of a manifest row.
 *
 * @param {{lane_id: string, task_id: string, repeat_index: number}} row
 * @returns {string} The three fields joined with NUL separators.
 */
function phase42RowKey(row) {
  return `${row.lane_id}\u0000${row.task_id}\u0000${row.repeat_index}`;
}
/**
 * Enumerates every lane/task/repeat key the claim-bearing denominator requires.
 *
 * @param {object} fixtures Validated fixtures; reads `lanes.broadClaimLaneSet`,
 *   `manifest.tasks` and `protocol.runPolicy.claimBearingRunsPerTaskLane`
 *   (3 when unset).
 * @returns {Set<string>} NUL-separated `lane\0task\0repeat` keys.
 */
function phase42ExpectedKeys(fixtures) {
  const keys = new Set();
  const repeats = fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3;
  for (const laneId of fixtures.lanes.broadClaimLaneSet) {
    for (const task of fixtures.manifest.tasks) {
      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
        keys.add(`${laneId}\u0000${task.instance_id}\u0000${repeatIndex}`);
      }
    }
  }
  return keys;
}

/**
 * Derives the per-lane isolation policy from the lane tool cards.
 *
 * @param {object} fixtures Validated fixtures; reads `laneToolCards.cards`.
 * @returns {Record<string, object>} Policy objects keyed by lane id. A card
 *   without context tools falls back to its lane id as the expected tool, and
 *   missing tool lists become empty arrays. Only the `raw-native` lane gets
 *   `allowMultipleObservedTools`.
 */
function phase42LanePolicies(fixtures) {
  return Object.fromEntries(
    fixtures.laneToolCards.cards.map((card) => [
      card.laneId,
      {
        laneId: card.laneId,
        // `?.` keeps the lane-id fallback reachable when contextTools is absent.
        expectedContextTool: card.contextTools?.[0] ?? card.laneId,
        allowedTools: card.allowedTools ?? [],
        disallowedTools: card.disallowedTools ?? [],
        ...(card.laneId === 'raw-native' ? { allowMultipleObservedTools: true } : {})
      }
    ])
  );
}

/**
 * Reads a JSON artifact, recording a read error instead of throwing.
 *
 * @param {string|undefined} filePath Artifact path from the manifest row.
 * @param {object[]} readErrors Accumulator for `{ runId, path, reason }` (mutated).
 * @param {string} runId Run the artifact belongs to.
 * @param {string} label Artifact label used as the reason prefix.
 * @returns {*} The parsed JSON, or `null` when missing or unparsable.
 */
function phase42ReadJsonArtifact(filePath, readErrors, runId, label) {
  if (!filePath || !existsSync(filePath)) {
    readErrors.push({ runId, path: filePath ?? '', reason: `${label}_missing` });
    return null;
  }
  try {
    return readJson(filePath);
  } catch (error) {
    readErrors.push({
      runId,
      path: filePath,
      reason: `${label}_invalid_json:${error instanceof Error ? error.message : String(error)}`
    });
    return null;
  }
}

/**
 * Hashes an artifact and stores the hash in `artifactHashesByPath`.
 *
 * Hashing failures (for example when the path exists but is not a readable
 * file) are recorded as read errors, matching phase42ReadJsonArtifact, so one
 * bad artifact does not abort the whole verification.
 *
 * @param {string|undefined} filePath Artifact path.
 * @param {Record<string, string>} artifactHashesByPath Hash index (mutated).
 * @param {object[]} readErrors Accumulator for `{ runId, path, reason }` (mutated).
 * @param {string} runId Run the artifact belongs to.
 * @param {string} label Artifact label used as the reason prefix.
 * @returns {string|null} The hash, or `null` when missing or unreadable.
 */
function phase42HashArtifact(filePath, artifactHashesByPath, readErrors, runId, label) {
  if (!filePath || !existsSync(filePath)) {
    readErrors.push({ runId, path: filePath ?? '', reason: `${label}_missing` });
    return null;
  }
  let hash;
  try {
    hash = hashFile(filePath);
  } catch (error) {
    readErrors.push({
      runId,
      path: filePath,
      reason: `${label}_unreadable:${error instanceof Error ? error.message : String(error)}`
    });
    return null;
  }
  artifactHashesByPath[filePath] = hash;
  return hash;
}
/**
 * Hashes every artifact referenced by a manifest row and compares each hash
 * with the one recorded in `row.hashes`.
 *
 * @param {object} row Manifest row (reads the `*_path` fields, `hashes`, `run_id`).
 * @param {object|null|undefined} score Parsed score artifact; its optional
 *   `outputPath`, `stdoutPath` and `stderrPath` are hashed too (no comparison).
 * @param {Record<string, string>} artifactHashesByPath Hash index (mutated).
 * @param {object[]} readErrors Accumulator for unreadable artifacts (mutated).
 * @param {object[]} integrityErrors Accumulator for missing or mismatching
 *   manifest hashes (mutated).
 */
function phase42CollectArtifactHashes(row, score, artifactHashesByPath, readErrors, integrityErrors) {
  // [reason label, artifact path, key of the recorded hash inside row.hashes]
  const manifestArtifacts = [
    ['raw_trace', row.raw_trace_path, 'rawTrace'],
    ['structured_answer', row.structured_answer_path, 'structuredAnswer'],
    ['trajectory', row.trajectory_path, 'trajectory'],
    ['score', row.score_path, 'score'],
    ['setup_index', row.setup_index_path, 'setupIndex'],
    ['prompt', row.prompt_path, 'prompt'],
    ['lane_tool_card', row.lane_tool_card_path, 'laneToolCard']
  ];
  for (const [label, filePath, hashKey] of manifestArtifacts) {
    const actualHash = phase42HashArtifact(
      filePath,
      artifactHashesByPath,
      readErrors,
      row.run_id,
      label
    );
    // Unreadable artifacts were already reported through readErrors.
    if (!actualHash) continue;
    const expectedHash = row.hashes?.[hashKey];
    if (!expectedHash) {
      integrityErrors.push({
        runId: row.run_id,
        path: filePath,
        reason: `${label}_manifest_hash_missing`,
        expectedHash: null,
        actualHash
      });
    } else if (actualHash !== expectedHash) {
      integrityErrors.push({
        runId: row.run_id,
        path: filePath,
        reason: `${label}_hash_mismatch`,
        expectedHash,
        actualHash
      });
    }
  }
  const evaluatorOutputs = [
    ['official_output', score?.outputPath],
    ['official_stdout', score?.stdoutPath],
    ['official_stderr', score?.stderrPath]
  ];
  for (const [label, filePath] of evaluatorOutputs) {
    if (filePath) phase42HashArtifact(filePath, artifactHashesByPath, readErrors, row.run_id, label);
  }
}

/**
 * Loads the raw trace, score and setup/index artifacts of a manifest row and
 * projects them onto the fields the evidence gate inspects.
 *
 * @param {object} row Manifest row (reads `raw_trace_path`, `score_path`,
 *   `setup_index_path`, `run_id`).
 * @param {object[]} readErrors Accumulator for unreadable artifacts (mutated).
 * @returns {object} `{ rawTrace, score, setupIndex, laneIsolation, rawScore }`.
 *   Each projection is `undefined` when its artifact could not be read;
 *   `rawScore` is the unprojected score artifact (or `null`).
 */
function phase42ArtifactsForRow(row, readErrors) {
  const rawTrace = phase42ReadJsonArtifact(row.raw_trace_path, readErrors, row.run_id, 'raw_trace');
  const score = phase42ReadJsonArtifact(row.score_path, readErrors, row.run_id, 'score');
  const setupIndex = phase42ReadJsonArtifact(
    row.setup_index_path,
    readErrors,
    row.run_id,
    'setup_index'
  );
  return {
    rawTrace: rawTrace
      ? {
          executor: rawTrace.executor,
          model: rawTrace.model,
          runnerHash: rawTrace.runnerHash
        }
      : undefined,
    score: score
      ? {
          status: score.status,
          mode: score.mode,
          claimBearing: score.claimBearing,
          officialEvaluatorInvoked: score.officialEvaluatorInvoked,
          command: score.command,
          exitCode: score.exitCode,
          outputPath: score.outputPath,
          outputHash: score.outputHash,
          stdoutPath: score.stdoutPath,
          stderrPath: score.stderrPath,
          stdoutHash: score.stdoutHash,
          stderrHash: score.stderrHash
        }
      : undefined,
    setupIndex: setupIndex
      ? {
          setupStatus: setupIndex.setupStatus,
          indexStatus: setupIndex.indexStatus,
          setupDurationMs: setupIndex.setupDurationMs,
          indexDurationMs: setupIndex.indexDurationMs,
          setupLogPath: setupIndex.setupLogPath,
          indexLogPath: setupIndex.indexLogPath
        }
      : undefined,
    laneIsolation: rawTrace?.laneIsolation
      ? {
          laneId: rawTrace.laneIsolation.laneId,
          proven: rawTrace.laneIsolation.proven,
          sourceKind: rawTrace.laneIsolation.sourceKind,
          expectedContextTool: rawTrace.laneIsolation.expectedContextTool,
          allowedTools: rawTrace.laneIsolation.allowedTools ?? [],
          observedTools: rawTrace.laneIsolation.observedTools ?? [],
          violations: rawTrace.laneIsolation.violations ?? []
        }
      : undefined,
    rawScore: score
  };
}
/**
 * Gathers everything the Phase 42 evidence gate needs from a session directory.
 *
 * Manifest rows are split into three groups: rows carrying a
 * `scoring.baselineArmId` (supplemental), rows whose lane/task/repeat key is
 * part of the required denominator (required), and everything else
 * (unexpected). Only required rows have their artifacts loaded and hashed.
 *
 * @param {string} sessionRoot Baseline session directory.
 * @param {object} fixtures Validated fixtures (protocol, lanes, manifest, lane cards).
 * @returns {object} `{ session, reservations, requiredRows, supplementalRows,
 *   unexpectedRows, readErrors, integrityErrors, gateInput }`, where
 *   `gateInput` is the argument for evaluatePhase42EvidenceGate.
 */
function loadPhase42SessionEvidence(sessionRoot, fixtures) {
  const sessionFile = join(sessionRoot, 'BASELINE-SESSION.json');
  const reservationFile = join(sessionRoot, 'slot-reservations.json');
  const session = existsSync(sessionFile) ? readJson(sessionFile) : null;
  const reservations = existsSync(reservationFile)
    ? (readJson(reservationFile).reservations ?? [])
    : [];

  const manifestRows = readManifestRowsIfPresent(sessionRoot);
  const denominatorKeys = phase42ExpectedKeys(fixtures);
  const buckets = { required: [], supplemental: [], unexpected: [] };
  for (const row of manifestRows) {
    let bucket;
    if (row.scoring?.baselineArmId) bucket = 'supplemental';
    else if (denominatorKeys.has(phase42RowKey(row))) bucket = 'required';
    else bucket = 'unexpected';
    buckets[bucket].push(row);
  }
  const {
    required: requiredRows,
    supplemental: supplementalRows,
    unexpected: unexpectedRows
  } = buckets;

  const readErrors = [];
  const integrityErrors = [];
  const artifactHashesByPath = {};
  const artifactsByRunId = {};
  for (const row of requiredRows) {
    // rawScore is only needed for hashing; it is not part of the gate evidence.
    const { rawScore, ...gateArtifacts } = phase42ArtifactsForRow(row, readErrors);
    phase42CollectArtifactHashes(row, rawScore, artifactHashesByPath, readErrors, integrityErrors);
    artifactsByRunId[row.run_id] = gateArtifacts;
  }

  // The expected runner hash is only defined when all required rows agree on one.
  const distinctRunnerHashes = new Set();
  for (const row of requiredRows) {
    const candidate = row.hashes?.runnerSourceHash;
    if (typeof candidate === 'string' && candidate.length > 0) distinctRunnerHashes.add(candidate);
  }
  const expectedRunnerHash =
    distinctRunnerHashes.size === 1 ? [...distinctRunnerHashes][0] : undefined;

  const repeats = fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3;
  const laneIds = fixtures.lanes.broadClaimLaneSet;
  const gateInput = {
    evidenceMode: 'artifact_verified',
    protocol: {
      claimAllowed: fixtures.protocol.claimAllowed,
      benchmarkTarget: {
        officialEvaluatorFirst: fixtures.protocol.benchmarkTarget.officialEvaluatorFirst
      }
    },
    requiredLaneIds: laneIds,
    requiredTaskIds: fixtures.manifest.tasks.map((task) => task.instance_id),
    requiredRepeats: repeats,
    expectedTotalRows: laneIds.length * fixtures.manifest.tasks.length * repeats,
    expectedProtocolHash: hashObject(fixtures.protocol),
    expectedTaskManifestHash: fixtures.manifest.manifest_hash,
    lanePoliciesById: phase42LanePolicies(fixtures),
    rows: requiredRows,
    artifactsByRunId,
    artifactHashesByPath,
    expectedRunnerHash,
    currentRunnerHash: runnerSourceHash()
  };
  return {
    session,
    reservations,
    requiredRows,
    supplementalRows,
    unexpectedRows,
    readErrors,
    integrityErrors,
    gateInput
  };
}
/**
 * Tells whether a row's setup/index cost is backed by measured evidence.
 *
 * The evidence must carry finite numeric durations, a status of `completed` or
 * `not_required` for both phases, both log paths, and a strictly positive
 * duration for every `completed` phase. The manifest row's `setupIndex` must
 * then repeat exactly the same six values.
 *
 * @param {object} row Manifest row; reads `row.setupIndex`.
 * @param {object|undefined} evidence Projection of the setup-index artifact.
 * @returns {boolean} `true` only when every check holds. A row without
 *   `setupIndex` yields `false` rather than throwing.
 */
function phase42HasMeasuredSetupIndex(row, evidence) {
  const recorded = row?.setupIndex;
  if (!evidence || !recorded) return false;
  const { setupDurationMs, indexDurationMs, setupStatus, indexStatus } = evidence;
  if (typeof setupDurationMs !== 'number' || typeof indexDurationMs !== 'number') return false;
  if (!Number.isFinite(setupDurationMs) || !Number.isFinite(indexDurationMs)) return false;
  if (!setupStatus || !indexStatus) return false;
  if (!evidence.setupLogPath || !evidence.indexLogPath) return false;
  const acceptedStatuses = ['completed', 'not_required'];
  if (!acceptedStatuses.includes(setupStatus)) return false;
  if (!acceptedStatuses.includes(indexStatus)) return false;
  // A phase that actually ran must have taken measurable time.
  if (setupStatus === 'completed' && setupDurationMs <= 0) return false;
  if (indexStatus === 'completed' && indexDurationMs <= 0) return false;
  const mirroredFields = [
    'setupStatus',
    'indexStatus',
    'setupDurationMs',
    'indexDurationMs',
    'setupLogPath',
    'indexLogPath'
  ];
  return mirroredFields.every((field) => recorded[field] === evidence[field]);
}
/**
 * Tells whether a value is a `sha256:`-prefixed lowercase hex digest.
 *
 * @param {*} value Candidate hash.
 * @returns {boolean} `true` for a string of the form `sha256:<64 hex chars>`.
 */
function phase42HasSha256Hash(value) {
  return typeof value === 'string' && /^sha256:[a-f0-9]{64}$/.test(value);
}

/**
 * Checks that a path/hash pair is well formed and that the hash computed from
 * disk for that path equals the recorded one.
 *
 * @param {*} filePath Recorded artifact path.
 * @param {*} recordedHash Hash recorded next to the path.
 * @param {Record<string, string>} hashesByPath Hashes computed from disk.
 * @returns {boolean}
 */
function phase42HasRecordedArtifact(filePath, recordedHash, hashesByPath) {
  return (
    typeof filePath === 'string' &&
    filePath.length > 0 &&
    phase42HasSha256Hash(recordedHash) &&
    hashesByPath[filePath] === recordedHash
  );
}

/**
 * Tells whether a row is backed by a completed official-evaluator run.
 *
 * Both the manifest row and the score artifact must declare a claim-bearing
 * official evaluation, the evaluator command must mention
 * `contextbench.evaluate` and have exited with 0, and the evaluator's output,
 * stdout and stderr files, as well as the score file itself, must have
 * verified hashes.
 *
 * @param {object} row Manifest row; reads `scoring` and `score_path`.
 * @param {object|undefined} score Projection of the score artifact.
 * @param {Record<string, string>} artifactHashesByPath Hashes computed from disk.
 * @returns {boolean} `false` (never a throw) when `row.scoring` or `score` is absent.
 */
function phase42HasOfficialEvaluatorProof(row, score, artifactHashesByPath) {
  const scoring = row?.scoring;
  if (!scoring || !score) return false;
  const hashesByPath = artifactHashesByPath ?? {};
  const manifestDeclaresOfficialRun =
    scoring.officialEvaluatorFirst === true &&
    scoring.officialEvaluatorAttempted === true &&
    scoring.officialEvaluatorInvoked === true &&
    scoring.claimBearing === true;
  const scoreDeclaresOfficialRun =
    score.officialEvaluatorInvoked === true &&
    score.claimBearing === true &&
    score.mode === 'official_evaluator' &&
    score.status === 'completed' &&
    score.exitCode === 0 &&
    typeof score.command === 'string' &&
    score.command.includes('contextbench.evaluate');
  return (
    manifestDeclaresOfficialRun &&
    scoreDeclaresOfficialRun &&
    phase42HasRecordedArtifact(score.outputPath, score.outputHash, hashesByPath) &&
    phase42HasSha256Hash(hashesByPath[row.score_path]) &&
    phase42HasRecordedArtifact(score.stdoutPath, score.stdoutHash, hashesByPath) &&
    phase42HasRecordedArtifact(score.stderrPath, score.stderrHash, hashesByPath)
  );
}

/**
 * Tells whether a row was scored through the diagnostic fallback path.
 *
 * @param {object} row Manifest row; reads `scoring.claimBearing` and
 *   `scoring.fallbackReason`.
 * @param {object|undefined} score Projection of the score artifact.
 * @returns {boolean}
 */
function phase42HasDiagnosticFallback(row, score) {
  const scoring = row?.scoring;
  return (
    scoring?.claimBearing === false ||
    Boolean(scoring?.fallbackReason) ||
    score?.mode === 'diagnostic_fallback'
  );
}

/**
 * Tells whether the observed tool usage proves that a run stayed inside its lane.
 *
 * @param {object} row Manifest row; reads `lane_id`.
 * @param {object|undefined} isolation Lane-isolation evidence from the raw trace.
 * @param {object|undefined} policy Lane policy built by phase42LanePolicies.
 * @returns {boolean} `true` only when the evidence is marked proven, comes from
 *   an accepted source, matches the lane and its expected context tool, lists
 *   no violations, and the observed tools satisfy the policy. Missing tool
 *   lists are treated as empty.
 */
function phase42HasLaneIsolationProof(row, isolation, policy) {
  if (!isolation?.proven || !policy) return false;
  if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind))
    return false;
  if (policy.laneId !== row.lane_id || isolation.laneId !== row.lane_id) return false;
  if (isolation.expectedContextTool !== policy.expectedContextTool) return false;
  const claimedTools = isolation.allowedTools ?? [];
  const observedTools = isolation.observedTools ?? [];
  const permittedTools = policy.allowedTools ?? [];
  const forbiddenTools = policy.disallowedTools ?? [];
  if (claimedTools.length === 0 || observedTools.length === 0) return false;
  if (isolation.violations && isolation.violations.length > 0) return false;
  if (forbiddenTools.some((tool) => observedTools.includes(tool))) return false;
  // The run may not claim a wider tool set than the lane card grants.
  if (claimedTools.some((tool) => !permittedTools.includes(tool))) return false;
  if (policy.allowMultipleObservedTools) {
    return observedTools.every((tool) => permittedTools.includes(tool));
  }
  if (!claimedTools.includes(policy.expectedContextTool)) return false;
  return observedTools.length === 1 && observedTools[0] === policy.expectedContextTool;
}
/**
 * Tells whether the raw trace of a run agrees with its manifest row about the
 * executor, the model and the runner source hash.
 *
 * @param {object} row Manifest row; reads `taskExecution` and `hashes`.
 * @param {object|undefined} rawTrace Projection of the raw trace artifact.
 * @param {string|undefined} expectedRunnerHash Runner hash shared by the session.
 * @returns {boolean} `false` (never a throw) when any piece is missing.
 */
function phase42HasRunnerProvenance(row, rawTrace, expectedRunnerHash) {
  if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash)
    return false;
  return (
    rawTrace.executor === row.taskExecution?.executor &&
    rawTrace.model === row.taskExecution?.model &&
    rawTrace.runnerHash === expectedRunnerHash &&
    row.hashes?.runnerSourceHash === expectedRunnerHash
  );
}

/**
 * Builds a failure record tied to one manifest row.
 *
 * @param {object} row Manifest row the failure refers to.
 * @param {string} code Machine-readable failure code.
 * @param {string} message Human-readable explanation.
 * @returns {{code: string, runId: *, laneId: *, taskId: *, repeatIndex: *, message: string}}
 */
function phase42Failure(row, code, message) {
  return {
    code,
    runId: row.run_id,
    laneId: row.lane_id,
    taskId: row.task_id,
    repeatIndex: row.repeat_index,
    message
  };
}

/**
 * Evaluates the Phase 42 evidence gate over a prepared gate input.
 *
 * Checks run in a fixed order and every violation is appended to `failures`:
 * session-level preconditions (evidence mode, protocol claim flag, denominator
 * contract and row count), per-row hash and denominator membership, duplicate
 * rows, session-level runner hash agreement, and finally the per-slot evidence
 * checks for every required lane/task/repeat combination.
 *
 * @param {object} input Gate input as produced by loadPhase42SessionEvidence.
 * @returns {{shapePass: boolean, claimPass: boolean, diagnosticOnly: boolean, failures: object[]}}
 *   `claimPass` requires zero failures; `shapePass` ignores the
 *   `artifact_verification_missing` failure.
 */
function evaluatePhase42EvidenceGate(input) {
  const failures = [];
  const expectedKeys = new Set();
  if (input.evidenceMode !== 'artifact_verified') {
    failures.push({
      code: 'artifact_verification_missing',
      message: 'Synthetic shape evidence cannot produce claim-bearing benchmark pass.'
    });
  }
  if (!input.protocol.claimAllowed) {
    failures.push({
      code: 'protocol_claims_disabled',
      message: 'The protocol does not currently allow claim-bearing benchmark results.'
    });
  }
  if (
    input.expectedTotalRows <= 0 ||
    input.requiredLaneIds.length === 0 ||
    input.requiredTaskIds.length === 0
  ) {
    failures.push({
      code: 'denominator_contract_missing',
      message: 'Claim validation requires a frozen denominator contract.'
    });
  }
  if (input.rows.length !== input.expectedTotalRows) {
    failures.push({
      code: 'denominator_count_mismatch',
      message: 'Run row count does not match the frozen expected denominator count.'
    });
  }
  for (const laneId of input.requiredLaneIds) {
    for (const taskId of input.requiredTaskIds) {
      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) {
        expectedKeys.add(`${laneId}\u0000${taskId}\u0000${repeatIndex}`);
      }
    }
  }
  const rowCounts = new Map();
  for (const row of input.rows) {
    const key = phase42RowKey(row);
    rowCounts.set(key, (rowCounts.get(key) ?? 0) + 1);
    if (!expectedKeys.has(key)) {
      failures.push(
        phase42Failure(
          row,
          'unexpected_run_row',
          'Rows outside the required denominator must not be hidden from claim validation.'
        )
      );
    }
    if (row.protocol_hash !== input.expectedProtocolHash) {
      failures.push(
        phase42Failure(
          row,
          'protocol_hash_mismatch',
          'Row protocol hash does not match the frozen protocol hash.'
        )
      );
    }
    if (row.task_manifest_hash !== input.expectedTaskManifestHash) {
      failures.push(
        phase42Failure(
          row,
          'task_manifest_hash_mismatch',
          'Row task manifest hash does not match the frozen task manifest hash.'
        )
      );
    }
  }
  // Second pass: counts are only final once every row has been seen.
  for (const row of input.rows) {
    if ((rowCounts.get(phase42RowKey(row)) ?? 0) > 1) {
      failures.push(
        phase42Failure(
          row,
          'duplicate_required_run',
          'Duplicate lane/task/repeat rows make the evidence denominator ambiguous.'
        )
      );
    }
  }
  if (!input.expectedRunnerHash || !input.currentRunnerHash) {
    failures.push({
      code: 'runner_provenance_missing',
      message: 'Expected and current runner hashes are required for claim-bearing validation.'
    });
  } else if (input.expectedRunnerHash !== input.currentRunnerHash) {
    failures.push({
      code: 'runner_provenance_mismatch',
      message: 'Current runner hash does not match the expected generation runner hash.'
    });
  }
  for (const laneId of input.requiredLaneIds) {
    for (const taskId of input.requiredTaskIds) {
      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) {
        const row = input.rows.find(
          (candidate) =>
            candidate.lane_id === laneId &&
            candidate.task_id === taskId &&
            candidate.repeat_index === repeatIndex
        );
        if (!row) {
          failures.push({
            code: 'missing_required_run',
            laneId,
            taskId,
            repeatIndex,
            message: 'A required lane/task/repeat row is missing from the evidence denominator.'
          });
          continue;
        }
        const artifacts = input.artifactsByRunId[row.run_id];
        if (row.status !== 'completed') {
          failures.push(
            phase42Failure(row, 'non_completed_status', 'Claim-bearing runs must complete.')
          );
        }
        if (
          input.protocol.benchmarkTarget.officialEvaluatorFirst &&
          !phase42HasOfficialEvaluatorProof(row, artifacts?.score, input.artifactHashesByPath)
        ) {
          failures.push(
            phase42Failure(
              row,
              'official_evaluator_missing',
              'Official evaluator proof is required before this row can support claims.'
            )
          );
        }
        if (phase42HasDiagnosticFallback(row, artifacts?.score)) {
          failures.push(
            phase42Failure(
              row,
              'diagnostic_fallback_only',
              'Diagnostic fallback scoring cannot satisfy the claim-bearing evidence gate.'
            )
          );
        }
        if (
          !phase42HasLaneIsolationProof(
            row,
            artifacts?.laneIsolation,
            input.lanePoliciesById[row.lane_id]
          )
        ) {
          failures.push(
            phase42Failure(
              row,
              // Recorded violations are reported as such; anything else counts as missing proof.
              artifacts?.laneIsolation?.violations?.length
                ? 'lane_isolation_violation'
                : 'lane_isolation_missing',
              'Lane isolation must be proven by explicit allowed/observed tool evidence.'
            )
          );
        }
        if (!phase42HasMeasuredSetupIndex(row, artifacts?.setupIndex)) {
          failures.push(
            phase42Failure(
              row,
              'setup_index_cost_missing',
              'Setup/index statuses, durations, and log references are required.'
            )
          );
        }
        if (!phase42HasRunnerProvenance(row, artifacts?.rawTrace, input.expectedRunnerHash)) {
          failures.push(
            phase42Failure(
              row,
              'runner_provenance_mismatch',
              'Raw trace executor/model metadata must match the manifest row.'
            )
          );
        }
      }
    }
  }
  const blockingFailures = failures.filter(
    (failure) => failure.code !== 'artifact_verification_missing'
  );
  return {
    shapePass: blockingFailures.length === 0,
    claimPass: failures.length === 0,
    diagnosticOnly: failures.length > 0,
    failures
  };
}
/**
 * Counts how often each value occurs.
 *
 * Counting goes through a Map so values that collide with Object.prototype
 * members (`constructor`, `toString`, `__proto__`, ...) are tallied like any
 * other value instead of picking up the inherited property.
 *
 * @param {Iterable<*>} values Values to tally; each is keyed by its string form.
 * @returns {Record<string, number>} Occurrence count per distinct value.
 */
function countBy(values) {
  const counts = new Map();
  for (const value of values) {
    const key = String(value);
    counts.set(key, (counts.get(key) ?? 0) + 1);
  }
  return Object.fromEntries(counts);
}

/**
 * Derives the failures that come from the session files themselves rather than
 * from the evidence gate: session presence/hash/seal state, the sealed artifact
 * index, artifact read and integrity errors, artifact path hygiene, rows
 * outside the denominator, and malformed supplemental (baseline-arm) rows.
 *
 * @param {object} loaded Result of loadPhase42SessionEvidence.
 * @param {string} sessionRoot Baseline session directory.
 * @returns {object[]} Failure records (`code`, `message`, plus context fields).
 */
function phase42LoaderFailures(loaded, sessionRoot) {
  const failures = [];
  const indexedPaths = new Set(
    (loaded.session?.artifactIndex ?? []).map((artifact) => artifact.path)
  );
  const registeredArmIds = existsSync(FIXTURES.codebaseContextBaselineArms)
    ? new Set(
        (readJson(FIXTURES.codebaseContextBaselineArms).arms ?? []).map((arm) => arm.baselineArmId)
      )
    : new Set();
  if (!loaded.session) {
    failures.push({ code: 'session_missing', message: 'BASELINE-SESSION.json is required.' });
  } else {
    const expectedSessionHash = computeSessionHash(loaded.session);
    if (loaded.session.sessionHash !== expectedSessionHash) {
      failures.push({
        code: 'session_hash_mismatch',
        message: 'Session hash does not match BASELINE-SESSION.json content.'
      });
    }
    if (loaded.session.sealed !== true) {
      failures.push({
        code: 'session_not_sealed',
        message: 'Claim-bearing Phase 42 verification requires a sealed session.'
      });
    }
    for (const artifact of loaded.session.artifactIndex ?? []) {
      const artifactPath = join(sessionRoot, artifact.path);
      if (!existsSync(artifactPath)) {
        failures.push({
          code: 'session_artifact_missing',
          path: artifact.path,
          message: 'Indexed session artifact is missing.'
        });
      } else {
        const actualHash = hashFile(artifactPath);
        if (actualHash !== artifact.hash) {
          failures.push({
            code: 'session_artifact_hash_mismatch',
            path: artifact.path,
            message: 'Indexed session artifact hash does not match current file content.'
          });
        }
      }
    }
  }
  for (const error of loaded.readErrors) {
    failures.push({
      code: 'artifact_read_error',
      runId: error.runId,
      path: error.path,
      message: `Required artifact could not be read: ${error.reason}`
    });
  }
  for (const error of loaded.integrityErrors) {
    failures.push({
      code: 'artifact_hash_mismatch',
      runId: error.runId,
      path: error.path,
      message: `Manifest artifact hash mismatch: ${error.reason}`
    });
  }
  const artifactPathKeys = [
    'raw_trace_path',
    'structured_answer_path',
    'trajectory_path',
    'score_path',
    'setup_index_path',
    'prompt_path',
    'lane_tool_card_path'
  ];
  for (const row of [
    ...loaded.requiredRows,
    ...loaded.supplementalRows,
    ...loaded.unexpectedRows
  ]) {
    for (const key of artifactPathKeys) {
      const value = row[key];
      const relativePath =
        value && isAbsolute(value) ? normalizePath(relative(sessionRoot, value)) : null;
      if (!value || !isAbsolute(value)) {
        failures.push(phase42Failure(row, 'artifact_path_invalid', `${key} must be absolute.`));
      } else if (!isPathInside(sessionRoot, value)) {
        failures.push(
          phase42Failure(
            row,
            'artifact_path_outside_session',
            `${key} must stay inside the session root.`
          )
        );
      } else if (!relativePath || !indexedPaths.has(relativePath)) {
        failures.push(
          phase42Failure(
            row,
            'artifact_not_indexed',
            `${key} must be present in the sealed session artifact index.`
          )
        );
      }
    }
  }
  for (const row of loaded.unexpectedRows) {
    failures.push(
      phase42Failure(
        row,
        'unexpected_run_row',
        'Rows outside the required denominator must be explicit registered diagnostic arms.'
      )
    );
  }
  for (const row of loaded.supplementalRows) {
    const baselineArmId = row.scoring?.baselineArmId;
    if (
      row.lane_id !== 'codebase-context' ||
      row.scoring?.claimBearing !== false ||
      typeof baselineArmId !== 'string' ||
      !registeredArmIds.has(baselineArmId) ||
      // A non-string run id cannot carry the arm prefix; flag it instead of throwing.
      typeof row.run_id !== 'string' ||
      !row.run_id.startsWith(`${baselineArmId}-`)
    ) {
      failures.push(
        phase42Failure(
          row,
          'invalid_supplemental_row',
          'Supplemental diagnostic rows must be non-claim-bearing registered codebase-context arms.'
        )
      );
    }
  }
  return failures;
}
/**
 * CLI entry point for `--phase42-verify`: runs the evidence gate plus the
 * loader checks over a baseline session, prints a report and fails loudly when
 * the session cannot support claims.
 *
 * @param {object} args Parsed CLI arguments. Reads `session` (required), `out`
 *   (optional path the JSON report is written to) and `quiet` (print a one-line
 *   summary instead of the full report).
 * @throws {Error} When `session` is missing, or when any failure was found; the
 *   message then lists every failure code with its count.
 */
function verifyPhase42Session(args) {
  if (!args.session) throw new Error('--phase42-verify requires --session ');
  const sessionRoot = ensureBaselineSessionRoot(args.session);
  const fixtures = validateFixtures();
  const evidence = loadPhase42SessionEvidence(sessionRoot, fixtures);
  const { gateInput } = evidence;
  const gateResult = evaluatePhase42EvidenceGate(gateInput);
  const failures = gateResult.failures.concat(phase42LoaderFailures(evidence, sessionRoot));
  const claimPass = failures.length === 0;
  // Shape passes when nothing but the synthetic-evidence marker was reported.
  const shapePass = failures.every((failure) => failure.code === 'artifact_verification_missing');
  const failureCounts = countBy(failures.map((failure) => failure.code));

  const safeClaims = claimPass
    ? ['Phase 42 evidence gate passed for this sealed artifact set']
    : [
        'harness repair in progress',
        'diagnostic artifact',
        'non-claim-bearing provenance evidence',
        'blocked pending verifier/challenger'
      ];
  const blockedClaims = [
    ...(claimPass ? [] : ['Phase 42 passed']),
    'benchmark win',
    'competitor loss',
    'agent-outcome improvement',
    'product change authorized by evidence',
    'setup_failed is a loss'
  ];
  const report = {
    generatedAt: new Date().toISOString(),
    sessionRoot: normalizePath(sessionRoot),
    claimBearing: claimPass,
    claimPass,
    shapePass,
    diagnosticOnly: !claimPass,
    protocolClaimAllowed: fixtures.protocol.claimAllowed,
    expectedTotalRows: gateInput.expectedTotalRows,
    requiredRows: evidence.requiredRows.length,
    supplementalRows: evidence.supplementalRows.length,
    unexpectedRows: evidence.unexpectedRows.length,
    reservations: evidence.reservations.length,
    sessionSealed: evidence.session?.sealed ?? false,
    rowStatusCounts: countBy(evidence.requiredRows.map((row) => row.status)),
    laneStatusCounts: countBy(evidence.requiredRows.map((row) => `${row.lane_id}:${row.status}`)),
    failureCounts,
    readErrors: evidence.readErrors,
    integrityErrors: evidence.integrityErrors,
    runnerHashes: {
      expected: gateInput.expectedRunnerHash ?? null,
      current: gateInput.currentRunnerHash ?? null
    },
    fixtureHashes: {
      protocol: gateInput.expectedProtocolHash,
      taskManifest: gateInput.expectedTaskManifestHash
    },
    safeClaims,
    blockedClaims,
    failures
  };

  if (args.out) writeJson(resolve(args.out), report);
  const printed = args.quiet
    ? `phase42 verification ${claimPass ? 'passed' : 'failed'}: requiredRows=${report.requiredRows}/${report.expectedTotalRows}, supplementalRows=${report.supplementalRows}`
    : JSON.stringify(report, null, 2);
  console.log(printed);
  if (claimPass) return;
  const breakdown = Object.entries(failureCounts)
    .map(([code, count]) => `${code}=${count}`)
    .join(', ');
  throw new Error(`phase42 verification failed: ${breakdown}`);
}
/**
 * CLI entry point for `--baseline-validate`: checks the structural integrity
 * of a baseline session and throws one error listing every problem found.
 *
 * Checked: presence of the session and reservation files, the session hash,
 * the non-claim-bearing flag, the snapshot hash, leakage of redacted
 * environment values into the session JSON, the artifact index, the number of
 * reserved slots, manifest row paths, and the pairing of blocked reservations
 * with `setup_failed` rows.
 *
 * @param {object} args Parsed CLI arguments; reads `session` (required).
 * @throws {Error} When `session` is missing or any check fails.
 */
function validateBaselineSession(args) {
  if (!args.session) throw new Error('--baseline-validate requires --session ');
  const sessionRoot = ensureBaselineSessionRoot(args.session);
  const fixtures = validateFixtures();
  const sessionFile = join(sessionRoot, 'BASELINE-SESSION.json');
  const reservationFile = join(sessionRoot, 'slot-reservations.json');
  const problems = [];
  if (!existsSync(sessionFile)) problems.push('BASELINE-SESSION.json missing');
  if (!existsSync(reservationFile)) problems.push('slot-reservations.json missing');

  // Session content is only inspected when both files are present.
  if (problems.length === 0) {
    const session = readJson(sessionFile);
    if (session.sessionHash !== computeSessionHash(session)) problems.push('session hash mismatch');
    if (session.claimBearing !== false) problems.push('session must be non-claim-bearing');
    if (!session.snapshot?.snapshotHash) problems.push('snapshot hash missing');

    const serializedSession = JSON.stringify(session);
    const leaksSecret = session.snapshot?.redactedEnvVarNames?.some((name) => {
      const secret = String(process.env[name] ?? '');
      return secret.length > 0 && serializedSession.includes(secret);
    });
    if (leaksSecret) problems.push('session appears to include an environment secret value');

    for (const artifact of session.artifactIndex ?? []) {
      const artifactFile = join(sessionRoot, artifact.path);
      if (!existsSync(artifactFile)) {
        problems.push(`indexed artifact missing: ${artifact.path}`);
      } else if (hashFile(artifactFile) !== artifact.hash) {
        problems.push(`indexed artifact hash mismatch: ${artifact.path}`);
      }
    }
  }

  const reservations = existsSync(reservationFile)
    ? (readJson(reservationFile).reservations ?? [])
    : [];
  const repeatsPerSlot = fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3;
  const expectedSlots =
    fixtures.manifest.tasks.length * fixtures.lanes.broadClaimLaneSet.length * repeatsPerSlot;
  if (reservations.length !== expectedSlots)
    problems.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`);

  const rows = readManifestRowsIfPresent(sessionRoot);
  validateSessionPaths(sessionRoot, rows, problems);

  const blockedLanes = ['grepai', 'codebase-memory-mcp'];
  const blockedReservationCount = reservations.filter(
    (slot) => slot.status === 'terminal_missing_evidence'
  ).length;
  const blockedRowCount = rows.filter(
    (row) => row.status === 'setup_failed' && blockedLanes.includes(row.lane_id)
  ).length;
  if (blockedRowCount !== blockedReservationCount) {
    problems.push('terminal missing-evidence rows must be present for every blocked reservation');
  }

  if (problems.length > 0)
    throw new Error(`baseline session validation failed:\n- ${problems.join('\n- ')}`);
  console.log('baseline session validation passed');
}
/**
 * CLI entry point for `--baseline-seal`: marks a baseline session as sealed
 * once every reserved slot has a manifest row, then re-validates it and runs
 * the Phase 42 evidence gate.
 *
 * The sealed flag and refreshed artifact index are written before validation
 * and verification run.
 *
 * @param {object} args Parsed CLI arguments; reads `session` (required).
 * @throws {Error} When `session` is missing, when reserved slots lack rows,
 *   when validateBaselineSession fails, or when the evidence gate rejects the
 *   session. In the last case the gate's own error is attached as `cause`.
 */
function sealBaselineSession(args) {
  if (!args.session) throw new Error('--baseline-seal requires --session ');
  const sessionRoot = ensureBaselineSessionRoot(args.session);
  const session = readSession(sessionRoot);
  const reservations = readJson(join(sessionRoot, 'slot-reservations.json')).reservations ?? [];
  const rows = readManifestRowsIfPresent(sessionRoot);
  const rowKeys = new Set(rows.map((row) => `${row.lane_id}:${row.task_id}:${row.repeat_index}`));
  const missing = reservations.filter(
    (slot) => !rowKeys.has(`${slot.laneId}:${slot.taskId}:${slot.repeatIndex}`)
  );
  if (missing.length > 0)
    throw new Error(`cannot seal baseline session; ${missing.length} slots lack terminal evidence`);
  session.sealed = true;
  session.artifactIndex = refreshArtifactIndex(sessionRoot);
  writeSession(sessionRoot, session);
  validateBaselineSession({ session: sessionRoot });
  try {
    verifyPhase42Session({ session: sessionRoot, quiet: true });
  } catch (error) {
    // Keep the gate's error reachable and avoid printing "undefined" for
    // throwables that are not Error instances.
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`baseline seal blocked by Phase 42 evidence gate: ${reason}`, {
      cause: error
    });
  }
  console.log(`baseline session sealed ${join(sessionRoot, 'BASELINE-SESSION.json')}`);
}
/**
 * CLI entry point for `--baseline-refresh`: clears the sealed flag, rebuilds
 * the artifact index and writes the session back.
 *
 * @param {object} args Parsed CLI arguments; reads `session` (required).
 * @throws {Error} When `session` is missing.
 */
function refreshBaselineSession(args) {
  if (!args.session) throw new Error('--baseline-refresh requires --session ');
  const root = ensureBaselineSessionRoot(args.session);
  const reopened = readSession(root);
  reopened.sealed = false;
  reopened.artifactIndex = refreshArtifactIndex(root);
  writeSession(root, reopened);
  console.log(`baseline session refreshed ${join(root, 'BASELINE-SESSION.json')}`);
}

/**
 * CLI entry point for `--dry-run`: writes one complete, non-claim-bearing run
 * (prompt, lane card, setup/index record, raw trace, structured answer,
 * trajectory, score) using a canned answer instead of a real executor, and
 * appends the matching row to `run-manifest.jsonl` under the output directory.
 *
 * @param {object} args Parsed CLI arguments. Requires `executor === 'fake'`,
 *   `lane`, `taskId` and `out`; `repeat` defaults to 1 unless it is a positive
 *   integer.
 * @throws {Error} When a required argument is missing, the executor is not
 *   `fake`, the lane or task is unknown, or the lane card is not marked
 *   `executableInPhase38`.
 */
function runDryRun(args) {
  if (args.executor !== 'fake') throw new Error('--dry-run currently requires --executor fake');
  const hasRequiredFlags = Boolean(args.lane && args.taskId && args.out);
  if (!hasRequiredFlags) throw new Error('--dry-run requires --lane, --task-id, and --out');
  const fixtures = validateFixtures();
  const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === args.lane);
  if (!laneCard) throw new Error(`unknown lane: ${args.lane}`);
  if (!laneCard.executableInPhase38)
    throw new Error(`lane ${args.lane} is pending Phase 39 and is not executable in Phase 38`);
  const task = fixtures.manifest.tasks.find((candidate) => candidate.instance_id === args.taskId);
  if (!task) throw new Error(`unknown task-id: ${args.taskId}`);

  const outDir = resolve(args.out);
  const repeat = Number.isInteger(args.repeat) && args.repeat > 0 ? args.repeat : 1;
  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeat}-fake`);
  const runDir = join(outDir, 'runs', runId);
  const inRunDir = (fileName) => join(runDir, fileName);
  const paths = {
    prompt: inRunDir('prompt.txt'),
    laneCard: inRunDir('lane-card.json'),
    setupIndex: inRunDir('setup-index.json'),
    rawTrace: inRunDir('raw-trace.json'),
    structuredAnswer: inRunDir('structured-answer.json'),
    trajectory: inRunDir('trajectory.json'),
    score: inRunDir('score.json'),
    manifest: join(outDir, 'run-manifest.jsonl')
  };
  const startedAt = new Date().toISOString();

  const promptLines = [
    `Task: ${task.instance_id}`,
    `Lane: ${laneCard.laneId}`,
    'Return only structured JSON with answer, confidence, evidence, filesReferenced, symbolsReferenced, unsupportedClaims, readyToEdit.',
    'Do not use tools outside the lane tool card.'
  ];
  const prompt = promptLines.join('\n');
  // Canned answer: it only proves the artifact pipeline works end to end.
  const answer = {
    answer: { smoke: true, taskId: task.instance_id },
    confidence: 'medium',
    evidence: [
      {
        file: 'SMOKE_ONLY.md',
        lineRange: { start: 1, end: 1 },
        reason: 'fake executor non-claim-bearing smoke evidence'
      }
    ],
    filesReferenced: ['SMOKE_ONLY.md'],
    symbolsReferenced: [],
    unsupportedClaims: [],
    readyToEdit: false
  };
  const trajectory = buildTrajectory(task, answer);
  const rawTrace = {
    executor: 'fake',
    runnerHash: runnerSourceHash(),
    claimBearing: false,
    stdout: JSON.stringify(answer),
    stderr: '',
    toolCalls: [],
    laneIsolation: buildLaneIsolationEvidence(laneCard),
    scriptedAgentDecisions: false
  };
  const score = {
    status: 'completed',
    mode: 'phase38_smoke_no_official_claim',
    ...diagnosticFallbackScoring(fixtures, 'dry_run_fake_executor_smoke_only')
  };
  // A command of 'none' means the lane needs no such phase.
  const phaseStatus = (command) => (command === 'none' ? 'not_required' : 'completed');
  const setupIndex = {
    setupCommand: laneCard.setupCommand,
    indexCommand: laneCard.indexCommand,
    setupDurationMs: 0,
    indexDurationMs: 0,
    setupStatus: phaseStatus(laneCard.setupCommand),
    indexStatus: phaseStatus(laneCard.indexCommand)
  };

  mkdirSync(runDir, { recursive: true });
  writeFileSync(paths.prompt, prompt, 'utf8');
  writeJson(paths.laneCard, laneCard);
  writeJson(paths.setupIndex, setupIndex);
  writeJson(paths.rawTrace, rawTrace);
  writeJson(paths.structuredAnswer, answer);
  writeJson(paths.trajectory, trajectory);
  writeJson(paths.score, score);
  const completedAt = new Date().toISOString();

  const budgetDefaults = fixtures.protocol.budgets.defaults;
  const manifestRow = {
    run_id: runId,
    protocol_version: fixtures.protocol.protocolVersion,
    protocol_hash: hashObject(fixtures.protocol),
    task_manifest_hash: fixtures.manifest.manifest_hash,
    lane_id: laneCard.laneId,
    task_id: task.instance_id,
    repeat_index: repeat,
    status: 'completed',
    started_at: startedAt,
    completed_at: completedAt,
    raw_trace_path: paths.rawTrace,
    structured_answer_path: paths.structuredAnswer,
    trajectory_path: paths.trajectory,
    score_path: paths.score,
    setup_index_path: paths.setupIndex,
    prompt_path: paths.prompt,
    lane_tool_card_path: paths.laneCard,
    setupIndex,
    taskExecution: {
      model: 'fake-executor',
      timeoutSeconds: budgetDefaults.timeoutSeconds,
      maxContextTokens: budgetDefaults.maxContextTokens,
      maxAnswerTokens: budgetDefaults.maxAnswerTokens,
      startedAt,
      completedAt,
      taskWallTimeMs: new Date(completedAt).getTime() - new Date(startedAt).getTime(),
      executor: 'fake'
    },
    scoring: diagnosticFallbackScoring(fixtures, 'dry_run_fake_executor_smoke_only'),
    hashes: {
      prompt: sha256(prompt),
      laneToolCard: hashObject(laneCard),
      structuredAnswer: hashObject(answer),
      trajectory: hashObject(trajectory),
      score: hashObject(score),
      runnerSourceHash: runnerSourceHash()
    }
  };
  mkdirSync(dirname(paths.manifest), { recursive: true });
  appendFileSync(paths.manifest, `${JSON.stringify(manifestRow)}\n`, 'utf8');
  console.log(`dry-run wrote ${runDir}`);
}
/**
 * CLI entry point: parse argv and run the first selected mode.
 *
 * Prints help when `--help` is given or no arguments were passed. Otherwise the
 * modes below are checked in order and only the first one whose flag is set
 * runs. The selected handler is awaited so that a rejection from a
 * promise-returning handler reaches the caller's `.catch` instead of floating.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no mode flag is set.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2));
  if (args.help || process.argv.length <= 2) {
    help();
    return;
  }

  // Ordered [flag, handler] pairs; the array order is the precedence when
  // several mode flags are supplied at once.
  const modes = [
    [
      'validateFixtures',
      () => {
        validateFixtures();
        console.log('fixture validation passed');
      }
    ],
    [
      'validateLaneSetup',
      async () => {
        await validateLaneSetupEvidence();
        console.log('lane setup validation passed');
      }
    ],
    ['baselineSnapshot', () => createBaselineSnapshot(args)],
    ['setupIndexMeasure', () => runSetupIndexMeasure(args)],
    ['setupIndexImport', () => runSetupIndexImport(args)],
    ['baselineRun', () => runBaseline(args)],
    ['baselineRefresh', () => refreshBaselineSession(args)],
    ['baselineRunCodebaseContextArms', () => runBaselineCodebaseContextArms(args)],
    ['baselineSeal', () => sealBaselineSession(args)],
    ['baselineValidate', () => validateBaselineSession(args)],
    ['phase42Verify', () => verifyPhase42Session(args)],
    ['baselineValidateArms', () => validateBaselineArms(args.baselineValidateArms)],
    [
      'printClaudeArgs',
      () => console.log(JSON.stringify(claudeArgsForModel(args.model ?? ''), null, 2))
    ],
    [
      'printAnswerSchema',
      () => console.log(JSON.stringify(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA, null, 2))
    ],
    ['dryRun', () => runDryRun(args)],
    ['scoreProbe', () => runScoreProbe(args)]
  ];

  const selected = modes.find(([flag]) => args[flag]);
  if (!selected) throw new Error('No mode selected. Use --help.');

  const [, run] = selected;
  // Awaiting a non-promise result is a no-op, so synchronous handlers behave as before.
  await run();
}
/** True for any non-null, non-array value whose `typeof` is `'object'`. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** True when `value` is an array and every element is a string. */
function isStringArray(value: unknown): value is string[] {
  return Array.isArray(value) && value.every((entry) => typeof entry === 'string');
}

/**
 * List the keys of `value` that are not in `allowedFields`.
 *
 * @returns One `additional_<prefix>_<field>` error code per unexpected key,
 *   in the object's own key order.
 */
function findAdditionalFields(
  value: Record<string, unknown>,
  allowedFields: ReadonlySet<string>,
  prefix: string
): string[] {
  const errors: string[] = [];
  for (const field of Object.keys(value)) {
    if (!allowedFields.has(field)) errors.push(`additional_${prefix}_${field}`);
  }
  return errors;
}

/**
 * Recursively check that `value` is made only of JSON-representable parts:
 * null, strings, booleans, finite numbers, arrays of such values, and objects
 * whose property values are such values.
 */
function isJsonValue(value: unknown): value is JsonValue {
  if (value === null || typeof value === 'string' || typeof value === 'boolean') return true;
  // NaN and +/-Infinity have no JSON representation (JSON.stringify emits null for them).
  if (typeof value === 'number') return Number.isFinite(value);
  if (Array.isArray(value)) return value.every(isJsonValue);
  return isRecord(value) && Object.values(value).every(isJsonValue);
}
(findAdditionalFields(value, evidenceReferenceFields, 'evidence_field').length > 0) return false; + const lineRange = value.lineRange; + if (!isRecord(lineRange)) return false; + if (findAdditionalFields(lineRange, lineRangeFields, 'line_range_field').length > 0) + return false; + const start = lineRange.start; + const end = lineRange.end; + return ( + typeof value.file === 'string' && + value.file.trim().length > 0 && + typeof value.reason === 'string' && + value.reason.trim().length > 0 && + Number.isInteger(start) && + Number.isInteger(end) && + typeof start === 'number' && + typeof end === 'number' && + start > 0 && + end >= start + ); +} + +function validateStructuredAnswer(value: unknown): StructuredAnswerParseResult { + const errors: string[] = []; + if (!isRecord(value)) { + return { status: 'invalid_schema', answer: null, errors: ['answer_root_not_object'] }; + } + + for (const field of CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS) { + if (!(field in value)) errors.push(`missing_${field}`); + } + errors.push( + ...findAdditionalFields(value, new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS), 'root_field') + ); + + if (!isJsonValue(value.answer)) errors.push('answer_not_json_value'); + if (typeof value.confidence !== 'string' || !confidenceValues.has(value.confidence)) + errors.push('invalid_confidence'); + if (!Array.isArray(value.evidence)) errors.push('evidence_not_array'); + if (!isStringArray(value.filesReferenced)) errors.push('files_referenced_not_string_array'); + if (!isStringArray(value.symbolsReferenced)) errors.push('symbols_referenced_not_string_array'); + if (!isStringArray(value.unsupportedClaims)) errors.push('unsupported_claims_not_string_array'); + if (typeof value.readyToEdit !== 'boolean') errors.push('ready_to_edit_not_boolean'); + + const evidence = Array.isArray(value.evidence) ? 
/**
 * Parse raw model output into a validated structured answer.
 *
 * Blank input is reported as `missing_json`. Any exception raised while parsing
 * or validating the text is reported as `invalid_json`; otherwise the result of
 * `validateStructuredAnswer` is returned unchanged.
 *
 * @param raw Raw text expected to contain a single JSON document.
 */
export function parseStructuredAnswer(raw: string): StructuredAnswerParseResult {
  const rejected = (code: string): StructuredAnswerParseResult => ({
    status: 'invalid_schema',
    answer: null,
    errors: [code]
  });

  const text = raw.trim();
  if (text === '') return rejected('missing_json');

  try {
    const parsed: unknown = JSON.parse(text);
    return validateStructuredAnswer(parsed);
  } catch {
    return rejected('invalid_json');
  }
}
/**
 * Compare an answer against the facts and evidence files a task expects.
 *
 * Facts are matched case-insensitively as substrings of the JSON-serialized
 * `answer.answer`. Evidence files are matched exactly against the `file` of
 * each evidence entry.
 *
 * @param answer Structured answer to inspect.
 * @param expected Required facts and required evidence files; both optional.
 * @returns The required facts and evidence files that were not found.
 */
export function evaluateSchemaBoundDiagnostics(
  answer: ContextBenchStructuredAnswer,
  expected: { requiredFacts?: string[]; requiredEvidenceFiles?: string[] }
): SchemaBoundDiagnostics {
  const answerText = JSON.stringify(answer.answer).toLowerCase();
  const citedFiles = new Set(answer.evidence.map((entry) => entry.file));

  const mentionsFact = (fact: string): boolean => {
    const needle = fact.toLowerCase();
    if (answerText.includes(needle)) return true;
    // JSON.stringify escapes quotes, backslashes and control characters, so a
    // fact containing them only appears in the haystack in its escaped spelling.
    const escapedNeedle = JSON.stringify(needle).slice(1, -1);
    return escapedNeedle !== needle && answerText.includes(escapedNeedle);
  };

  return {
    missingRequiredFacts: (expected.requiredFacts ?? []).filter((fact) => !mentionsFact(fact)),
    missingEvidenceFiles: (expected.requiredEvidenceFiles ?? []).filter(
      (file) => !citedFiles.has(file)
    )
  };
}
/**
 * Read a JSONL run manifest: one JSON-encoded row per line.
 *
 * Blank or whitespace-only lines are skipped and both LF and CRLF line endings
 * are accepted, so a stray empty line does not abort the whole read.
 *
 * @param manifestPath Path of the manifest file.
 * @returns The parsed rows in file order; an empty array for an empty file.
 * @throws Whatever `readFileSync` throws for an unreadable file, or the
 *   `SyntaxError` from `JSON.parse` for a non-blank line that is not valid JSON.
 */
export function readManifestRows(manifestPath: string): ContextBenchRunManifestRow[] {
  return readFileSync(manifestPath, 'utf8')
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .map((line) => JSON.parse(line) as ContextBenchRunManifestRow);
}
'not_required' : 'completed' + }, + taskExecution: { + model: params.model, + timeoutSeconds: params.timeoutSeconds, + maxContextTokens: params.maxContextTokens, + maxAnswerTokens: params.maxAnswerTokens, + startedAt: params.startedAt, + completedAt: params.completedAt, + taskWallTimeMs: new Date(params.completedAt).getTime() - new Date(params.startedAt).getTime(), + executor: params.executor + }, + scoring: { + officialEvaluatorFirst: false, + officialEvaluatorAttempted: false, + officialEvaluatorInvoked: false, + command: + 'python -m contextbench.evaluate --gold --pred --out ', + claimBearing: false, + fallbackReason: 'phase38_smoke_non_claim_bearing' + }, + hashes: params.hashes + }; +} diff --git a/src/eval/contextbench-evidence-gate.ts b/src/eval/contextbench-evidence-gate.ts new file mode 100644 index 0000000..815616e --- /dev/null +++ b/src/eval/contextbench-evidence-gate.ts @@ -0,0 +1,422 @@ +import type { ContextBenchRunManifestRow } from './contextbench-types.js'; + +export type ContextBenchEvidenceGateFailureCode = + | 'summary_not_claim_pass' + | 'artifact_verification_missing' + | 'protocol_claims_disabled' + | 'denominator_contract_missing' + | 'denominator_count_mismatch' + | 'protocol_hash_mismatch' + | 'task_manifest_hash_mismatch' + | 'missing_required_run' + | 'duplicate_required_run' + | 'unexpected_run_row' + | 'non_completed_status' + | 'official_evaluator_missing' + | 'diagnostic_fallback_only' + | 'lane_isolation_missing' + | 'lane_isolation_violation' + | 'setup_index_cost_missing' + | 'runner_provenance_missing' + | 'runner_provenance_mismatch'; + +export interface ContextBenchEvidenceGateFailure { + code: ContextBenchEvidenceGateFailureCode; + runId?: string; + laneId?: string; + taskId?: string; + repeatIndex?: number; + message: string; +} + +export interface ContextBenchEvidenceGateResult { + shapePass: boolean; + claimPass: boolean; + diagnosticOnly: boolean; + failures: ContextBenchEvidenceGateFailure[]; +} + +export type 
ContextBenchEvidenceMode = 'synthetic_shape' | 'artifact_verified'; + +export interface ContextBenchLaneEvidencePolicy { + laneId: string; + expectedContextTool: string; + allowedTools: string[]; + disallowedTools: string[]; + allowMultipleObservedTools?: boolean; +} + +export interface ContextBenchLaneIsolationEvidence { + laneId: string; + proven: boolean; + sourceKind?: 'not_captured' | 'env_override' | 'transcript' | 'proxy'; + expectedContextTool: string; + allowedTools: string[]; + observedTools: string[]; + violations?: string[]; +} + +export interface ContextBenchRawTraceEvidence { + executor?: string; + model?: string; + runnerHash?: string; +} + +export interface ContextBenchScoreEvidence { + status?: string; + mode?: string; + claimBearing?: boolean; + officialEvaluatorInvoked?: boolean; + command?: string; + exitCode?: number; + outputPath?: string; + outputHash?: string; + stdoutPath?: string; + stderrPath?: string; +} + +export interface ContextBenchSetupIndexEvidence { + setupStatus?: string; + indexStatus?: string; + setupDurationMs?: number; + indexDurationMs?: number; + setupLogPath?: string; + indexLogPath?: string; +} + +export interface ContextBenchRunEvidenceArtifacts { + rawTrace?: ContextBenchRawTraceEvidence; + score?: ContextBenchScoreEvidence; + setupIndex?: ContextBenchSetupIndexEvidence; + laneIsolation?: ContextBenchLaneIsolationEvidence; +} + +export interface ContextBenchEvidenceGateInput { + evidenceMode: ContextBenchEvidenceMode; + protocol: { + claimAllowed: boolean; + benchmarkTarget: { + officialEvaluatorFirst: boolean; + }; + }; + requiredLaneIds: string[]; + requiredTaskIds: string[]; + requiredRepeats: number; + expectedTotalRows: number; + expectedProtocolHash: string; + expectedTaskManifestHash: string; + lanePoliciesById: Record; + rows: ContextBenchRunManifestRow[]; + artifactsByRunId: Record; + artifactHashesByPath: Record; + expectedRunnerHash?: string; + currentRunnerHash?: string; +} + +function makeFailure( + row: 
/**
 * Decide whether a run carries usable setup/index cost evidence.
 *
 * For both the setup and the index step the evidence must provide a finite,
 * non-negative duration, a status of `completed` or `not_required`, and a log
 * path; a `completed` step must additionally have a duration above zero.
 * The manifest row's `setupIndex` must then repeat the same statuses,
 * durations and log paths.
 *
 * @param row Manifest row whose `setupIndex` is cross-checked.
 * @param evidence Setup/index evidence loaded for the run, if any.
 * @returns `true` only when every check passes.
 */
function hasMeasuredSetupIndex(
  row: ContextBenchRunManifestRow,
  evidence: ContextBenchSetupIndexEvidence | undefined
): boolean {
  if (!evidence) return false;

  const acceptedStatuses = ['completed', 'not_required'];
  const steps = [
    {
      status: evidence.setupStatus,
      durationMs: evidence.setupDurationMs,
      logPath: evidence.setupLogPath
    },
    {
      status: evidence.indexStatus,
      durationMs: evidence.indexDurationMs,
      logPath: evidence.indexLogPath
    }
  ];
  for (const step of steps) {
    if (typeof step.durationMs !== 'number' || !Number.isFinite(step.durationMs)) return false;
    // A negative duration is not a measurement, whatever the step status is.
    if (step.durationMs < 0) return false;
    if (!step.status || !acceptedStatuses.includes(step.status)) return false;
    if (!step.logPath) return false;
    if (step.status === 'completed' && step.durationMs <= 0) return false;
  }

  // The manifest row must restate exactly what the evidence file recorded.
  const recorded = row.setupIndex;
  return (
    recorded.setupStatus === evidence.setupStatus &&
    recorded.indexStatus === evidence.indexStatus &&
    recorded.setupDurationMs === evidence.setupDurationMs &&
    recorded.indexDurationMs === evidence.indexDurationMs &&
    recorded.setupLogPath === evidence.setupLogPath &&
    recorded.indexLogPath === evidence.indexLogPath
  );
}
''); +} + +function hasOfficialEvaluatorProof( + row: ContextBenchRunManifestRow, + score: ContextBenchScoreEvidence | undefined, + artifactHashesByPath: Record +): boolean { + return ( + row.scoring.officialEvaluatorFirst === true && + row.scoring.officialEvaluatorAttempted === true && + row.scoring.officialEvaluatorInvoked === true && + row.scoring.claimBearing === true && + score?.officialEvaluatorInvoked === true && + score.claimBearing === true && + score.mode === 'official_evaluator' && + score.status === 'completed' && + score.exitCode === 0 && + typeof score.command === 'string' && + score.command.includes('contextbench.evaluate') && + typeof score.outputPath === 'string' && + score.outputPath.length > 0 && + hasSha256Hash(score.outputHash) && + artifactHashesByPath[score.outputPath] === score.outputHash && + hasSha256Hash(artifactHashesByPath[row.score_path]) && + typeof score.stdoutPath === 'string' && + score.stdoutPath.length > 0 && + hasSha256Hash(artifactHashesByPath[score.stdoutPath]) && + typeof score.stderrPath === 'string' && + score.stderrPath.length > 0 && + hasSha256Hash(artifactHashesByPath[score.stderrPath]) + ); +} + +function hasDiagnosticFallback(row: ContextBenchRunManifestRow, score: ContextBenchScoreEvidence | undefined): boolean { + return row.scoring.claimBearing === false || Boolean(row.scoring.fallbackReason) || score?.mode === 'diagnostic_fallback'; +} + +function hasLaneIsolationProof( + row: ContextBenchRunManifestRow, + isolation: ContextBenchLaneIsolationEvidence | undefined, + policy: ContextBenchLaneEvidencePolicy | undefined +): boolean { + if (!isolation?.proven) return false; + if (!policy) return false; + if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind)) return false; + if (policy.laneId !== row.lane_id) return false; + if (isolation.laneId !== row.lane_id) return false; + if (isolation.expectedContextTool !== policy.expectedContextTool) return false; + if 
/**
 * Evaluate whether a set of run rows and their artifacts can support a
 * claim-bearing benchmark result.
 *
 * Checks, in order: evidence mode, protocol claim permission, the frozen
 * denominator contract and row count, per-row protocol/task-manifest hashes and
 * unexpected rows, duplicate lane/task/repeat rows, the expected/current runner
 * hashes, and finally every required lane/task/repeat slot (completion,
 * official evaluator proof, diagnostic fallback, lane isolation, setup/index
 * cost, runner provenance).
 *
 * @param input Frozen expectations plus the rows and artifacts to check.
 * @returns `claimPass` is true only with zero failures; `shapePass` ignores the
 *   `artifact_verification_missing` failure; `diagnosticOnly` is `!claimPass`.
 */
export function evaluateContextBenchEvidenceGate(
  input: ContextBenchEvidenceGateInput
): ContextBenchEvidenceGateResult {
  const failures: ContextBenchEvidenceGateFailure[] = [];

  if (input.evidenceMode !== 'artifact_verified') {
    failures.push({
      code: 'artifact_verification_missing',
      message: 'Synthetic shape evidence cannot produce claim-bearing benchmark pass.'
    });
  }

  if (!input.protocol.claimAllowed) {
    failures.push({
      code: 'protocol_claims_disabled',
      message: 'The protocol does not currently allow claim-bearing benchmark results.'
    });
  }

  if (
    input.expectedTotalRows <= 0 ||
    input.requiredLaneIds.length === 0 ||
    input.requiredTaskIds.length === 0
  ) {
    failures.push({
      code: 'denominator_contract_missing',
      message: 'Claim validation requires a frozen denominator contract.'
    });
  }

  if (input.rows.length !== input.expectedTotalRows) {
    failures.push({
      code: 'denominator_count_mismatch',
      message: 'Run row count does not match the frozen expected denominator count.'
    });
  }

  // Enumerate the required lane x task x repeat slots once; the same list
  // drives both the "unexpected row" check and the per-slot checks below.
  const requiredSlots: Array<{
    laneId: string;
    taskId: string;
    repeatIndex: number;
    key: string;
  }> = [];
  for (const laneId of input.requiredLaneIds) {
    for (const taskId of input.requiredTaskIds) {
      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) {
        requiredSlots.push({
          laneId,
          taskId,
          repeatIndex,
          key: rowKey({ lane_id: laneId, task_id: taskId, repeat_index: repeatIndex })
        });
      }
    }
  }
  const expectedKeys = new Set<string>(requiredSlots.map((slot) => slot.key));

  const rowCounts = new Map<string, number>();
  // First row seen for each key, so slot lookup does not rescan every row.
  const firstRowByKey = new Map<string, ContextBenchRunManifestRow>();
  for (const row of input.rows) {
    const key = rowKey(row);
    rowCounts.set(key, (rowCounts.get(key) ?? 0) + 1);
    if (!firstRowByKey.has(key)) firstRowByKey.set(key, row);
    if (!expectedKeys.has(key)) {
      failures.push(
        makeFailure(
          row,
          'unexpected_run_row',
          'Rows outside the required denominator must not be hidden from claim validation.'
        )
      );
    }
    if (row.protocol_hash !== input.expectedProtocolHash) {
      failures.push(
        makeFailure(
          row,
          'protocol_hash_mismatch',
          'Row protocol hash does not match the frozen protocol hash.'
        )
      );
    }
    if (row.task_manifest_hash !== input.expectedTaskManifestHash) {
      failures.push(
        makeFailure(
          row,
          'task_manifest_hash_mismatch',
          'Row task manifest hash does not match the frozen task manifest hash.'
        )
      );
    }
  }

  // Every member of a duplicated lane/task/repeat group is reported.
  for (const row of input.rows) {
    if ((rowCounts.get(rowKey(row)) ?? 0) > 1) {
      failures.push(
        makeFailure(
          row,
          'duplicate_required_run',
          'Duplicate lane/task/repeat rows make the evidence denominator ambiguous.'
        )
      );
    }
  }

  if (!input.expectedRunnerHash || !input.currentRunnerHash) {
    failures.push({
      code: 'runner_provenance_missing',
      message: 'Expected and current runner hashes are required for claim-bearing validation.'
    });
  } else if (input.expectedRunnerHash !== input.currentRunnerHash) {
    failures.push({
      code: 'runner_provenance_mismatch',
      message: 'Current runner hash does not match the expected generation runner hash.'
    });
  }

  for (const { laneId, taskId, repeatIndex, key } of requiredSlots) {
    const matchesSlot = (candidate: ContextBenchRunManifestRow): boolean =>
      candidate.lane_id === laneId &&
      candidate.task_id === taskId &&
      candidate.repeat_index === repeatIndex;

    // The indexed row is the first row with this key. If its fields are not
    // strictly equal to the slot (e.g. a string repeat index), fall back to a
    // scan so the chosen row is always the first strictly matching one.
    const indexed = firstRowByKey.get(key);
    const row = indexed === undefined || matchesSlot(indexed) ? indexed : input.rows.find(matchesSlot);

    if (!row) {
      failures.push({
        code: 'missing_required_run',
        laneId,
        taskId,
        repeatIndex,
        message: 'A required lane/task/repeat row is missing from the evidence denominator.'
      });
      continue;
    }

    const artifacts = input.artifactsByRunId[row.run_id];
    if (row.status !== 'completed') {
      failures.push(makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.'));
    }

    if (
      input.protocol.benchmarkTarget.officialEvaluatorFirst &&
      !hasOfficialEvaluatorProof(row, artifacts?.score, input.artifactHashesByPath)
    ) {
      failures.push(
        makeFailure(
          row,
          'official_evaluator_missing',
          'Official evaluator proof is required before this row can support claims.'
        )
      );
    }

    if (hasDiagnosticFallback(row, artifacts?.score)) {
      failures.push(
        makeFailure(
          row,
          'diagnostic_fallback_only',
          'Diagnostic fallback scoring cannot satisfy the claim-bearing evidence gate.'
        )
      );
    }

    if (!hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])) {
      failures.push(
        makeFailure(
          row,
          artifacts?.laneIsolation?.violations?.length
            ? 'lane_isolation_violation'
            : 'lane_isolation_missing',
          'Lane isolation must be proven by explicit allowed/observed tool evidence.'
        )
      );
    }

    if (!hasMeasuredSetupIndex(row, artifacts?.setupIndex)) {
      failures.push(
        makeFailure(
          row,
          'setup_index_cost_missing',
          'Setup/index statuses, durations, and log references are required.'
        )
      );
    }

    if (!hasRunnerProvenance(row, artifacts?.rawTrace, input.expectedRunnerHash)) {
      failures.push(
        makeFailure(
          row,
          'runner_provenance_mismatch',
          'Raw trace executor/model metadata must match the manifest row.'
        )
      );
    }
  }

  // Missing artifact verification alone still allows a shape pass.
  const blockingFailures = failures.filter(
    (failure) => failure.code !== 'artifact_verification_missing'
  );
  const claimPass = failures.length === 0;
  return {
    shapePass: blockingFailures.length === 0,
    claimPass,
    diagnosticOnly: !claimPass,
    failures
  };
}
missingEvidenceFiles: string[]; + unsupportedClaim: boolean; + falseReady: boolean; + reasons: string[]; +} + +function writeJson(filePath: string, value: unknown): void { + mkdirSync(path.dirname(filePath), { recursive: true }); + writeFileSync(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8'); +} + +export async function scoreWithOfficialEvaluatorFirst( + params: OfficialEvaluatorParams +): Promise { + const args = [ + '-m', + 'contextbench.evaluate', + '--gold', + params.goldPath, + '--pred', + params.predictionPath + ]; + if (params.cachePath) args.push('--cache', params.cachePath); + args.push('--out', params.outputPath); + const command = `python ${args.join(' ')}`; + const result = await params.runner('python', args, params.cwd); + if (result.status === 0) { + const score = { + status: 'completed' as const, + mode: 'official_evaluator' as const, + claimBearing: true, + command, + stdout: result.stdout, + stderr: result.stderr, + exitStatus: result.status + }; + writeJson(params.outputPath, score); + return score; + } + + const score = { + status: 'judge_failed' as const, + mode: 'diagnostic_fallback' as const, + claimBearing: false, + command, + stdout: result.stdout, + stderr: result.stderr, + exitStatus: result.status, + fallbackReason: 'official_evaluator_failed' + }; + writeJson(params.outputPath, score); + return score; +} + +export function runFactRecallDiagnostics( + answer: ContextBenchStructuredAnswer, + expected: { requiredFacts?: string[]; requiredEvidenceFiles?: string[] } +): FactRecallDiagnosticResult { + const diagnostics = evaluateSchemaBoundDiagnostics(answer, expected); + const classification = classifyStructuredAnswer(answer, diagnostics); + return { + missingRequiredFacts: diagnostics.missingRequiredFacts ?? [], + missingEvidenceFiles: diagnostics.missingEvidenceFiles ?? 
[], + unsupportedClaim: classification.unsupportedClaim, + falseReady: classification.falseReady, + reasons: classification.reasons + }; +} diff --git a/src/eval/contextbench-trajectory.ts b/src/eval/contextbench-trajectory.ts new file mode 100644 index 0000000..8065bc8 --- /dev/null +++ b/src/eval/contextbench-trajectory.ts @@ -0,0 +1,77 @@ +import type { + ContextBenchPredSpan, + ContextBenchStructuredAnswer, + ContextBenchTaskIdentity, + ContextBenchTrajectoryRecord +} from './contextbench-types.js'; + +export interface NormalizeTrajectoryParams { + task: Pick; + answer: ContextBenchStructuredAnswer; + repoRoot?: string; + rawTraceSteps?: Array<{ files?: string[] }>; +} + +function normalizeSlashes(value: string): string { + return value.replace(/\\/g, '/'); +} + +export function normalizeContextBenchPath(filePath: string, repoRoot?: string): string { + let normalized = normalizeSlashes(filePath).replace(/^\.\//, ''); + if (repoRoot) { + const root = normalizeSlashes(repoRoot).replace(/\/$/, ''); + if (normalized.toLowerCase().startsWith(`${root.toLowerCase()}/`)) { + normalized = normalized.slice(root.length + 1); + } + } + return normalized.replace(/^\/+/, ''); +} + +function spanFromEvidence( + lineRange: ContextBenchStructuredAnswer['evidence'][number]['lineRange'] +): ContextBenchPredSpan { + return { start: lineRange.start, end: lineRange.end, full_file: false }; +} + +export function fullFileSpan(): ContextBenchPredSpan { + return { start: 1, end: null, full_file: true }; +} + +export function normalizeTrajectory( + params: NormalizeTrajectoryParams +): ContextBenchTrajectoryRecord { + const spans: Record = {}; + const files = new Set(); + + for (const evidence of params.answer.evidence) { + const file = normalizeContextBenchPath(evidence.file, params.repoRoot); + files.add(file); + spans[file] = [...(spans[file] ?? 
[]), spanFromEvidence(evidence.lineRange)]; + } + + for (const fileRef of params.answer.filesReferenced) { + const file = normalizeContextBenchPath(fileRef, params.repoRoot); + if (file.length === 0) continue; + files.add(file); + if (!spans[file]) spans[file] = [fullFileSpan()]; + } + + const predFiles = [...files].sort(); + const traceFiles = (params.rawTraceSteps ?? []) + .flatMap((step) => step.files ?? []) + .map((file) => normalizeContextBenchPath(file, params.repoRoot)) + .filter((file) => file.length > 0); + const stepFiles = [...new Set([...traceFiles, ...predFiles])].sort(); + + return { + instance_id: params.task.instance_id, + repo_url: params.task.repo_url, + commit: params.task.base_commit, + traj_data: { + pred_steps: [{ files: stepFiles, spans }], + pred_files: predFiles, + pred_spans: spans + }, + model_patch: '' + }; +} diff --git a/src/eval/contextbench-types.ts b/src/eval/contextbench-types.ts new file mode 100644 index 0000000..515d234 --- /dev/null +++ b/src/eval/contextbench-types.ts @@ -0,0 +1,434 @@ +export type JsonPrimitive = string | number | boolean | null; +export type JsonValue = JsonPrimitive | JsonValue[] | { [key: string]: JsonValue }; + +export type JsonSchemaPrimitiveType = + | 'array' + | 'boolean' + | 'integer' + | 'null' + | 'number' + | 'object' + | 'string'; + +export interface JsonSchemaDefinition { + type?: JsonSchemaPrimitiveType | JsonSchemaPrimitiveType[]; + properties?: Record; + items?: JsonSchemaDefinition; + required?: string[]; + additionalProperties?: boolean | JsonSchemaDefinition; + enum?: JsonValue[]; + minLength?: number; + minimum?: number; +} + +export type ContextBenchTerminalStatus = + | 'completed' + | 'setup_failed' + | 'task_setup_failed' + | 'index_failed' + | 'timeout' + | 'invalid_schema' + | 'no_answer' + | 'wrong_answer' + | 'wrong_evidence' + | 'unsupported_claim' + | 'false_ready' + | 'tool_error' + | 'judge_failed'; + +export const CONTEXTBENCH_TERMINAL_STATUSES: readonly 
ContextBenchTerminalStatus[] = [ + 'completed', + 'setup_failed', + 'task_setup_failed', + 'index_failed', + 'timeout', + 'invalid_schema', + 'no_answer', + 'wrong_answer', + 'wrong_evidence', + 'unsupported_claim', + 'false_ready', + 'tool_error', + 'judge_failed' +]; + +export interface ContextBenchTaskIdentity { + instance_id: string; + original_inst_id: string; + source: string; + language: string; + repo: string; + repo_url: string; + base_commit: string; + problem_statement_ref: string; + problem_statement_hash: string; + gold_context_ref: string; + gold_context_hash: string; + patch_hash: string; + test_patch_hash: string; + f2p_hash: string; + p2p_hash: string; + gold_context_span_count: number; + hash_canonicalization_version: string; + hardness_signal_status: string; + hardness_signal_source: string; + hardness_proxy_used: boolean; + inclusion_rationale: string; + deterministic_rank: string; +} + +export interface ContextBenchTaskManifest { + name: string; + protocolVersion: string; + dataset: string; + datasetConfig: string; + split: string; + claimBearing: boolean; + selectedInPhase: number; + selection_algorithm: string; + selection_seed_or_deterministic_order: string; + selection_timestamp: string; + task_pool_hash: string; + exclusion_log_path: string; + hash_canonicalization_version: string; + evaluator_success_status: string; + hardness_signal_status: string; + hardness_signal_source: string; + hardness_proxy_used: boolean; + forbidden_selection_sources: string[]; + no_lane_outputs_observed_attestation: string; + tasks: ContextBenchTaskIdentity[]; + manifest_hash: string; +} + +export interface ContextBenchProtocol { + protocolVersion: string; + claimAllowed: boolean; + benchmarkTarget: { + officialEvaluatorFirst: boolean; + officialEvaluatorCommand: string; + fallbackScorerPolicy: { + claimBearing: boolean; + }; + }; + structuredAnswerSchema: { + requiredFields: string[]; + confidenceValues: string[]; + evidenceFields: string[]; + 
invalidSchemaStatus: 'invalid_schema'; + }; + budgets: { + setupAndIndexingReportedSeparately: boolean; + defaults: { + maxContextTokens: number; + maxAnswerTokens: number; + timeoutSeconds: number; + }; + }; + failureTaxonomy: ContextBenchTerminalStatus[]; + runManifestSchema: { + appendOnly: boolean; + requiredFields: string[]; + terminalStatuses: ContextBenchTerminalStatus[]; + failedRunsIncludedInAggregates: boolean; + }; +} + +export interface ContextBenchLane { + laneId: string; + displayName: string; + contextTool: string; + allowedTools: string[]; + disallowedTools: string[]; + nativeToolsAllowed: boolean; + setupCostReportedSeparately: boolean; + indexCostReportedSeparately: boolean; + cacheIsolationRequired: boolean; +} + +export interface ContextBenchLaneToolCard { + laneId: string; + displayName: string; + phase38Status: string; + phase39Status?: ContextBenchLaneReadinessStatus; + executableInPhase38: boolean; + contextTools: string[]; + allowedTools: string[]; + disallowedTools: string[]; + setupCommand: string; + indexCommand: string; + queryCommand: string; + versionCommand: string; + cachePath: string; + artifactPaths: { + setup: string; + rawTrace: string; + structuredAnswer: string; + trajectory: string; + score: string; + }; + setupCostReportedSeparately: boolean; + indexCostReportedSeparately: boolean; + claimBearing: boolean; +} + +export type ContextBenchLaneReadinessStatus = + | 'ready_for_phase40' + | 'setup_failed' + | 'index_failed' + | 'tool_error' + | 'invasive_setup_blocked' + | 'pending'; + +export const CONTEXTBENCH_LANE_READINESS_STATUSES: readonly ContextBenchLaneReadinessStatus[] = [ + 'ready_for_phase40', + 'setup_failed', + 'index_failed', + 'tool_error', + 'invasive_setup_blocked', + 'pending' +]; + +export type ContextBenchLaneCommandKind = 'setup' | 'index' | 'query' | 'version'; + +export interface ContextBenchLaneCommandEvidence { + kind: ContextBenchLaneCommandKind; + command: string; + cwd: string; + 
safeToRunAutomatically: boolean; + exitCode: number | null; + status: 'not_required' | 'not_run_documented' | 'succeeded' | 'failed' | 'blocked'; + durationMs: number | null; + stdoutLogPath: string | null; + stderrLogPath: string | null; + outputHash: string | null; +} + +export interface ContextBenchLaneSetupEvidenceRecord { + laneId: string; + readinessStatus: ContextBenchLaneReadinessStatus; + docsUrl: string; + sourceUrl: string; + workingDirectory: string; + platform: { + os: string; + shell: string; + runtime: string; + }; + redactedEnvVars: string[]; + commands: ContextBenchLaneCommandEvidence[]; + setupDurationMs: number | null; + indexDurationMs: number | null; + setupStatus: 'not_required' | 'ready' | 'failed' | 'blocked' | 'pending'; + indexStatus: 'not_required' | 'ready' | 'failed' | 'blocked' | 'pending'; + logReference: string | null; + evidenceHash: string; + nextHumanAction: string; + claimBearing: false; +} + +export interface ContextBenchLaneSetupEvidenceFixture { + name: string; + protocolVersion: string; + phase: 39; + claimBearing: false; + generatedOutputsPolicy: string; + records: ContextBenchLaneSetupEvidenceRecord[]; +} + +export type ContextBenchBaselineSlotStatus = 'reserved' | 'attempted' | 'terminal_missing_evidence'; + +export interface ContextBenchArtifactIndexEntry { + path: string; + hash: string; + bytes: number; +} + +export interface ContextBenchCommandTranscriptEntry { + command: string; + cwd: string; + exitCode: number | null; + stdoutPath: string | null; + stderrPath: string | null; + outputHash: string | null; +} + +export interface ContextBenchUntrackedSnapshotEntry { + path: string; + bytes: number | null; + mtimeMs: number | null; + hash: string | null; + disposition: 'hashed' | 'excluded'; + exclusionReason: string | null; +} + +export interface ContextBenchDirtyWorktreeSnapshot { + branch: string; + head: string; + divergence: { + status: 'unavailable' | 'available'; + reason: string; + }; + gitStatusPath: string; + 
trackedDiffPath: string; + stagedDiffPath: string; + diffStatPath: string; + untracked: ContextBenchUntrackedSnapshotEntry[]; + lockfiles: ContextBenchArtifactIndexEntry[]; + redactedEnvVarNames: string[]; + versions: Record; + fixtureHashes: Record; + commandTranscript: ContextBenchCommandTranscriptEntry[]; + snapshotHash: string; +} + +export interface ContextBenchBaselineSlotReservation { + laneId: string; + taskId: string; + repeatIndex: number; + status: ContextBenchBaselineSlotStatus; + terminalStatus: ContextBenchTerminalStatus | null; + reason: string | null; +} + +export interface ContextBenchBaselineSession { + sessionId: string; + phase: 40; + createdAt: string; + updatedAt: string; + sessionRoot: string; + claimBearing: false; + sealed: boolean; + snapshot: ContextBenchDirtyWorktreeSnapshot; + reservationsPath: string; + runManifestPath: string; + artifactIndex: ContextBenchArtifactIndexEntry[]; + sessionHash: string; +} + +export interface ContextBenchCodebaseContextBaselineArm { + baselineArmId: string; + laneId: 'codebase-context'; + sourceIdentity: string; + allowedToolSurfaces: string[]; + versionOrSourceRef: string; + setupCommand: string; + claimBearing: false; + failurePolicy: 'record_terminal_diagnostic_failure'; +} + +export interface ContextBenchCodebaseContextBaselineArmsFixture { + name: string; + protocolVersion: string; + phase: 40; + claimBearing: false; + denominatorPolicy: string; + arms: ContextBenchCodebaseContextBaselineArm[]; +} + +export interface ContextBenchEvidenceReference { + file: string; + lineRange: { + start: number; + end: number; + }; + reason: string; +} + +export type ContextBenchConfidence = 'low' | 'medium' | 'high'; + +export interface ContextBenchStructuredAnswer { + answer: JsonValue; + confidence: ContextBenchConfidence; + evidence: ContextBenchEvidenceReference[]; + filesReferenced: string[]; + symbolsReferenced: string[]; + unsupportedClaims: string[]; + readyToEdit: boolean; +} + +export interface 
ContextBenchSetupIndexMetadata { + setupCommand: string; + indexCommand: string; + setupDurationMs: number; + indexDurationMs: number; + setupLogPath: string; + indexLogPath: string; + setupStatus: 'not_required' | 'completed' | 'setup_failed'; + indexStatus: 'not_required' | 'completed' | 'index_failed'; + taskMaterializationStatus?: 'not_required' | 'completed' | 'failed'; + taskMaterializationErrors?: string[]; +} + +export type ContextBenchExecutor = 'fake' | 'claude' | 'codex' | 'gemini' | 'opencode'; + +export interface ContextBenchTaskExecutionMetadata { + model: string; + timeoutSeconds: number; + maxContextTokens: number; + maxAnswerTokens: number; + startedAt: string; + completedAt: string; + taskWallTimeMs: number; + executor: ContextBenchExecutor; +} + +export interface ContextBenchScoringMetadata { + officialEvaluatorFirst: boolean; + officialEvaluatorAttempted?: boolean; + officialEvaluatorInvoked?: boolean; + command: string; + claimBearing: boolean; + fallbackReason?: string; + stdoutPath?: string; + stderrPath?: string; +} + +export interface ContextBenchRunManifestRow { + run_id: string; + protocol_version: string; + protocol_hash: string; + task_manifest_hash: string; + lane_id: string; + task_id: string; + repeat_index: number; + status: ContextBenchTerminalStatus; + started_at: string; + completed_at: string; + raw_trace_path: string; + structured_answer_path: string; + trajectory_path: string; + score_path: string; + setup_index_path: string; + prompt_path: string; + lane_tool_card_path: string; + setupIndex: ContextBenchSetupIndexMetadata; + taskExecution: ContextBenchTaskExecutionMetadata; + scoring: ContextBenchScoringMetadata; + hashes: Record; +} + +export interface ContextBenchPredSpan { + start: number; + end: number | null; + full_file: boolean; +} + +export interface ContextBenchTrajectoryRecord { + instance_id: string; + repo_url: string; + commit: string; + traj_data: { + pred_steps: Array<{ + files: string[]; + spans: Record; + }>; 
+ pred_files: string[]; + pred_spans: Record; + }; + model_patch: string; +} + +export function isContextBenchTerminalStatus(value: string): value is ContextBenchTerminalStatus { + return CONTEXTBENCH_TERMINAL_STATUSES.includes(value as ContextBenchTerminalStatus); +} diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts new file mode 100644 index 0000000..41436fd --- /dev/null +++ b/tests/contextbench-baseline-runner.test.ts @@ -0,0 +1,1095 @@ +import { execFileSync, spawnSync } from 'node:child_process'; +import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { describe, expect, it, vi } from 'vitest'; +import manifestFixture from './fixtures/contextbench-task-manifest.json'; + +type ManifestRow = { + run_id: string; + lane_id: string; + task_id: string; + repeat_index: number; + status: string; + raw_trace_path: string; + setupIndex: { + setupStatus: string; + indexStatus: string; + setupDurationMs?: number; + indexDurationMs?: number; + setupLogPath?: string; + indexLogPath?: string; + taskWallTimeMs?: number; + }; + taskExecution: { executor: string; taskWallTimeMs: number }; + hashes: { runnerSourceHash?: string }; + scoring: { + claimBearing: boolean; + fallbackReason?: string; + officialEvaluatorFirst?: boolean; + officialEvaluatorAttempted?: boolean; + officialEvaluatorInvoked?: boolean; + stdoutPath?: string; + stderrPath?: string; + }; +}; + +type TaskManifest = { tasks: Array<{ instance_id: string }> }; + +const manifest = manifestFixture as TaskManifest; +vi.setConfig({ testTimeout: 30000 }); + +function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { + return path.join( + mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`)), + 'benchmark-runs', + 'contextbench', + phase, + 'runner-smoke' + ); +} + +function readRows(sessionRoot: string): ManifestRow[] { + return 
readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as ManifestRow); +} + +function createCleanGitRepo(root: string): string { + const repoPath = path.join(root, 'repo'); + mkdirSync(repoPath, { recursive: true }); + writeFileSync(path.join(repoPath, 'README.md'), '# ContextBench fixture\n', 'utf8'); + execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' }); + execFileSync('git', ['add', 'README.md'], { cwd: repoPath, encoding: 'utf8' }); + execFileSync( + 'git', + ['-c', 'user.name=ContextBench Test', '-c', 'user.email=contextbench@example.invalid', 'commit', '-m', 'fixture'], + { cwd: repoPath, encoding: 'utf8' } + ); + return repoPath; +} + +function writePayloadFile(root: string, taskId: string, repoCheckoutPath: string): string { + const payloadPath = path.join(root, 'TASK-PAYLOADS.json'); + writeFileSync( + payloadPath, + JSON.stringify( + { + tasksById: { + [taskId]: { + problem_statement: 'Use the fixture repository to answer with cited evidence.', + repo_checkout_path: repoCheckoutPath + } + } + }, + null, + 2 + ), + 'utf8' + ); + return payloadPath; +} + +function writeStubClaude(root: string): string { + const stubPath = path.join(root, 'stub-claude.cjs'); + writeFileSync( + stubPath, + `const answer = { type: 'result', structured_output: { answer: 'fixture answer', confidence: 'medium', evidence: [{ file: 'README.md', lineRange: { start: 1, end: 1 }, reason: 'fixture evidence' }], filesReferenced: ['README.md'], symbolsReferenced: [], unsupportedClaims: [], readyToEdit: false } }; process.stdout.write(JSON.stringify(answer));`, + 'utf8' + ); + return stubPath; +} + +function writeStubEvaluator(root: string, exitCode: 0 | 1, output = JSON.stringify({ score: 1 })): string { + const stubPath = path.join(root, `stub-evaluator-${exitCode}.cjs`); + const serializedOutput = JSON.stringify(output); + writeFileSync( + stubPath, + `const fs = require('node:fs'); const predIndex = 
process.argv.indexOf('--pred'); if (predIndex < 0 || !fs.existsSync(process.argv[predIndex + 1])) { process.stderr.write('missing prediction artifact'); process.exit(2); } const outIndex = process.argv.indexOf('--out'); if (outIndex >= 0 && process.argv[outIndex + 1] && ${exitCode} === 0) fs.writeFileSync(process.argv[outIndex + 1], ${serializedOutput} + '\\n'); process.stdout.write('official evaluator stub'); process.exit(${exitCode});`, + 'utf8' + ); + return stubPath; +} + +describe('ContextBench Phase 40 baseline runner', () => { + it('reserves every required slot and writes terminal missing-evidence rows for blocked lanes', () => { + const sessionRoot = tempSessionRoot(); + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + const reservations = JSON.parse( + readFileSync(path.join(sessionRoot, 'slot-reservations.json'), 'utf8') + ) as { reservations: Array<{ laneId: string; status: string; terminalStatus: string | null }> }; + expect(reservations.reservations).toHaveLength(20 * 6 * 3); + const blocked = reservations.reservations.filter((slot) => slot.status === 'terminal_missing_evidence'); + expect(blocked).toHaveLength(20 * 2 * 3); + expect([...new Set(blocked.map((slot) => slot.laneId))].sort()).toEqual([ + 'codebase-memory-mcp', + 'grepai' + ]); + expect(blocked.every((slot) => slot.terminalStatus === 'setup_failed')).toBe(true); + + const rows = readRows(sessionRoot); + expect(rows.filter((row) => row.status === 'setup_failed')).toHaveLength(blocked.length); + expect(rows.every((row) => row.scoring.claimBearing === false)).toBe(true); + expect(rows.every((row) => row.scoring.officialEvaluatorFirst === false)).toBe(true); + expect(rows.every((row) => row.scoring.officialEvaluatorAttempted === false)).toBe(true); + expect(rows.every((row) => row.scoring.officialEvaluatorInvoked === false)).toBe(true); + expect(rows.every((row) => !('taskWallTimeMs' in 
row.setupIndex))).toBe(true); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + expect(validateOutput).toContain('baseline session validation passed'); + const rows = readRows(sessionRoot); + const attempt = rows.find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt).toBeTruthy(); + expect(attempt).toMatchObject({ status: 'completed', lane_id: 'raw-native' }); + expect(attempt?.taskExecution.executor).toBe('fake'); + expect(attempt?.setupIndex.setupStatus).toBe('not_required'); + expect(attempt?.scoring).toMatchObject({ + claimBearing: false, + officialEvaluatorFirst: false, + officialEvaluatorAttempted: false, + officialEvaluatorInvoked: false + }); + const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? 
'', 'utf8')) as { + runnerHash?: string; + laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] }; + scriptedAgentDecisions: boolean; + antiScriptingBoundary: string[]; + }; + expect(rawTrace.runnerHash).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(attempt?.hashes.runnerSourceHash).toBe(rawTrace.runnerHash); + expect(rawTrace.laneIsolation).toMatchObject({ + proven: false, + proofSource: 'not_captured', + observedTools: [] + }); + expect(rawTrace.scriptedAgentDecisions).toBe(false); + expect(rawTrace.antiScriptingBoundary).toEqual(expect.arrayContaining(['file_selection'])); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('records official evaluator invocation metadata for overridden live executor attempts', () => { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createCleanGitRepo(tempRoot); + const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); + const stubClaude = writeStubClaude(tempRoot); + const stubEvaluator = writeStubEvaluator(tempRoot, 0); + const env = { + ...process.env, + CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), + CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), + CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ + 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + }) + }; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8', + env + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--task-payloads', + payloadPath, 
+ '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8', env } + ); + const rows = readRows(sessionRoot); + const attempt = rows.find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('completed'); + expect(attempt?.taskExecution.executor).toBe('claude'); + expect(attempt?.scoring).toMatchObject({ + claimBearing: false, + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true + }); + expect(attempt?.scoring.command).toContain('--out'); + const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as { + mode: string; + exitCode?: number; + outputHash?: string; + stdoutPath?: string; + stderrPath?: string; + }; + expect(score.mode).toBe('official_evaluator'); + expect(score.exitCode).toBe(0); + expect(score.outputHash).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(score.stdoutPath).toBeTruthy(); + expect(score.stderrPath).toBeTruthy(); + const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? 
'', 'utf8')) as { + laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] }; + }; + expect(rawTrace.laneIsolation).toMatchObject({ + proven: true, + proofSource: 'stubbed_test_proxy', + observedTools: ['native-read'] + }); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + + it('rejects malformed official evaluator output as judge_failed diagnostic evidence', () => { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-malformed-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createCleanGitRepo(tempRoot); + const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); + const stubClaude = writeStubClaude(tempRoot); + const stubEvaluator = writeStubEvaluator(tempRoot, 0, 'not json'); + const env = { + ...process.env, + CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), + CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), + CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ + 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + }) + }; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8', + env + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--task-payloads', + payloadPath, + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8', env } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('judge_failed'); + expect(attempt?.scoring).toMatchObject({ + claimBearing: false, + fallbackReason: 
'official_evaluator_malformed_jsonl', + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true + }); + const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as { + mode: string; + fallbackReason: string; + outputHash?: string; + }; + expect(score.mode).toBe('diagnostic_fallback'); + expect(score.fallbackReason).toBe('official_evaluator_malformed_jsonl'); + expect(score.outputHash).toMatch(/^sha256:[a-f0-9]{64}$/); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + + it('rejects non-object or wrong-task official evaluator JSONL as diagnostic evidence', () => { + const cases = [ + { output: '1', reason: 'official_evaluator_non_object_jsonl' }, + { + output: JSON.stringify({ instance_id: 'wrong-task-id', score: 1 }), + reason: 'official_evaluator_task_mismatch' + } + ]; + + for (const testCase of cases) { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-envelope-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createCleanGitRepo(tempRoot); + const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); + const stubClaude = writeStubClaude(tempRoot); + const stubEvaluator = writeStubEvaluator(tempRoot, 0, testCase.output); + const env = { + ...process.env, + CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), + CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), + CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ + 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + }) + }; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8', + env + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', 
+ 'claude', + '--task-payloads', + payloadPath, + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8', env } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('judge_failed'); + expect(attempt?.scoring.fallbackReason).toBe(testCase.reason); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + } + }); + + it('records official evaluator failure as judge_failed without making claims', () => { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-fail-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createCleanGitRepo(tempRoot); + const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); + const stubClaude = writeStubClaude(tempRoot); + const stubEvaluator = writeStubEvaluator(tempRoot, 1); + const env = { + ...process.env, + CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), + CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]) + }; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8', + env + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--task-payloads', + payloadPath, + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8', env } + ); + const rows = readRows(sessionRoot); + const attempt = rows.find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('judge_failed'); + expect(attempt?.scoring).toMatchObject({ + claimBearing: false, + 
fallbackReason: 'official_evaluator_missing_output', + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true + }); + const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as { + mode: string; + claimBearing: boolean; + exitCode: number; + exitStatus: number; + }; + expect(score.mode).toBe('diagnostic_fallback'); + expect(score.claimBearing).toBe(false); + expect(score.exitCode).toBe(1); + expect(score.exitStatus).toBe(1); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + + it('chunks all-ready-lane execution with max-attempts so live runs are resumable', () => { + const sessionRoot = tempSessionRoot('phase41'); + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--all-ready-lanes', + '--repeats', + '3', + '--max-attempts', + '2' + ], + { encoding: 'utf8' } + ); + const rows = readRows(sessionRoot); + const attemptedRows = rows.filter( + (row) => row.status === 'completed' && row.taskExecution.executor === 'fake' + ); + expect(attemptedRows).toHaveLength(2); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--all-ready-lanes', + '--repeats', + '3', + '--max-attempts', + '2' + ], + { encoding: 'utf8' } + ); + const resumedRows = readRows(sessionRoot).filter( + (row) => row.status === 'completed' && row.taskExecution.executor === 'fake' + ); + expect(resumedRows).toHaveLength(4); + const session = JSON.parse( + readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8') + ) as { phase: number }; + expect(session.phase).toBe(41); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + 
force: true + }); + } + }); + + it('measures raw-native setup/index as a session artifact and reuses it in attempt rows', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--setup-index-measure', + '--session', + sessionRoot, + '--lane', + 'raw-native' + ], + { encoding: 'utf8' } + ); + const measurement = JSON.parse( + readFileSync(path.join(sessionRoot, 'setup-index', 'raw-native', 'setup-index.json'), 'utf8') + ) as { claimBearing: boolean; setupStatus: string; indexStatus: string; setupLogPath: string }; + expect(measurement).toMatchObject({ + claimBearing: false, + setupStatus: 'not_required', + indexStatus: 'not_required' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('completed'); + expect(attempt?.setupIndex).toMatchObject({ + setupStatus: 'not_required', + indexStatus: 'not_required', + setupDurationMs: 0, + indexDurationMs: 0, + setupLogPath: measurement.setupLogPath + }); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('fails a ready non-raw lane closed when setup/index measurement is missing', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + 
encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--lane', + 'codebase-context', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'codebase-context' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('setup_failed'); + expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement'); + expect(attempt?.setupIndex.setupStatus).toBe('setup_failed'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('imports setup/index evidence for ready non-raw lanes before task execution', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + const logsDir = path.join(sessionRoot, 'manual-setup-index-logs', 'codebase-context'); + mkdirSync(logsDir, { recursive: true }); + const setupLogPath = path.join(logsDir, 'setup.stdout.log'); + const indexLogPath = path.join(logsDir, 'index.stdout.log'); + writeFileSync(setupLogPath, 'setup completed\n', 'utf8'); + writeFileSync(indexLogPath, 'index completed\n', 'utf8'); + const importPath = path.join(path.dirname(sessionRoot), 'codebase-context-setup-index-import.json'); + writeFileSync( + importPath, + JSON.stringify( + { + laneId: 'codebase-context', + claimBearing: false, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 12, + indexDurationMs: 34, + setupLogPath, + indexLogPath + }, + null, + 2 + ), + 'utf8' + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--setup-index-import', + '--session', + sessionRoot, + '--lane', + 
'codebase-context', + '--input', + importPath + ], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--lane', + 'codebase-context', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'codebase-context' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('completed'); + expect(attempt?.setupIndex).toMatchObject({ + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 12, + indexDurationMs: 34, + setupLogPath, + indexLogPath + }); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('rejects forged or failed setup/index imports before non-raw task execution', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + const logsDir = path.join(sessionRoot, 'manual-setup-index-logs', 'codebase-context'); + mkdirSync(logsDir, { recursive: true }); + const setupLogPath = path.join(logsDir, 'setup.stdout.log'); + const indexLogPath = path.join(logsDir, 'index.stdout.log'); + writeFileSync(setupLogPath, 'setup failed\n', 'utf8'); + writeFileSync(indexLogPath, 'index skipped\n', 'utf8'); + const wrongLaneImport = path.join(path.dirname(sessionRoot), 'wrong-lane-import.json'); + writeFileSync( + wrongLaneImport, + JSON.stringify({ + laneId: 'raw-native', + claimBearing: false, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 1, + indexDurationMs: 1, + setupLogPath, + indexLogPath + }), + 'utf8' + ); + const wrongLane = spawnSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + 
'--setup-index-import', + '--session', + sessionRoot, + '--lane', + 'codebase-context', + '--input', + wrongLaneImport + ], + { encoding: 'utf8' } + ); + expect(wrongLane.status).toBe(1); + expect(wrongLane.stderr).toContain('laneId mismatch'); + + const siblingDir = `${sessionRoot}-evil`; + mkdirSync(siblingDir, { recursive: true }); + const siblingSetupLog = path.join(siblingDir, 'setup.stdout.log'); + const siblingIndexLog = path.join(siblingDir, 'index.stdout.log'); + writeFileSync(siblingSetupLog, 'setup forged\n', 'utf8'); + writeFileSync(siblingIndexLog, 'index forged\n', 'utf8'); + const outsideImport = path.join(path.dirname(sessionRoot), 'outside-import.json'); + writeFileSync( + outsideImport, + JSON.stringify({ + laneId: 'codebase-context', + claimBearing: false, + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 1, + indexDurationMs: 1, + setupLogPath: siblingSetupLog, + indexLogPath: siblingIndexLog + }), + 'utf8' + ); + const outside = spawnSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--setup-index-import', + '--session', + sessionRoot, + '--lane', + 'codebase-context', + '--input', + outsideImport + ], + { encoding: 'utf8' } + ); + expect(outside.status).toBe(1); + expect(outside.stderr).toContain('inside session root'); + + const failedImport = path.join(path.dirname(sessionRoot), 'failed-import.json'); + writeFileSync( + failedImport, + JSON.stringify({ + laneId: 'codebase-context', + claimBearing: false, + setupStatus: 'setup_failed', + indexStatus: 'not_required', + setupDurationMs: 0, + indexDurationMs: 0, + setupLogPath, + indexLogPath + }), + 'utf8' + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--setup-index-import', + '--session', + sessionRoot, + '--lane', + 'codebase-context', + '--input', + failedImport + ], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 
'fake', + '--lane', + 'codebase-context', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + const attempt = readRows(sessionRoot).find( + (row) => row.lane_id === 'codebase-context' && row.task_id === taskId && row.repeat_index === 1 + ); + expect(attempt?.status).toBe('setup_failed'); + expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('validates diagnostic codebase-context baseline arms as non-claim-bearing side evidence', () => { + const output = execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-validate-arms', + 'tests/fixtures/contextbench-codebase-context-baseline-arms.json' + ], + { encoding: 'utf8' } + ); + expect(output).toContain('baseline arm validation passed'); + }); + + it('can record diagnostic codebase-context arm smoke rows separate from required reservations', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run-codebase-context-arms', + '--session', + sessionRoot, + '--executor', + 'fake', + '--task-id', + taskId, + '--repeats', + '1' + ], + { encoding: 'utf8' } + ); + const rows = readRows(sessionRoot); + const diagnosticRows = rows.filter((row) => row.run_id.startsWith('codebase-context-current')); + expect(diagnosticRows.length).toBeGreaterThanOrEqual(3); + expect(diagnosticRows.every((row) => row.lane_id === 'codebase-context')).toBe(true); + expect(diagnosticRows.every((row) => row.scoring.claimBearing === false)).toBe(true); + const reservations = JSON.parse( + readFileSync(path.join(sessionRoot, 'slot-reservations.json'), 'utf8') + ) 
as { reservations: unknown[] }; + expect(reservations.reservations).toHaveLength(20 * 6 * 3); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + /* --phase42-verify on a diagnostic session must exit 1, still write its report, and leave BASELINE-SESSION.json unchanged. */ it('runs Phase 42 verification as read-only artifact-derived evidence and fails diagnostic sessions closed', () => { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-phase42-verify-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const reportPath = path.join(tempRoot, 'phase42-report.json'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1' + ], + { encoding: 'utf8' } + ); + /* Snapshot taken before verification so the read-only guarantee can be checked afterwards. */ const sessionBefore = readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8'); + const result = spawnSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--phase42-verify', + '--session', + sessionRoot, + '--out', + reportPath, + '--quiet' + ], + { encoding: 'utf8' } + ); + expect(result.status).toBe(1); + expect(result.stdout).toContain('phase42 verification failed'); + expect(result.stderr).toContain('phase42 verification failed'); + const report = JSON.parse(readFileSync(reportPath, 'utf8')) as { + claimPass: boolean; + diagnosticOnly: boolean; + protocolClaimAllowed: boolean; + expectedTotalRows: number; + requiredRows: number; + supplementalRows: number; + /* numeric count per failure key */ failureCounts: Record<string, number>; + blockedClaims: string[]; + }; + expect(report).toMatchObject({ + claimPass: false, + diagnosticOnly: true, + protocolClaimAllowed: false, + expectedTotalRows: 20 * 6 * 3, + requiredRows: 20 * 2 * 3 + 1, + supplementalRows: 0 + }); +
expect(report.failureCounts.protocol_claims_disabled).toBe(1); + expect(report.failureCounts.denominator_count_mismatch).toBe(1); + expect(report.failureCounts.official_evaluator_missing).toBeGreaterThan(0); + expect(report.failureCounts.missing_required_run).toBeGreaterThan(0); + expect(report.blockedClaims).toContain('Phase 42 passed'); + expect(readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8')).toBe(sessionBefore); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + + /* Rows written by --baseline-run-codebase-context-arms must be counted as supplemental, not as unexpected run rows. */ it('classifies diagnostic baseline arms as supplemental during Phase 42 verification', () => { + const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-phase42-arms-')); + const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke'); + const reportPath = path.join(tempRoot, 'phase42-report.json'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run-codebase-context-arms', + '--session', + sessionRoot, + '--executor', + 'fake', + '--task-id', + taskId, + '--repeats', + '1' + ], + { encoding: 'utf8' } + ); + const result = spawnSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--phase42-verify', + '--session', + sessionRoot, + '--out', + reportPath + ], + { encoding: 'utf8' } + ); + expect(result.status).toBe(1); + const report = JSON.parse(readFileSync(reportPath, 'utf8')) as { + requiredRows: number; + supplementalRows: number; + /* numeric count per failure key; a key may be absent, hence the ?? 0 below */ failureCounts: Record<string, number>; + }; + expect(report.requiredRows).toBe(20 * 2 * 3); + expect(report.supplementalRows).toBeGreaterThanOrEqual(3); + expect(report.failureCounts.unexpected_run_row ??
0).toBe(0); + expect(report.failureCounts.denominator_count_mismatch).toBe(1); + } finally { + rmSync(tempRoot, { recursive: true, force: true }); + } + }); + + it('blocks baseline seal when terminal row completeness lacks Phase 42 claim evidence', () => { + const sessionRoot = tempSessionRoot('phase41'); + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--all-ready-lanes', + '--repeats', + '3' + ], + { encoding: 'utf8' } + ); + expect(readRows(sessionRoot)).toHaveLength(20 * 6 * 3); + + const result = spawnSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-seal', '--session', sessionRoot], + { encoding: 'utf8' } + ); + + expect(result.status).toBe(1); + expect(result.stdout).toContain('baseline session validation passed'); + expect(result.stdout).toContain('phase42 verification failed'); + expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); +}); diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts new file mode 100644 index 0000000..a1b808d --- /dev/null +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -0,0 +1,944 @@ +import { execFileSync } from 'node:child_process'; +import { chmodSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { describe, expect, it, vi } from 'vitest'; +import { + CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA, + CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS, + parseStructuredAnswer +} from '../src/eval/contextbench-answer.js'; +import manifestFixture from 
'./fixtures/contextbench-task-manifest.json'; + +/** Shape these tests assume for one parsed line of run-manifest.jsonl. */ type ManifestRow = { + run_id: string; + status: string; + raw_trace_path: string; + structured_answer_path: string; + trajectory_path: string; + scoring: { claimBearing: boolean }; +}; + +/** Shape assumed for the task manifest fixture. */ type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }> }; + +const manifest = manifestFixture as TaskManifest; +/* 30 s per test: the tests below spawn the runner script synchronously. */ vi.setConfig({ testTimeout: 30000 }); + +/** Creates a fresh temp directory and returns the path benchmark-runs/contextbench/{phase}/schema-gate-smoke inside it. Only the temp directory itself is created on disk. @param phase Phase directory segment, defaults to 'phase40'. @returns The nested session root path. */ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { + return path.join( + mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)), + 'benchmark-runs', + 'contextbench', + phase, + 'schema-gate-smoke' + ); +} + +/** Reads run-manifest.jsonl under sessionRoot and parses every line as JSON. Throws when the file is missing or a line is not valid JSON, which includes an empty file. @param sessionRoot Session directory holding run-manifest.jsonl. @returns One ManifestRow per line. */ function readRows(sessionRoot: string): ManifestRow[] { + return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as ManifestRow); +} + +/** Writes a stub Claude executable into a new temp directory (a .cjs script plus claude.cmd and sh wrappers) and returns that directory together with an env that puts it first on PATH and points CONTEXTBENCH_CLAUDE_COMMAND at the script. The stub prints `stdout` once its stdin closes and, when capture paths are given, records its cwd and stdin there. @param stdout Text the stub writes to its stdout. @param capture Optional files that receive the stub's cwd and stdin. @returns The stub directory (the caller removes it) and the env to spawn the runner with. */ function createClaudeStub( + stdout: string, + capture?: { cwdPath?: string; stdinPath?: string } +): { stubDir: string; env: NodeJS.ProcessEnv } { + const stubDir = mkdtempSync(path.join(tmpdir(), 'contextbench-claude-stub-')); + const stubScript = path.join(stubDir, 'claude-stub.cjs'); + writeFileSync( + stubScript, + [ + "const fs = require('node:fs');", + "if (process.env.CLAUDE_STUB_CWD_PATH) fs.writeFileSync(process.env.CLAUDE_STUB_CWD_PATH, process.cwd(), 'utf8');", + "let stdin = '';", + "process.stdin.setEncoding('utf8');", + "process.stdin.on('data', (chunk) => { stdin += chunk; });", + "process.stdin.on('end', () => {", + " if (process.env.CLAUDE_STUB_STDIN_PATH) fs.writeFileSync(process.env.CLAUDE_STUB_STDIN_PATH, stdin, 'utf8');", + " process.stdout.write(process.env.CLAUDE_STUB_STDOUT || '');", + '});', + 'process.stdin.resume();' + ].join('\n'), + 'utf8' + ); + writeFileSync( + path.join(stubDir, 'claude.cmd'), + '@echo off\r\nnode "%~dp0claude-stub.cjs"\r\n', + 'utf8' + ); + const shellStub = path.join(stubDir, 'claude'); + writeFileSync(shellStub, '#!/bin/sh\nnode "$(dirname
"$0")/claude-stub.cjs"\n', 'utf8'); + chmodSync(shellStub, 0o755); + return { + stubDir, + env: { + ...process.env, + /* Both spellings are set so the stub directory wins the lookup whichever key the platform uses. */ PATH: `${stubDir}${path.delimiter}${process.env.PATH ?? ''}`, + Path: `${stubDir}${path.delimiter}${process.env.Path ?? process.env.PATH ?? ''}`, + CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubScript]), + CLAUDE_STUB_STDOUT: stdout, + CLAUDE_STUB_CWD_PATH: capture?.cwdPath, + CLAUDE_STUB_STDIN_PATH: capture?.stdinPath + } + }; +} + +/** Writes a task-payloads JSON file holding a single task, { instance_id: taskId, ...payload }, pretty-printed with a trailing newline. @param filePath Destination file. @param taskId Value stored as instance_id. @param payload Extra task fields spread after instance_id. */ function writeTaskPayloads( + filePath: string, + taskId: string, + payload: Record<string, unknown> +): void { + writeFileSync( + filePath, + `${JSON.stringify({ tasks: [{ instance_id: taskId, ...payload }] }, null, 2)}\n`, + 'utf8' + ); +} + +/** Creates a temp directory, runs `git init` in it and records one empty commit using an inline test identity. @returns The repository path; the caller removes it. */ function createGitCheckout(): string { + const repoPath = mkdtempSync(path.join(tmpdir(), 'contextbench-task-repo-')); + execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' }); + execFileSync( + 'git', + [ + '-c', + 'user.name=ContextBench Test', + '-c', + 'user.email=contextbench@example.invalid', + 'commit', + '--allow-empty', + '-m', + 'init' + ], + { cwd: repoPath, encoding: 'utf8' } + ); + return repoPath; +} + +/** Returns the fixed structured answer that the adapter stubs emit. @returns A fresh object on every call. */ function structuredStubAnswer(): Record<string, unknown> { + return { + answer: { adapterSmoke: true }, + confidence: 'medium', + evidence: [ + { file: 'README.md', lineRange: { start: 1, end: 1 }, reason: 'stubbed adapter evidence' } + ], + filesReferenced: ['README.md'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: false + }; +} + +/** Writes a stub script for the given executor into a new temp directory and returns the directory plus an env whose CONTEXTBENCH_{EXECUTOR}_COMMAND runs it. The stub emits structuredStubAnswer() in an output shape chosen per executor and, when capture paths are given, records its cwd and argv. @param executor Which adapter the stub imitates. @param capture Optional files that receive the stub's cwd and argv. @returns The stub directory (the caller removes it) and the env to spawn the runner with. */ function createAdapterStub( + executor: 'codex' | 'gemini' | 'opencode', + capture?: { cwdPath?: string; argsPath?: string } +): { stubDir: string; env: NodeJS.ProcessEnv } { + const stubDir = mkdtempSync(path.join(tmpdir(), `contextbench-${executor}-stub-`)); + const stubScript = path.join(stubDir, `${executor}-stub.cjs`); + writeFileSync( + stubScript, + [ + "const fs = require('node:fs');", + 'const executor = process.env.ADAPTER_STUB_EXECUTOR;', + 'const args = process.argv.slice(2);', + "if
(process.env.ADAPTER_STUB_CWD_PATH) fs.writeFileSync(process.env.ADAPTER_STUB_CWD_PATH, process.cwd(), 'utf8');", + "if (process.env.ADAPTER_STUB_ARGS_PATH) fs.writeFileSync(process.env.ADAPTER_STUB_ARGS_PATH, JSON.stringify(args), 'utf8');", + `const answer = ${JSON.stringify(JSON.stringify(structuredStubAnswer()))};`, + "if (executor === 'codex') {", + " const outputIndex = args.indexOf('--output-last-message');", + " if (outputIndex >= 0) fs.writeFileSync(args[outputIndex + 1], answer, 'utf8');", + " process.stdout.write(JSON.stringify({ type: 'done' }) + '\\n');", + "} else if (executor === 'gemini') {", + ' process.stdout.write(JSON.stringify({ response: answer }));', + "} else if (executor === 'opencode') {", + " process.stdout.write(JSON.stringify({ type: 'text', part: { type: 'text', text: answer } }) + '\\n');", + '} else {', + " process.stderr.write('unknown adapter stub executor');", + ' process.exitCode = 2;', + '}' + ].join('\n'), + 'utf8' + ); + return { + stubDir, + env: { + ...process.env, + [`CONTEXTBENCH_${executor.toUpperCase()}_COMMAND`]: JSON.stringify([ + process.execPath, + stubScript + ]), + ADAPTER_STUB_EXECUTOR: executor, + ADAPTER_STUB_CWD_PATH: capture?.cwdPath, + ADAPTER_STUB_ARGS_PATH: capture?.argsPath + } + }; +} + +describe('ContextBench Phase 40 schema gate', () => { + it('exports the structured answer schema used to constrain live Claude output', () => { + expect(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA).toMatchObject({ + type: 'object', + additionalProperties: false, + required: [...CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS] + }); + expect(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA.properties?.confidence).toMatchObject({ + type: 'string', + enum: ['low', 'medium', 'high'] + }); + expect(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA.properties?.evidence).toMatchObject({ + type: 'array' + }); + const evidenceSchema = CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA.properties?.evidence; + const evidenceItems = 
Array.isArray(evidenceSchema?.items) + ? evidenceSchema.items[0] + : evidenceSchema?.items; + expect(evidenceItems).toMatchObject({ additionalProperties: false }); + expect(evidenceItems?.properties?.lineRange).toMatchObject({ additionalProperties: false }); + }); + + /* --print-claude-args prints the Claude argv as a JSON array; the value following --json-schema must carry the shared required-field list. */ it('passes the shared schema through Claude CLI arguments without running a live call', () => { + const output = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--print-claude-args', '--model', 'haiku'], + { encoding: 'utf8' } + ); + const args = JSON.parse(output) as string[]; + const schemaIndex = args.indexOf('--json-schema'); + expect(args).toEqual( + expect.arrayContaining([ + '--print', + '--output-format', + 'json', + '--model', + 'haiku', + '--json-schema' + ]) + ); + expect(schemaIndex).toBeGreaterThan(-1); + const schema = JSON.parse(args[schemaIndex + 1] ?? '{}') as { + required?: string[]; + properties?: Record<string, unknown>; + }; + expect(schema.required).toEqual([...CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS]); + expect(schema.properties).toHaveProperty('readyToEdit'); + }); + + /* A fake-executor answer in invalid_schema mode must end as a non-claim-bearing invalid_schema row. */ it('keeps invalid structured output terminal instead of repairing prose into success', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { + encoding: 'utf8' + } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'fake', + '--fake-answer-mode', + 'invalid_schema', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1', + '--max-attempts', + '1' + ], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { + encoding: 'utf8' + } + ); + + const row = readRows(sessionRoot).find( + (candidate) => + candidate.status === 'invalid_schema' &&
candidate.scoring.claimBearing === false + ); + expect(row).toBeTruthy(); + const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as { + structuredAnswerParseErrors: string[]; + }; + expect(rawTrace.structuredAnswerParseErrors).toContain('invalid_json'); + const fallbackAnswer = JSON.parse( + readFileSync(row?.structured_answer_path ?? '', 'utf8') + ) as { + unsupportedClaims: string[]; + }; + expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('accepts Claude JSON envelope structured_output without a paid live call', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createGitCheckout(); + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'task-payloads.json'); + const cwdCapturePath = path.join(payloadDir, 'claude-cwd.txt'); + const stdinCapturePath = path.join(payloadDir, 'claude-stdin.txt'); + const answer = { + answer: 'ok', + confidence: 'medium', + evidence: [{ file: 'src/a.ts', lineRange: { start: 1, end: 1 }, reason: 'stubbed evidence' }], + filesReferenced: ['src/a.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: false + }; + writeTaskPayloads(payloadPath, taskId, { + problem_statement: 'Fix the failing ContextBench task without using hidden gold context.', + repo_checkout_path: repoPath + }); + const { stubDir, env } = createClaudeStub( + JSON.stringify({ + type: 'result', + subtype: 'success', + is_error: false, + structured_output: answer + }), + { cwdPath: cwdCapturePath, stdinPath: stdinCapturePath } + ); + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { + encoding: 'utf8' + } + ); + execFileSync( + 'node', + 
[ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1', + '--task-payloads', + payloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8', env } + ); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { + encoding: 'utf8' + } + ); + const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude')); + expect(row).toMatchObject({ status: 'completed' }); + /* The cast lists every raw-trace field asserted below. */ const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as { + structuredAnswerParseErrors: string[]; + claudeArgs: string[]; workingDirectory: string; taskContext: Record<string, unknown>; + }; + expect(rawTrace.structuredAnswerParseErrors).toEqual([]); + expect(rawTrace.claudeArgs).toEqual(expect.arrayContaining(['--output-format', 'json'])); + expect(rawTrace.workingDirectory).toBe(repoPath); + expect(rawTrace.taskContext).toMatchObject({ + materialized: true, + repoCheckoutPath: repoPath, + verificationStrict: false + }); + expect(readFileSync(cwdCapturePath, 'utf8')).toBe(repoPath); + /* The stub captured its stdin, so the prompt text sent to the executor can be inspected. */ const stdin = readFileSync(stdinCapturePath, 'utf8'); + expect(stdin).toContain('Problem statement:'); + expect(stdin).toContain('Fix the failing ContextBench task'); + expect(stdin).not.toContain('dataset_field:problem_statement'); + const structuredAnswer = JSON.parse( + readFileSync(row?.structured_answer_path ?? '', 'utf8') + ) as { + answer: string; + }; + expect(structuredAnswer.answer).toBe('ok'); + const trajectory = JSON.parse(readFileSync(row?.trajectory_path ??
'', 'utf8')) as { + traj_data: { pred_files: string[] }; + }; + expect(trajectory.traj_data.pred_files).toContain('src/a.ts'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(repoPath, { recursive: true, force: true }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(stubDir, { recursive: true, force: true }); + } + }); + + it('rejects Claude structured_output with fields outside the frozen schema', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createGitCheckout(); + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'task-payloads.json'); + const answer = { ...structuredStubAnswer(), unexpectedRoot: true }; + writeTaskPayloads(payloadPath, taskId, { + problem_statement: 'Reject schema drift from the executor output.', + repo_checkout_path: repoPath + }); + const { stubDir, env } = createClaudeStub( + JSON.stringify({ + type: 'result', + subtype: 'success', + is_error: false, + structured_output: answer + }) + ); + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1', + '--task-payloads', + payloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8', env } + ); + const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude')); + expect(row).toMatchObject({ status: 'invalid_schema' }); + const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? 
'', 'utf8')) as { + structuredAnswerParseErrors: string[]; + }; + expect(rawTrace.structuredAnswerParseErrors).toEqual( + expect.arrayContaining(['additional_root_field_unexpectedRoot']) + ); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(repoPath, { recursive: true, force: true }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(stubDir, { recursive: true, force: true }); + } + }); + + it('blocks a real executor slot before spawn when task payloads are missing', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1', + '--max-attempts', + '1' + ], + { encoding: 'utf8' } + ); + const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude')); + expect(row).toMatchObject({ status: 'task_setup_failed' }); + const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? 
'', 'utf8')) as { + exitStatus: number | null; + taskContext: { errors: string[]; materialized: boolean }; + }; + expect(rawTrace.exitStatus).toBeNull(); + expect(rawTrace.taskContext.materialized).toBe(false); + expect(rawTrace.taskContext.errors).toEqual( + expect.arrayContaining([ + 'missing_task_payload', + 'missing_problem_statement', + 'missing_repo_checkout_path' + ]) + ); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('runs Codex, Gemini, and OpenCode adapters through the materialized task gate without paid calls', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createGitCheckout(); + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'adapter-task-payloads.json'); + const stubs: string[] = []; + try { + writeTaskPayloads(payloadPath, taskId, { + problem_statement: 'Fix the adapter smoke task with materialized input.', + repo_checkout_path: repoPath + }); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + const executors = ['codex', 'gemini', 'opencode'] as const; + for (const [index, executor] of executors.entries()) { + const cwdPath = path.join(payloadDir, `${executor}-cwd.txt`); + const argsPath = path.join(payloadDir, `${executor}-args.json`); + const { stubDir, env } = createAdapterStub(executor, { cwdPath, argsPath }); + stubs.push(stubDir); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + executor, + '--model', + 'stub', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + String(index + 1), + '--task-payloads', + payloadPath, + '--max-attempts', + '1', + '--timeout-ms', + '60000' + ], + { encoding: 'utf8', env } + ); 
+ expect(readFileSync(cwdPath, 'utf8')).toBe(repoPath); + } + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + const rows = readRows(sessionRoot).filter((row) => + executors.some((executor) => row.run_id.endsWith(`-${executor}`)) + ); + expect(rows).toHaveLength(3); + for (const row of rows) { + expect(row.status).toBe('completed'); + const rawTrace = JSON.parse(readFileSync(row.raw_trace_path, 'utf8')) as { + executor: string; + executorSchemaMode: string; + executorArgs: string[]; + taskContext: { materialized: boolean; verificationStrict: boolean }; + structuredAnswerParseErrors: string[]; + }; + expect(rawTrace.taskContext).toMatchObject({ + materialized: true, + verificationStrict: false + }); + expect(rawTrace.structuredAnswerParseErrors).toEqual([]); + if (rawTrace.executor === 'codex') { + expect(rawTrace.executorSchemaMode).toBe('native_schema'); + expect(rawTrace.executorArgs).toEqual(expect.arrayContaining(['--output-schema'])); + } else { + expect(rawTrace.executorSchemaMode).toBe('prompt_only'); + } + } + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(repoPath, { recursive: true, force: true }); + rmSync(payloadDir, { recursive: true, force: true }); + for (const stubDir of stubs) rmSync(stubDir, { recursive: true, force: true }); + } + }); + + it('runs diagnostic codebase-context arms through the materialized task gate', () => { + const sessionRoot = tempSessionRoot('phase41'); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createGitCheckout(); + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'arm-task-payloads.json'); + const cwdCapturePath = path.join(payloadDir, 'arm-claude-cwd.txt'); + const stdinCapturePath = path.join(payloadDir, 'arm-claude-stdin.txt'); + const { 
stubDir, env } = createClaudeStub( + JSON.stringify({ + type: 'result', + subtype: 'success', + is_error: false, + structured_output: structuredStubAnswer() + }), + { cwdPath: cwdCapturePath, stdinPath: stdinCapturePath } + ); + try { + writeTaskPayloads(payloadPath, taskId, { + problem_statement: 'Run the diagnostic arm with materialized task text.', + repo_checkout_path: repoPath + }); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run-codebase-context-arms', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--task-id', + taskId, + '--repeats', + '1', + '--max-attempts', + '1', + '--task-payloads', + payloadPath + ], + { encoding: 'utf8', env } + ); + const row = readRows(sessionRoot).find( + (candidate) => candidate.scoring && 'baselineArmId' in candidate.scoring + ); + expect(row).toMatchObject({ status: 'completed' }); + expect(readFileSync(cwdCapturePath, 'utf8')).toBe(repoPath); + const stdin = readFileSync(stdinCapturePath, 'utf8'); + expect(stdin).toContain('Problem statement:'); + expect(stdin).toContain('Run the diagnostic arm with materialized task text.'); + const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? 
'', 'utf8')) as { + taskContext: { materialized: boolean; repoCheckoutPath: string }; + }; + expect(rawTrace.taskContext).toMatchObject({ + materialized: true, + repoCheckoutPath: repoPath + }); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(repoPath, { recursive: true, force: true }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(stubDir, { recursive: true, force: true }); + } + }); + + it('blocks a real executor slot before spawn when repo checkout is missing or at the wrong commit', () => { + const missingSessionRoot = tempSessionRoot(); + const wrongCommitSessionRoot = tempSessionRoot(); + const task = manifest.tasks[0]; + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const missingPayloadPath = path.join(payloadDir, 'missing-repo.json'); + const wrongCommitPayloadPath = path.join(payloadDir, 'wrong-commit.json'); + const wrongCommitRepo = createGitCheckout(); + try { + writeTaskPayloads(missingPayloadPath, task.instance_id, { + problem_statement: 'Problem text exists but the checkout does not.', + repo_checkout_path: path.join(payloadDir, 'does-not-exist') + }); + writeTaskPayloads(wrongCommitPayloadPath, task.instance_id, { + problem_statement: 'Problem text exists but the checkout commit is wrong.', + repo_checkout_path: wrongCommitRepo + }); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', missingSessionRoot], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + missingSessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + task.instance_id, + '--repeat', + '1', + '--task-payloads', + missingPayloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8' } + ); + const missingRow = readRows(missingSessionRoot).find((candidate) => 
+ candidate.run_id.endsWith('-claude') + ); + const missingTrace = JSON.parse(readFileSync(missingRow?.raw_trace_path ?? '', 'utf8')) as { + taskContext: { errors: string[] }; + }; + expect(missingRow).toMatchObject({ status: 'task_setup_failed' }); + expect(missingTrace.taskContext.errors).toContain('repo_checkout_missing'); + + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', wrongCommitSessionRoot], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + wrongCommitSessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + task.instance_id, + '--repeat', + '1', + '--task-payloads', + wrongCommitPayloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8' } + ); + const wrongCommitRow = readRows(wrongCommitSessionRoot).find((candidate) => + candidate.run_id.endsWith('-claude') + ); + const wrongCommitTrace = JSON.parse( + readFileSync(wrongCommitRow?.raw_trace_path ?? 
'', 'utf8') + ) as { + taskContext: { errors: string[]; verificationStrict: boolean }; + }; + expect(wrongCommitRow).toMatchObject({ status: 'task_setup_failed' }); + expect(wrongCommitTrace.taskContext.verificationStrict).toBe(true); + expect(wrongCommitTrace.taskContext.errors).toEqual( + expect.arrayContaining(['base_commit_mismatch', 'problem_statement_hash_mismatch']) + ); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(missingSessionRoot)))), { + recursive: true, + force: true + }); + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(wrongCommitSessionRoot)))), { + recursive: true, + force: true + }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(wrongCommitRepo, { recursive: true, force: true }); + } + }); + + it('blocks a real executor slot before spawn when the repo checkout is dirty', () => { + const sessionRoot = tempSessionRoot('phase41'); + const task = manifest.tasks[0]; + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'dirty-repo.json'); + const dirtyRepo = createGitCheckout(); + try { + writeFileSync(path.join(dirtyRepo, 'dirty.txt'), 'dirty checkout', 'utf8'); + writeTaskPayloads(payloadPath, task.instance_id, { + problem_statement: 'Problem text exists but the checkout has local changes.', + repo_checkout_path: dirtyRepo + }); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + task.instance_id, + '--repeat', + '1', + '--task-payloads', + payloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8' } + ); + const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude')); + const rawTrace 
= JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as { + taskContext: { errors: string[]; statusShort: string }; + }; + expect(row).toMatchObject({ status: 'task_setup_failed' }); + expect(rawTrace.taskContext.errors).toContain('repo_checkout_dirty'); + expect(rawTrace.taskContext.statusShort).toContain('dirty.txt'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(dirtyRepo, { recursive: true, force: true }); + } + }); + + it('records Claude CLI rate limits as tool errors, not answer schema failures', () => { + const sessionRoot = tempSessionRoot(); + const taskId = manifest.tasks[0].instance_id; + const repoPath = createGitCheckout(); + const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-')); + const payloadPath = path.join(payloadDir, 'task-payloads-rate-limit.json'); + writeTaskPayloads(payloadPath, taskId, { + problem_statement: 'Fix the task; this test exercises rate-limit classification.', + repo_checkout_path: repoPath + }); + const { stubDir, env } = createClaudeStub( + "You've hit your limit · resets 8pm (Europe/Madrid)\n" + ); + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { + encoding: 'utf8' + } + ); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--baseline-run', + '--session', + sessionRoot, + '--executor', + 'claude', + '--model', + 'haiku', + '--lane', + 'raw-native', + '--task-id', + taskId, + '--repeat', + '1', + '--task-payloads', + payloadPath, + '--max-attempts', + '1' + ], + { encoding: 'utf8', env } + ); + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { + encoding: 'utf8' + } + ); + const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude')); + 
expect(row).toMatchObject({ status: 'tool_error' }); + const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as { + claudeDiagnostic: string; + structuredAnswerParseErrors: string[]; + }; + expect(rawTrace.claudeDiagnostic).toBe('claude_rate_limit'); + expect(rawTrace.structuredAnswerParseErrors).toEqual(['invalid_json', 'claude_rate_limit']); + const fallbackAnswer = JSON.parse( + readFileSync(row?.structured_answer_path ?? '', 'utf8') + ) as { + unsupportedClaims: string[]; + }; + expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + rmSync(repoPath, { recursive: true, force: true }); + rmSync(payloadDir, { recursive: true, force: true }); + rmSync(stubDir, { recursive: true, force: true }); + } + }); + + it('uses the same required fields for parser validation', () => { + const invalid = Object.fromEntries( + CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS.filter((field) => field !== 'readyToEdit').map( + (field) => [field, field === 'evidence' ? [] : field === 'confidence' ? 
'medium' : []] + ) + ); + const parsed = parseStructuredAnswer(JSON.stringify(invalid)); + expect(parsed).toMatchObject({ status: 'invalid_schema' }); + expect(parsed.errors).toContain('missing_readyToEdit'); + }); +}); diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts new file mode 100644 index 0000000..1061826 --- /dev/null +++ b/tests/contextbench-baseline-snapshot.test.ts @@ -0,0 +1,133 @@ +import { execFileSync } from 'node:child_process'; +import { mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { describe, expect, it, vi } from 'vitest'; + +type BaselineSession = { + claimBearing: boolean; + sealed: boolean; + sessionHash: string; + snapshot: { + branch: string; + head: string; + divergence: { status: string }; + gitStatusPath: string; + trackedDiffPath: string; + stagedDiffPath: string; + diffStatPath: string; + untracked: Array<{ path: string; disposition: string; hash: string | null; exclusionReason: string | null }>; + lockfiles: Array<{ path: string; hash: string }>; + redactedEnvVarNames: string[]; + versions: Record; + fixtureHashes: Record; + commandTranscript: Array<{ command: string; stdoutPath: string | null; stderrPath: string | null }>; + snapshotHash: string; + }; + artifactIndex: Array<{ path: string; hash: string }>; +}; + +vi.setConfig({ testTimeout: 30000 }); + +function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { + return path.join( + mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-`)), + 'benchmark-runs', + 'contextbench', + phase, + 'snapshot-smoke' + ); +} + +describe('ContextBench Phase 40 dirty-worktree snapshot', () => { + it('captures the current checkout before baseline runs with hashes and validation metadata', () => { + const sessionRoot = tempSessionRoot(); + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', 
sessionRoot], + { encoding: 'utf8' } + ); + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + expect(validateOutput).toContain('baseline session validation passed'); + + const session = JSON.parse( + readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8') + ) as BaselineSession & { phase: number }; + expect(session.phase).toBe(40); + expect(session.claimBearing).toBe(false); + expect(session.sealed).toBe(false); + expect(session.sessionHash).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(session.snapshot.branch.length).toBeGreaterThan(0); + expect(session.snapshot.head).toMatch(/^[a-f0-9]{40}$/); + expect(session.snapshot.divergence.status).toBe('unavailable'); + expect(session.snapshot.snapshotHash).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(session.snapshot.gitStatusPath).toBe('snapshot/git/status-porcelain-v2.txt'); + expect(session.snapshot.trackedDiffPath).toBe('snapshot/git/tracked.diff'); + expect(session.snapshot.stagedDiffPath).toBe('snapshot/git/staged.diff'); + expect(session.snapshot.diffStatPath).toBe('snapshot/git/diff-stat.txt'); + expect(session.snapshot.lockfiles.map((entry) => entry.path)).toContain( + path.relative(sessionRoot, path.resolve('pnpm-lock.yaml')).replace(/\\/g, '/') + ); + expect(session.snapshot.fixtureHashes.protocol).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(session.snapshot.commandTranscript.map((entry) => entry.command)).toEqual( + expect.arrayContaining(['git status --porcelain=v2 --branch --untracked-files=all', 'git diff --no-ext-diff']) + ); + expect(session.artifactIndex.map((entry) => entry.path)).toEqual( + expect.arrayContaining(['slot-reservations.json', 'run-manifest.jsonl']) + ); + expect(JSON.stringify(session)).not.toContain(process.env.OPENAI_API_KEY ?? 
'definitely-not-present'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('captures Phase 41 baseline snapshots with Phase 41 metadata', () => { + const sessionRoot = tempSessionRoot('phase41'); + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + expect(validateOutput).toContain('baseline session validation passed'); + + const session = JSON.parse( + readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8') + ) as BaselineSession & { phase: number }; + expect(session.phase).toBe(41); + expect(session.sessionRoot).toContain('/phase41/'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + + it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => { + const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-')); + try { + expect(() => + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', outDir], { + encoding: 'utf8', + stdio: 'pipe' + }) + ).toThrow(/benchmark-runs\/contextbench\/phase40/); + } finally { + rmSync(outDir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/contextbench-lane-setup.test.ts b/tests/contextbench-lane-setup.test.ts new file mode 100644 index 0000000..34290e0 --- /dev/null +++ b/tests/contextbench-lane-setup.test.ts @@ -0,0 +1,156 @@ +import { describe, expect, it } from 'vitest'; +import { + CONTEXTBENCH_LANE_READINESS_STATUSES, + type ContextBenchLane, + type ContextBenchLaneSetupEvidenceFixture, + type ContextBenchLaneSetupEvidenceRecord, + type ContextBenchLaneToolCard +} from 
'../src/eval/contextbench-types.js'; +import { hashSetupEvidenceRecord } from '../src/eval/contextbench-artifacts.js'; +import laneSetupEvidenceFixture from './fixtures/contextbench-lane-setup-evidence.json'; +import laneToolCardsFixture from './fixtures/contextbench-lane-tool-cards.json'; +import lanesFixture from './fixtures/contextbench-lanes.json'; +import packageJson from '../package.json'; +import protocolFixture from './fixtures/contextbench-benchmark-protocol.json'; + +type LanesFixture = { + broadClaimLaneSet: string[]; + lanes: ContextBenchLane[]; + setupFailureSemantics: { + winEligible: boolean; + claimContribution: string; + includedInPublicationRows: boolean; + blocksBroadClaimsForRequiredLane: boolean; + }; +}; + +type LaneToolCardsFixture = { + cards: ContextBenchLaneToolCard[]; +}; + +type PackageFixture = { + dependencies?: Record; + devDependencies?: Record; +}; + +const lanes = lanesFixture as LanesFixture; +const laneToolCards = laneToolCardsFixture as LaneToolCardsFixture; +const setupEvidence = laneSetupEvidenceFixture as ContextBenchLaneSetupEvidenceFixture; +const packageFixture = packageJson as PackageFixture; +const blockedStatuses = new Set(['setup_failed', 'index_failed', 'tool_error', 'invasive_setup_blocked']); + +function byLane(items: T[]): Map { + return new Map(items.map((item) => [item.laneId, item])); +} + +function hasPendingPlaceholder(card: ContextBenchLaneToolCard): boolean { + return [card.setupCommand, card.indexCommand, card.queryCommand, card.versionCommand].some((command) => + command.toLowerCase().includes('pending phase 39') + ); +} + +function expectTerminalBlockedRecord(record: ContextBenchLaneSetupEvidenceRecord): void { + expect(blockedStatuses.has(record.readinessStatus)).toBe(true); + expect(record.logReference).toMatch(/^outputs\/contextbench\/setup\//); + expect(record.nextHumanAction.length).toBeGreaterThan(20); + expect(record.commands.some((command) => command.status === 'blocked' || command.status === 
'failed')).toBe(true); + expect(record.commands.some((command) => command.stdoutLogPath || command.stderrLogPath || command.outputHash)).toBe(true); +} + +describe('ContextBench Phase 39 lane setup evidence', () => { + it('covers every required lane with a final non-pending readiness record', () => { + const evidenceByLane = byLane(setupEvidence.records); + const cardsByLane = byLane(laneToolCards.cards); + + for (const laneId of lanes.broadClaimLaneSet) { + const record = evidenceByLane.get(laneId); + const card = cardsByLane.get(laneId); + expect(record, `missing setup evidence for ${laneId}`).toBeTruthy(); + expect(card, `missing lane card for ${laneId}`).toBeTruthy(); + if (!record || !card) continue; + expect(record.readinessStatus).not.toBe('pending'); + expect(CONTEXTBENCH_LANE_READINESS_STATUSES).toContain(record.readinessStatus); + expect(card.phase39Status).toBe(record.readinessStatus); + expect(record.claimBearing).toBe(false); + expect(record.commands.map((command) => command.kind).sort()).toEqual([ + 'index', + 'query', + 'setup', + 'version' + ]); + } + }); + + it('rejects unresolved Phase 39 placeholders unless there is terminal blocker evidence', () => { + const evidenceByLane = byLane(setupEvidence.records); + for (const card of laneToolCards.cards) { + const record = evidenceByLane.get(card.laneId); + expect(record).toBeTruthy(); + if (!record) continue; + if (hasPendingPlaceholder(card)) { + expectTerminalBlockedRecord(record); + } + expect(hasPendingPlaceholder(card)).toBe(false); + } + }); + + it('keeps setup/index cost and status separate from task execution metadata', () => { + for (const record of setupEvidence.records) { + expect(record.setupStatus).toBeTruthy(); + expect(record.indexStatus).toBeTruthy(); + expect(record).not.toHaveProperty('taskWallTimeMs'); + expect(record.commands.every((command) => command.durationMs === null || command.durationMs >= 0)).toBe(true); + expect(record.setupDurationMs === null || record.setupDurationMs >= 
0).toBe(true); + expect(record.indexDurationMs === null || record.indexDurationMs >= 0).toBe(true); + } + }); + + it('records blocked and failed lanes as terminal missing evidence, not wins', () => { + expect(lanes.setupFailureSemantics.winEligible).toBe(false); + expect(lanes.setupFailureSemantics.claimContribution).toBe('missing_evidence'); + expect(lanes.setupFailureSemantics.includedInPublicationRows).toBe(true); + expect(lanes.setupFailureSemantics.blocksBroadClaimsForRequiredLane).toBe(true); + + const blockedRecords = setupEvidence.records.filter((record) => blockedStatuses.has(record.readinessStatus)); + expect(blockedRecords.map((record) => record.laneId).sort()).toEqual([ + 'codebase-memory-mcp', + 'grepai' + ]); + for (const record of blockedRecords) expectTerminalBlockedRecord(record); + }); + + it('preserves one-context-tool isolation for non-raw lanes', () => { + const cardsByLane = byLane(laneToolCards.cards); + for (const lane of lanes.lanes) { + const card = cardsByLane.get(lane.laneId); + expect(card).toBeTruthy(); + if (!card || lane.laneId === 'raw-native') continue; + expect(card.contextTools).toEqual([lane.contextTool]); + expect(card.allowedTools).toEqual([lane.contextTool]); + expect(card.disallowedTools).toEqual(expect.arrayContaining(['native-read', 'native-search', 'native-shell-readonly'])); + } + }); + + it('keeps competitor tools out of package runtime dependencies', () => { + const runtimeDependencies = Object.keys(packageFixture.dependencies ?? {}); + const devDependencies = Object.keys(packageFixture.devDependencies ?? 
{}); + const forbiddenPackages = ['jcodemunch-mcp', 'grepai', 'codebase-memory-mcp', 'codegraphcontext', 'kuzu']; + for (const dependencyName of [...runtimeDependencies, ...devDependencies]) { + expect(forbiddenPackages).not.toContain(dependencyName.toLowerCase()); + } + }); + + it('keeps Phase 39 setup/probe evidence non-claim-bearing', () => { + expect(protocolFixture.claimAllowed).toBe(false); + expect(setupEvidence.claimBearing).toBe(false); + expect(setupEvidence.generatedOutputsPolicy).toContain('not Phase 40 baseline artifacts'); + expect(setupEvidence.records.every((record) => record.claimBearing === false)).toBe(true); + }); + + it('can hash setup evidence records without using fixture mutation as proof', () => { + for (const record of setupEvidence.records) { + expect(record.evidenceHash).toBeTruthy(); + expect(hashSetupEvidenceRecord(record)).toMatch(/^sha256:[a-f0-9]{64}$/); + } + }); +}); diff --git a/tests/contextbench-phase42-evidence-gate.test.ts b/tests/contextbench-phase42-evidence-gate.test.ts new file mode 100644 index 0000000..96ee7d2 --- /dev/null +++ b/tests/contextbench-phase42-evidence-gate.test.ts @@ -0,0 +1,372 @@ +import { describe, expect, it } from 'vitest'; +import { + evaluateContextBenchEvidenceGate, + type ContextBenchEvidenceGateInput, + type ContextBenchRunEvidenceArtifacts +} from '../src/eval/contextbench-evidence-gate.js'; +import type { ContextBenchRunManifestRow } from '../src/eval/contextbench-types.js'; + +const runnerHash = 'sha256:1111111111111111111111111111111111111111111111111111111111111111'; +const protocolHash = 'sha256:protocol'; +const taskManifestHash = 'sha256:manifest'; +const scoreHash = 'sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc'; +const officialOutputHash = 'sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +const stdoutHash = 'sha256:dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd'; +const stderrHash = 
'sha256:eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'; + +function baseRow(overrides: Partial = {}): ContextBenchRunManifestRow { + return { + run_id: 'codebase-context-task-1-1-claude', + protocol_version: 'contextbench-protocol-v1', + protocol_hash: protocolHash, + task_manifest_hash: taskManifestHash, + lane_id: 'codebase-context', + task_id: 'task-1', + repeat_index: 1, + status: 'completed', + started_at: '2026-04-29T00:00:00.000Z', + completed_at: '2026-04-29T00:00:05.000Z', + raw_trace_path: 'runs/codebase-context-task-1-1-claude/raw-trace.json', + structured_answer_path: 'runs/codebase-context-task-1-1-claude/structured-answer.json', + trajectory_path: 'runs/codebase-context-task-1-1-claude/trajectory.json', + score_path: 'runs/codebase-context-task-1-1-claude/score.json', + setup_index_path: 'runs/codebase-context-task-1-1-claude/setup-index.json', + prompt_path: 'runs/codebase-context-task-1-1-claude/prompt.txt', + lane_tool_card_path: 'runs/codebase-context-task-1-1-claude/lane-card.json', + setupIndex: { + setupCommand: 'npx codebase-context index', + indexCommand: 'npx codebase-context index', + setupDurationMs: 120, + indexDurationMs: 340, + setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log', + indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log', + setupStatus: 'completed', + indexStatus: 'completed' + }, + taskExecution: { + model: 'claude-sonnet-4-5', + timeoutSeconds: 600, + maxContextTokens: 120000, + maxAnswerTokens: 4000, + startedAt: '2026-04-29T00:00:00.000Z', + completedAt: '2026-04-29T00:00:05.000Z', + taskWallTimeMs: 5000, + executor: 'claude' + }, + scoring: { + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + command: 'python -m contextbench.evaluate --gold gold.parquet --pred trajectory.json --out official.jsonl', + claimBearing: true + }, + hashes: { + runnerSourceHash: runnerHash + }, + ...overrides + }; +} + +function 
passingArtifacts(overrides: Partial = {}): ContextBenchRunEvidenceArtifacts { + return { + rawTrace: { + executor: 'claude', + model: 'claude-sonnet-4-5', + runnerHash + }, + score: { + status: 'completed', + mode: 'official_evaluator', + claimBearing: true, + officialEvaluatorInvoked: true, + command: 'python -m contextbench.evaluate --gold gold.parquet --pred trajectory.json --out official.jsonl', + exitCode: 0, + outputPath: 'runs/codebase-context-task-1-1-claude/official-results.jsonl', + outputHash: officialOutputHash, + stdoutPath: 'runs/codebase-context-task-1-1-claude/official.stdout.log', + stderrPath: 'runs/codebase-context-task-1-1-claude/official.stderr.log' + }, + setupIndex: { + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 120, + indexDurationMs: 340, + setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log', + indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log' + }, + laneIsolation: { + laneId: 'codebase-context', + proven: true, + sourceKind: 'proxy', + expectedContextTool: 'codebase-context', + allowedTools: ['codebase-context'], + observedTools: ['codebase-context'] + }, + ...overrides + }; +} + +function passingInput(overrides: Partial = {}): ContextBenchEvidenceGateInput { + const row = baseRow(); + return { + evidenceMode: 'artifact_verified', + protocol: { + claimAllowed: true, + benchmarkTarget: { + officialEvaluatorFirst: true + } + }, + requiredLaneIds: ['codebase-context'], + requiredTaskIds: ['task-1'], + requiredRepeats: 1, + expectedTotalRows: 1, + expectedProtocolHash: protocolHash, + expectedTaskManifestHash: taskManifestHash, + lanePoliciesById: { + 'codebase-context': { + laneId: 'codebase-context', + expectedContextTool: 'codebase-context', + allowedTools: ['codebase-context'], + disallowedTools: ['native-read', 'native-search', 'native-shell-readonly'] + } + }, + rows: [row], + artifactsByRunId: { + [row.run_id]: passingArtifacts() + }, + artifactHashesByPath: { + [row.score_path]: 
scoreHash, + 'runs/codebase-context-task-1-1-claude/official-results.jsonl': officialOutputHash, + 'runs/codebase-context-task-1-1-claude/official.stdout.log': stdoutHash, + 'runs/codebase-context-task-1-1-claude/official.stderr.log': stderrHash + }, + expectedRunnerHash: runnerHash, + currentRunnerHash: runnerHash, + ...overrides + }; +} + +function failureCodes(input: ContextBenchEvidenceGateInput): string[] { + return evaluateContextBenchEvidenceGate(input).failures.map((failure) => failure.code); +} + +describe('ContextBench Phase 42 evidence gate', () => { + it('allows synthetic shape validation but never treats it as claim-pass', () => { + const result = evaluateContextBenchEvidenceGate( + passingInput({ evidenceMode: 'synthetic_shape' }) + ); + expect(result.shapePass).toBe(true); + expect(result.claimPass).toBe(false); + expect(result.diagnosticOnly).toBe(true); + expect(result.failures.map((failure) => failure.code)).toEqual(['artifact_verification_missing']); + }); + + it('rejects synthetic evidence when official evaluator invocation is missing', () => { + const row = baseRow({ + scoring: { + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: false, + command: 'python -m contextbench.evaluate', + claimBearing: false, + fallbackReason: 'official_evaluator_not_invoked' + } + }); + const input = passingInput({ + rows: [row], + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + score: { + status: 'judge_failed', + mode: 'diagnostic_fallback', + claimBearing: false, + officialEvaluatorInvoked: false, + command: 'python -m contextbench.evaluate', + exitCode: 1, + outputPath: 'official-results.jsonl', + outputHash: 'sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb', + stdoutPath: 'official.stdout.log', + stderrPath: 'official.stderr.log' + } + }) + }, + artifactHashesByPath: {} + }); + expect(failureCodes(input)).toEqual( + expect.arrayContaining(['official_evaluator_missing', 
'diagnostic_fallback_only']) + ); + }); + + it('rejects synthetic evidence when lane isolation proof is missing', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ laneIsolation: undefined }) + } + }); + expect(failureCodes(input)).toContain('lane_isolation_missing'); + }); + + it('rejects synthetic evidence when lane isolation telemetry is empty', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + laneIsolation: { + laneId: 'codebase-context', + proven: true, + sourceKind: 'proxy', + expectedContextTool: 'codebase-context', + allowedTools: ['codebase-context'], + observedTools: [] + } + }) + } + }); + expect(failureCodes(input)).toContain('lane_isolation_missing'); + }); + + it('rejects synthetic evidence when ready lane setup/index evidence is missing', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + setupIndex: { + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 0, + indexDurationMs: 0, + setupLogPath: 'setup.log', + indexLogPath: 'index.log' + } + }) + } + }); + expect(failureCodes(input)).toContain('setup_index_cost_missing'); + }); + + it('rejects synthetic evidence when runner provenance does not match', () => { + const input = passingInput({ + currentRunnerHash: 'sha256:2222222222222222222222222222222222222222222222222222222222222222' + }); + expect(failureCodes(input)).toContain('runner_provenance_mismatch'); + }); + + it('rejects duplicate and unexpected rows so the denominator cannot be narrowed', () => { + const row = baseRow(); + const duplicate = baseRow({ run_id: 'duplicate-run' }); + const unexpected = baseRow({ run_id: 'unexpected-run', task_id: 'task-outside-denominator' }); + const input = passingInput({ + rows: [row, duplicate, unexpected], + expectedTotalRows: 1, + artifactsByRunId: { + [row.run_id]: 
passingArtifacts(), + [duplicate.run_id]: passingArtifacts(), + [unexpected.run_id]: passingArtifacts() + } + }); + expect(failureCodes(input)).toEqual( + expect.arrayContaining(['duplicate_required_run', 'unexpected_run_row']) + ); + }); + + it('rejects evidence when row count is narrower than the frozen denominator', () => { + const input = passingInput({ expectedTotalRows: 2 }); + expect(failureCodes(input)).toContain('denominator_count_mismatch'); + }); + + it('rejects setup/index evidence that contradicts the manifest row', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + setupIndex: { + setupStatus: 'completed', + indexStatus: 'completed', + setupDurationMs: 999, + indexDurationMs: 340, + setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log', + indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log' + } + }) + } + }); + expect(failureCodes(input)).toContain('setup_index_cost_missing'); + }); + + it('rejects self-attested official evaluator proof without command output artifacts', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + score: { + status: 'completed', + mode: 'official_evaluator', + claimBearing: true, + officialEvaluatorInvoked: true + } + }) + } + }); + expect(failureCodes(input)).toContain('official_evaluator_missing'); + }); + + it('passes artifact-verified evidence with official evaluator, lane isolation, setup/index, and matching runner provenance', () => { + const result = evaluateContextBenchEvidenceGate(passingInput()); + expect(result).toEqual({ + shapePass: true, + claimPass: true, + diagnosticOnly: false, + failures: [] + }); + }); + + it('allows raw-native policy to prove multiple native observations without collapsing them into one fake tool', () => { + const row = baseRow({ lane_id: 'raw-native', run_id: 'raw-native-task-1-1-claude' }); + const input = passingInput({ + 
requiredLaneIds: ['raw-native'], + rows: [row], + lanePoliciesById: { + 'raw-native': { + laneId: 'raw-native', + expectedContextTool: 'native-agent-tools', + allowedTools: ['native-read', 'native-search', 'native-shell-readonly'], + disallowedTools: ['codebase-context'], + allowMultipleObservedTools: true + } + }, + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + laneIsolation: { + laneId: 'raw-native', + proven: true, + sourceKind: 'proxy', + expectedContextTool: 'native-agent-tools', + allowedTools: ['native-read', 'native-search', 'native-shell-readonly'], + observedTools: ['native-read', 'native-search'] + } + }) + } + }); + expect(evaluateContextBenchEvidenceGate(input).claimPass).toBe(true); + }); + + it('rejects env-injected lane telemetry for artifact-verified claim pass', () => { + const row = baseRow(); + const input = passingInput({ + artifactsByRunId: { + [row.run_id]: passingArtifacts({ + laneIsolation: { + laneId: 'codebase-context', + proven: true, + sourceKind: 'env_override', + expectedContextTool: 'codebase-context', + allowedTools: ['codebase-context'], + observedTools: ['codebase-context'] + } + }) + } + }); + expect(failureCodes(input)).toContain('lane_isolation_missing'); + }); +}); diff --git a/tests/contextbench-runner-contract.test.ts b/tests/contextbench-runner-contract.test.ts new file mode 100644 index 0000000..76455ab --- /dev/null +++ b/tests/contextbench-runner-contract.test.ts @@ -0,0 +1,321 @@ +import { execFileSync } from 'node:child_process'; +import { mkdtempSync, readFileSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { + CONTEXTBENCH_TERMINAL_STATUSES, + type ContextBenchLane, + type ContextBenchLaneToolCard, + type ContextBenchProtocol, + type ContextBenchRunManifestRow, + type ContextBenchTaskManifest +} from '../src/eval/contextbench-types.js'; +import { + appendManifestRow, + buildManifestRow, + buildRunId, + 
createArtifactPathSet, + hashJson, + readManifestRows, + writeJsonArtifact +} from '../src/eval/contextbench-artifacts.js'; +import { + classifyStructuredAnswer, + parseStructuredAnswer +} from '../src/eval/contextbench-answer.js'; +import correctionsFixture from './fixtures/contextbench-corrections.json'; +import laneToolCardsFixture from './fixtures/contextbench-lane-tool-cards.json'; +import lanesFixture from './fixtures/contextbench-lanes.json'; +import manifestFixture from './fixtures/contextbench-task-manifest.json'; +import protocolFixture from './fixtures/contextbench-benchmark-protocol.json'; + +type LaneToolCardsFixture = { + protocolVersion: string; + cards: ContextBenchLaneToolCard[]; +}; + +type LanesFixture = { + broadClaimLaneSet: string[]; + lanes: ContextBenchLane[]; + laneToolCardRequiredFields: string[]; +}; + +const protocol = protocolFixture as ContextBenchProtocol; +const manifest = manifestFixture as ContextBenchTaskManifest; +const lanes = lanesFixture as LanesFixture; +const laneToolCards = laneToolCardsFixture as LaneToolCardsFixture; +const corrections = correctionsFixture as { + policy: { anyFixtureChangeRequiresCorrection: boolean }; +}; + +function tempDir(): string { + return mkdtempSync(path.join(tmpdir(), 'contextbench-runner-')); +} + +describe('ContextBench Phase 38 runner contract', () => { + it('keeps frozen task/protocol inputs read-only and correction-governed', () => { + expect(protocol.claimAllowed).toBe(false); + expect(protocol.benchmarkTarget.officialEvaluatorFirst).toBe(true); + expect(manifest.tasks).toHaveLength(20); + expect(manifest.manifest_hash).toMatch(/^sha256:[a-f0-9]{64}$/); + expect(manifest.no_lane_outputs_observed_attestation).toContain('No raw/native'); + expect(corrections.policy.anyFixtureChangeRequiresCorrection).toBe(true); + expect(laneToolCards.protocolVersion).toBe(protocol.protocolVersion); + }); + + it('defines explicit lane cards for every required lane while only raw/native and codebase-context 
are Phase 38 executable', () => { + const cardsByLane = new Map(laneToolCards.cards.map((card) => [card.laneId, card])); + for (const laneId of lanes.broadClaimLaneSet) { + expect(cardsByLane.has(laneId)).toBe(true); + } + expect(cardsByLane.get('raw-native')?.executableInPhase38).toBe(true); + expect(cardsByLane.get('codebase-context')?.executableInPhase38).toBe(true); + expect(cardsByLane.get('jcodemunch-repomapper')?.phase38Status).toBe('pending_phase39_setup'); + expect(cardsByLane.get('grepai')?.executableInPhase38).toBe(false); + }); + + it('enforces one-context-tool semantics and setup/index cost separation through lane cards', () => { + for (const lane of lanes.lanes) { + const card = laneToolCards.cards.find((candidate) => candidate.laneId === lane.laneId); + expect(card).toBeTruthy(); + if (!card) continue; + for (const field of lanes.laneToolCardRequiredFields) { + expect(card[field as keyof ContextBenchLaneToolCard]).toBeTruthy(); + } + expect(card.setupCostReportedSeparately).toBe(true); + expect(card.indexCostReportedSeparately).toBe(true); + expect(card.disallowedTools).not.toContain(lane.contextTool); + if (lane.laneId === 'raw-native') { + expect(card.contextTools).toEqual(['native-agent-tools']); + } else { + expect(card.contextTools).toEqual([lane.contextTool]); + expect(card.allowedTools).toEqual([lane.contextTool]); + } + } + }); + + it('keeps every protocol terminal status represented in the typed contract', () => { + expect(CONTEXTBENCH_TERMINAL_STATUSES).toEqual(protocol.runManifestSchema.terminalStatuses); + expect(CONTEXTBENCH_TERMINAL_STATUSES).toEqual( + expect.arrayContaining([ + 'setup_failed', + 'index_failed', + 'invalid_schema', + 'false_ready', + 'judge_failed' + ]) + ); + }); + + it('validates structured answers and maps malformed answers to invalid_schema', () => { + expect(parseStructuredAnswer('not json')).toMatchObject({ status: 'invalid_schema' }); + expect(parseStructuredAnswer(JSON.stringify({ answer: 'missing fields' 
}))).toMatchObject({ + status: 'invalid_schema' + }); + const parsed = parseStructuredAnswer( + JSON.stringify({ + answer: 'ready', + confidence: 'medium', + evidence: [ + { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'direct evidence' } + ], + filesReferenced: ['src/a.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: false + }) + ); + expect(parsed.status).toBe('valid'); + }); + + it('rejects structured answer fields outside the frozen schema', () => { + const validAnswer = { + answer: 'ready', + confidence: 'medium', + evidence: [ + { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'direct evidence' } + ], + filesReferenced: ['src/a.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: false + }; + + expect(parseStructuredAnswer(JSON.stringify({ ...validAnswer, extra: true }))).toMatchObject({ + status: 'invalid_schema', + errors: expect.arrayContaining(['additional_root_field_extra']) + }); + expect( + parseStructuredAnswer( + JSON.stringify({ + ...validAnswer, + evidence: [{ ...validAnswer.evidence[0], extraEvidence: true }] + }) + ) + ).toMatchObject({ + status: 'invalid_schema', + errors: expect.arrayContaining(['additional_evidence_field_extraEvidence']) + }); + expect( + parseStructuredAnswer( + JSON.stringify({ + ...validAnswer, + evidence: [ + { + ...validAnswer.evidence[0], + lineRange: { ...validAnswer.evidence[0].lineRange, extraLine: true } + } + ] + }) + ) + ).toMatchObject({ + status: 'invalid_schema', + errors: expect.arrayContaining(['additional_line_range_field_extraLine']) + }); + }); + + it('classifies false-ready from deterministic diagnostics, not just model self-report', () => { + const parsed = parseStructuredAnswer( + JSON.stringify({ + answer: 'safe to edit', + confidence: 'high', + evidence: [ + { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'partial evidence' } + ], + filesReferenced: ['src/a.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + 
readyToEdit: true + }) + ); + expect(parsed.answer).not.toBeNull(); + if (!parsed.answer) return; + const classification = classifyStructuredAnswer(parsed.answer, { + missingRequiredFacts: ['required fact absent'], + missingEvidenceFiles: ['src/required.ts'] + }); + expect(classification.unsupportedClaim).toBe(true); + expect(classification.falseReady).toBe(true); + expect(classification.reasons).toEqual( + expect.arrayContaining(['missing_required_facts', 'missing_evidence_files']) + ); + }); + + it('writes append-only manifest rows with artifact paths for attempted runs', () => { + const outDir = tempDir(); + try { + const runId = buildRunId({ + laneId: 'raw-native', + taskId: manifest.tasks[0].instance_id, + repeatIndex: 1, + executor: 'fake' + }); + const paths = createArtifactPathSet(outDir, runId); + const laneCard = laneToolCards.cards[0]; + const task = manifest.tasks[0]; + writeJsonArtifact(paths.rawTracePath, { stdout: '{}', stderr: '' }); + writeJsonArtifact(paths.structuredAnswerPath, { answer: 'x' }); + writeJsonArtifact(paths.trajectoryPath, { pred_files: [] }); + writeJsonArtifact(paths.scorePath, { claimBearing: false }); + const row = buildManifestRow({ + runId, + protocolVersion: protocol.protocolVersion, + protocolHash: hashJson(protocol), + taskManifestHash: manifest.manifest_hash, + laneCard, + task, + repeatIndex: 1, + status: 'completed', + startedAt: '2026-04-27T00:00:00.000Z', + completedAt: '2026-04-27T00:00:01.000Z', + paths, + hashes: { protocol: hashJson(protocol) }, + executor: 'fake', + model: 'fake-executor', + timeoutSeconds: protocol.budgets.defaults.timeoutSeconds, + maxContextTokens: protocol.budgets.defaults.maxContextTokens, + maxAnswerTokens: protocol.budgets.defaults.maxAnswerTokens + }); + appendManifestRow(paths.manifestPath, row); + appendManifestRow(paths.manifestPath, { + ...row, + run_id: `${runId}-2`, + status: 'invalid_schema' + }); + const rows = readManifestRows(paths.manifestPath); + expect(rows).toHaveLength(2); + 
expect(rows[1].status).toBe('invalid_schema'); + expect(rows[0].setupIndex.setupCommand).toBe(laneCard.setupCommand); + } finally { + rmSync(outDir, { recursive: true, force: true }); + } + }); + + it('validates fixtures and produces fake-executor smoke artifacts without live Claude', () => { + const outDir = tempDir(); + try { + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--validate-fixtures'], + { + encoding: 'utf8' + } + ); + expect(validateOutput).toContain('fixture validation passed'); + execFileSync( + 'node', + [ + 'scripts/contextbench-runner.mjs', + '--dry-run', + '--executor', + 'fake', + '--lane', + 'raw-native', + '--task-id', + manifest.tasks[0].instance_id, + '--repeat', + '1', + '--out', + outDir + ], + { encoding: 'utf8' } + ); + const manifestRows = readFileSync(path.join(outDir, 'run-manifest.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as ContextBenchRunManifestRow); + expect(manifestRows).toHaveLength(1); + expect(manifestRows[0]).toMatchObject({ + lane_id: 'raw-native', + status: 'completed', + task_id: manifest.tasks[0].instance_id + }); + expect(readFileSync(manifestRows[0].raw_trace_path, 'utf8')).toContain('fake'); + expect(readFileSync(manifestRows[0].score_path, 'utf8')).toContain('claimBearing'); + expect(manifestRows[0].scoring.claimBearing).toBe(false); + expect(manifestRows[0].scoring.officialEvaluatorFirst).toBe(false); + expect(manifestRows[0].scoring.officialEvaluatorAttempted).toBe(false); + expect(manifestRows[0].scoring.officialEvaluatorInvoked).toBe(false); + expect(manifestRows).toHaveLength(1); + } finally { + rmSync(outDir, { recursive: true, force: true }); + } + }); + + it('exposes Phase 39 lane setup validation as readiness evidence only', () => { + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--validate-lane-setup'], + { encoding: 'utf8' } + ); + expect(validateOutput).toContain('lane setup validation 
passed'); + + const helpOutput = execFileSync('node', ['scripts/contextbench-runner.mjs', '--help'], { + encoding: 'utf8' + }); + expect(helpOutput).toContain('Phase 39 boundary'); + expect(helpOutput).toContain('Phase 40 owns dirty-worktree baseline capture'); + expect(helpOutput).toContain('claimBearing=false'); + }); +}); diff --git a/tests/contextbench-scoring.test.ts b/tests/contextbench-scoring.test.ts new file mode 100644 index 0000000..5e4c2a6 --- /dev/null +++ b/tests/contextbench-scoring.test.ts @@ -0,0 +1,97 @@ +import { mkdtempSync, rmSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { parseStructuredAnswer } from '../src/eval/contextbench-answer.js'; +import { + runFactRecallDiagnostics, + scoreWithOfficialEvaluatorFirst, + type ContextBenchProcessRunner +} from '../src/eval/contextbench-scoring.js'; + +function tempDir(): string { + return mkdtempSync(path.join(tmpdir(), 'contextbench-scoring-')); +} + +describe('ContextBench official-evaluator-first scoring', () => { + it('invokes the official evaluator command through an injected runner', async () => { + const outDir = tempDir(); + const calls: Array<{ command: string; args: string[] }> = []; + const runner: ContextBenchProcessRunner = async (command, args) => { + calls.push({ command, args }); + return { status: 0, stdout: 'ok', stderr: '' }; + }; + try { + const result = await scoreWithOfficialEvaluatorFirst({ + goldPath: path.join(outDir, 'gold.parquet'), + predictionPath: path.join(outDir, 'trajectory.json'), + outputPath: path.join(outDir, 'score.json'), + cachePath: path.join(outDir, 'cache'), + runner + }); + expect(result).toMatchObject({ + status: 'completed', + mode: 'official_evaluator', + claimBearing: true + }); + expect(calls[0].command).toBe('python'); + expect(calls[0].args).toEqual( + expect.arrayContaining(['-m', 'contextbench.evaluate', '--gold', '--pred', '--out']) + ); + } finally { + 
rmSync(outDir, { recursive: true, force: true }); + } + }); + + it('writes diagnostic non-claim-bearing fallback metadata when the evaluator fails', async () => { + const outDir = tempDir(); + const runner: ContextBenchProcessRunner = async () => ({ + status: 1, + stdout: '', + stderr: 'No module named contextbench' + }); + try { + const result = await scoreWithOfficialEvaluatorFirst({ + goldPath: path.join(outDir, 'gold.parquet'), + predictionPath: path.join(outDir, 'trajectory.json'), + outputPath: path.join(outDir, 'score.json'), + runner + }); + expect(result).toMatchObject({ + status: 'judge_failed', + mode: 'diagnostic_fallback', + claimBearing: false, + fallbackReason: 'official_evaluator_failed' + }); + expect(result.stderr).toContain('No module named'); + } finally { + rmSync(outDir, { recursive: true, force: true }); + } + }); + + it('feeds schema-bound fact and evidence diagnostics into false-ready classification', () => { + const parsed = parseStructuredAnswer( + JSON.stringify({ + answer: 'only mentions alpha', + confidence: 'high', + evidence: [ + { file: 'src/alpha.ts', lineRange: { start: 1, end: 3 }, reason: 'alpha evidence' } + ], + filesReferenced: ['src/alpha.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: true + }) + ); + expect(parsed.answer).not.toBeNull(); + if (!parsed.answer) return; + const diagnostics = runFactRecallDiagnostics(parsed.answer, { + requiredFacts: ['beta'], + requiredEvidenceFiles: ['src/beta.ts'] + }); + expect(diagnostics.missingRequiredFacts).toEqual(['beta']); + expect(diagnostics.missingEvidenceFiles).toEqual(['src/beta.ts']); + expect(diagnostics.unsupportedClaim).toBe(true); + expect(diagnostics.falseReady).toBe(true); + }); +}); diff --git a/tests/contextbench-trajectory.test.ts b/tests/contextbench-trajectory.test.ts new file mode 100644 index 0000000..df72968 --- /dev/null +++ b/tests/contextbench-trajectory.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; +import { 
parseStructuredAnswer } from '../src/eval/contextbench-answer.js'; +import { + fullFileSpan, + normalizeContextBenchPath, + normalizeTrajectory +} from '../src/eval/contextbench-trajectory.js'; +import type { ContextBenchTaskIdentity } from '../src/eval/contextbench-types.js'; + +const task: Pick<ContextBenchTaskIdentity, 'instance_id' | 'repo_url' | 'base_commit'> = { /* only the identity fields the assertions below read */ + instance_id: 'phase38-task', + repo_url: 'https://github.com/example/repo.git', + base_commit: '0123456789abcdef0123456789abcdef01234567' +}; + +describe('ContextBench trajectory normalization', () => { + it('normalizes absolute and Windows paths relative to repo root', () => { + expect(normalizeContextBenchPath('C:\\work\\repo\\src\\index.ts', 'C:/work/repo')).toBe( + 'src/index.ts' + ); + expect(normalizeContextBenchPath('./src/file.ts')).toBe('src/file.ts'); + }); + + it('marks file-only references as explicit full-file spans', () => { + expect(fullFileSpan()).toEqual({ start: 1, end: null, full_file: true }); + }); + + it('deduplicates predicted files while preserving explicit line spans', () => { + const parsed = parseStructuredAnswer( + JSON.stringify({ + answer: 'uses target file', + confidence: 'medium', + evidence: [ + { + file: 'C:/work/repo/src/a.ts', + lineRange: { start: 10, end: 12 }, + reason: 'line evidence' + }, + { file: 'src/a.ts', lineRange: { start: 20, end: 21 }, reason: 'second span' } + ], + filesReferenced: ['src/a.ts', 'src/b.ts'], + symbolsReferenced: [], + unsupportedClaims: [], + readyToEdit: false + }) + ); + expect(parsed.answer).not.toBeNull(); + if (!parsed.answer) return; + const trajectory = normalizeTrajectory({ + task, + answer: parsed.answer, + repoRoot: 'C:/work/repo' + }); + expect(trajectory).toMatchObject({ + instance_id: task.instance_id, + repo_url: task.repo_url, + commit: task.base_commit + }); + expect(trajectory.traj_data.pred_files).toEqual(['src/a.ts', 'src/b.ts']); + expect(trajectory.traj_data.pred_spans['src/a.ts']).toEqual([ + { start: 10, end: 12, full_file: false }, + { start: 20, end: 21, full_file: false } +
]); + expect(trajectory.traj_data.pred_spans['src/b.ts']).toEqual([ + { start: 1, end: null, full_file: true } + ]); + }); +}); diff --git a/tests/fixtures/contextbench-codebase-context-baseline-arms.json b/tests/fixtures/contextbench-codebase-context-baseline-arms.json new file mode 100644 index 0000000..48082fb --- /dev/null +++ b/tests/fixtures/contextbench-codebase-context-baseline-arms.json @@ -0,0 +1,49 @@ +{ + "name": "v2.4-contextbench-codebase-context-baseline-arms", + "protocolVersion": "contextbench-protocol-v1", + "phase": 40, + "claimBearing": false, + "denominatorPolicy": "Diagnostic codebase-context arms stay separate from required competitor denominators and cannot change frozen lane identities, tasks, qrels, budgets, thresholds, or public claims.", + "arms": [ + { + "baselineArmId": "codebase-context-current-map-search", + "laneId": "codebase-context", + "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot", + "allowedToolSurfaces": ["map", "search_codebase"], + "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash", + "setupCommand": "pnpm run build && node dist/index.js --version", + "claimBearing": false, + "failurePolicy": "record_terminal_diagnostic_failure" + }, + { + "baselineArmId": "codebase-context-current-search-only", + "laneId": "codebase-context", + "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot", + "allowedToolSurfaces": ["search_codebase"], + "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash", + "setupCommand": "pnpm run build && node dist/index.js --version", + "claimBearing": false, + "failurePolicy": "record_terminal_diagnostic_failure" + }, + { + "baselineArmId": "codebase-context-current-map-only", + "laneId": "codebase-context", + "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot", + "allowedToolSurfaces": ["map"], + "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash", + 
"setupCommand": "pnpm run build && node dist/index.js --version", + "claimBearing": false, + "failurePolicy": "record_terminal_diagnostic_failure" + }, + { + "baselineArmId": "codebase-context-v2.2.0-package-map-search", + "laneId": "codebase-context", + "sourceIdentity": "published package version 2.2.0 if locally runnable without product patches", + "allowedToolSurfaces": ["map", "search_codebase"], + "versionOrSourceRef": "npm:codebase-context@2.2.0", + "setupCommand": "npx codebase-context@2.2.0 --version", + "claimBearing": false, + "failurePolicy": "record_terminal_diagnostic_failure" + } + ] +} diff --git a/tests/fixtures/contextbench-lane-setup-evidence.json b/tests/fixtures/contextbench-lane-setup-evidence.json new file mode 100644 index 0000000..4e4af04 --- /dev/null +++ b/tests/fixtures/contextbench-lane-setup-evidence.json @@ -0,0 +1,147 @@ +{ + "name": "v2.4-contextbench-lane-setup-evidence", + "protocolVersion": "contextbench-protocol-v1", + "phase": 39, + "claimBearing": false, + "generatedOutputsPolicy": "Setup logs, downloaded tools, caches, and probes stay under ignored outputs/contextbench paths and are not Phase 40 baseline artifacts.", + "records": [ + { + "laneId": "raw-native", + "readinessStatus": "ready_for_phase40", + "docsUrl": "https://docs.anthropic.com/en/docs/claude-code/cli-reference", + "sourceUrl": "https://docs.anthropic.com/en/docs/claude-code/cli-reference", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "node>=18 plus Claude Code CLI" }, + "redactedEnvVars": [], + "commands": [ + { "kind": "setup", "command": "none", "cwd": "", "safeToRunAutomatically": true, "exitCode": 0, "status": "not_required", "durationMs": 0, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": "sha256:not-run-not-required" }, + { "kind": "index", "command": "none", "cwd": "", "safeToRunAutomatically": true, "exitCode": 0, "status": "not_required", "durationMs": 0, "stdoutLogPath": null, "stderrLogPath": null, 
"outputHash": "sha256:not-run-not-required" }, + { "kind": "query", "command": "claude --print < prompt.json", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "version", "command": "claude --version", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": 0, + "indexDurationMs": 0, + "setupStatus": "not_required", + "indexStatus": "not_required", + "logReference": "outputs/contextbench/setup/raw-native/README.md", + "evidenceHash": "sha256:documentation-ready-raw-native", + "nextHumanAction": "Phase 40 must snapshot the exact Claude CLI version and runtime before baseline runs.", + "claimBearing": false + }, + { + "laneId": "codebase-context", + "readinessStatus": "ready_for_phase40", + "docsUrl": "https://github.com/PatrickSys/codebase-context#readme", + "sourceUrl": "https://github.com/PatrickSys/codebase-context", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "node>=18 pnpm>=10" }, + "redactedEnvVars": ["OPENAI_API_KEY"], + "commands": [ + { "kind": "setup", "command": "pnpm run build && node dist/index.js --version", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "index", "command": "npx codebase-context refresh_index ", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "query", "command": "claude --print < prompt.json with codebase-context as the only context tool", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": 
"not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "version", "command": "npx codebase-context --version", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": null, + "indexDurationMs": null, + "setupStatus": "ready", + "indexStatus": "ready", + "logReference": "outputs/contextbench/setup/codebase-context/setup-index.json", + "evidenceHash": "sha256:documentation-ready-codebase-context", + "nextHumanAction": "Phase 40 must measure setup/index duration separately and store raw logs before task execution.", + "claimBearing": false + }, + { + "laneId": "jcodemunch-repomapper", + "readinessStatus": "ready_for_phase40", + "docsUrl": "https://github.com/jgravelle/jcodemunch-mcp/blob/main/USER_GUIDE.md", + "sourceUrl": "https://github.com/jgravelle/jcodemunch-mcp", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "uv or pipx plus MCP stdio" }, + "redactedEnvVars": [], + "commands": [ + { "kind": "setup", "command": "uvx jcodemunch-mcp", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "index", "command": "MCP index_folder {\"path\":\"\",\"incremental\":false,\"use_ai_summaries\":false,\"follow_symlinks\":false}", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "query", "command": "MCP search_symbols {\"repo\":\"\",\"query\":\"\",\"max_results\":10,\"semantic\":false}", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, 
"outputHash": null }, + { "kind": "version", "command": "uvx jcodemunch-mcp --help", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": null, + "indexDurationMs": null, + "setupStatus": "ready", + "indexStatus": "ready", + "logReference": "outputs/contextbench/setup/jcodemunch-repomapper/setup-index.json", + "evidenceHash": "sha256:documentation-ready-jcodemunch-repomapper", + "nextHumanAction": "Phase 40 must run uvx in an isolated benchmark cache and capture MCP tool logs before baseline task execution.", + "claimBearing": false + }, + { + "laneId": "grepai", + "readinessStatus": "invasive_setup_blocked", + "docsUrl": "https://yoanbernabeu.github.io/grepai/commands/grepai_init/", + "sourceUrl": "https://yoanbernabeu.github.io/grepai/watch-guide/", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "grepai CLI plus local embedding provider" }, + "redactedEnvVars": ["OPENAI_API_KEY", "OPENROUTER_API_KEY", "OLLAMA_HOST"], + "commands": [ + { "kind": "setup", "command": "grepai init --yes --provider ollama --backend gob --model nomic-embed-text", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/grepai/setup.stdout.log", "stderrLogPath": "outputs/contextbench/setup/grepai/setup.stderr.log", "outputHash": null }, + { "kind": "index", "command": "grepai watch --no-ui", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/grepai/index.stdout.log", "stderrLogPath": "outputs/contextbench/setup/grepai/index.stderr.log", "outputHash": null }, + { "kind": "query", "command": "grepai search --json --compact", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", 
"durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "version", "command": "grepai version", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": null, + "indexDurationMs": null, + "setupStatus": "blocked", + "indexStatus": "blocked", + "logReference": "outputs/contextbench/setup/grepai/blocked-evidence.json", + "evidenceHash": "sha256:blocked-grepai-local-embedding-prerequisite", + "nextHumanAction": "Approve an isolated GrepAI binary install plus local Ollama/model setup, or keep this lane as terminal missing evidence.", + "claimBearing": false + }, + { + "laneId": "codebase-memory-mcp", + "readinessStatus": "invasive_setup_blocked", + "docsUrl": "https://github.com/DeusData/codebase-memory-mcp", + "sourceUrl": "https://github.com/DeusData/codebase-memory-mcp/releases/latest", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "static MCP binary" }, + "redactedEnvVars": ["CBM_CACHE_DIR", "CBM_DIAGNOSTICS", "CBM_DOWNLOAD_URL"], + "commands": [ + { "kind": "setup", "command": "download Windows x86_64 release archive under outputs/contextbench/tool-cache/codebase-memory-mcp and run the extracted binary without installer auto-config", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/codebase-memory-mcp/setup.stdout.log", "stderrLogPath": "outputs/contextbench/setup/codebase-memory-mcp/setup.stderr.log", "outputHash": null }, + { "kind": "index", "command": "MCP prompt/tool action: Index this project", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/codebase-memory-mcp/index.stdout.log", "stderrLogPath": 
"outputs/contextbench/setup/codebase-memory-mcp/index.stderr.log", "outputHash": null }, + { "kind": "query", "command": "MCP structural query through codebase-memory-mcp only", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "version", "command": "codebase-memory-mcp --version", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": null, + "indexDurationMs": null, + "setupStatus": "blocked", + "indexStatus": "blocked", + "logReference": "outputs/contextbench/setup/codebase-memory-mcp/blocked-evidence.json", + "evidenceHash": "sha256:blocked-codebase-memory-mcp-auto-config-installer", + "nextHumanAction": "Approve a sandboxed manual binary download path that does not run the auto-configuring installer, or keep this lane as terminal missing evidence.", + "claimBearing": false + }, + { + "laneId": "codegraphcontext", + "readinessStatus": "ready_for_phase40", + "docsUrl": "https://pypi.org/project/codegraphcontext/", + "sourceUrl": "https://github.com/CodeGraphContext/CodeGraphContext", + "workingDirectory": "", + "platform": { "os": "win32", "shell": "pwsh", "runtime": "python>=3.10 pip plus Kuzu embedded backend" }, + "redactedEnvVars": [], + "commands": [ + { "kind": "setup", "command": "python -m pip install --target outputs/contextbench/tool-cache/codegraphcontext codegraphcontext kuzu", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "index", "command": "cgc index ", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + 
{ "kind": "query", "command": "cgc analyze callers ", "cwd": "", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }, + { "kind": "version", "command": "cgc help", "cwd": "", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null } + ], + "setupDurationMs": null, + "indexDurationMs": null, + "setupStatus": "ready", + "indexStatus": "ready", + "logReference": "outputs/contextbench/setup/codegraphcontext/setup-index.json", + "evidenceHash": "sha256:documentation-ready-codegraphcontext", + "nextHumanAction": "Phase 40 must install into an isolated benchmark cache, force the embedded Kuzu path where available, and capture setup/index logs before baseline task execution.", + "claimBearing": false + } + ] +} diff --git a/tests/fixtures/contextbench-lane-tool-cards.json b/tests/fixtures/contextbench-lane-tool-cards.json new file mode 100644 index 0000000..8c809eb --- /dev/null +++ b/tests/fixtures/contextbench-lane-tool-cards.json @@ -0,0 +1,203 @@ +{ + "name": "v2.4-contextbench-lane-tool-cards", + "protocolVersion": "contextbench-protocol-v1", + "frozenDate": "2026-04-27", + "cards": [ + { + "laneId": "raw-native", + "displayName": "Raw/native agent tools", + "phase38Status": "executable_fake_smoke_only", + "phase39Status": "ready_for_phase40", + "executableInPhase38": true, + "contextTools": ["native-agent-tools"], + "allowedTools": ["native-read", "native-search", "native-shell-readonly"], + "disallowedTools": [ + "codebase-context", + "jcodemunch-repomapper", + "grepai", + "codebase-memory-mcp", + "codegraphcontext" + ], + "setupCommand": "none", + "indexCommand": "none", + "queryCommand": "claude --print < prompt.json", + "versionCommand": "claude --version", + "cachePath": "outputs/contextbench/cache/raw-native/", + "artifactPaths": { + 
"setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + }, + { + "laneId": "codebase-context", + "displayName": "codebase-context", + "phase38Status": "executable_fake_smoke_only", + "phase39Status": "ready_for_phase40", + "executableInPhase38": true, + "contextTools": ["codebase-context"], + "allowedTools": ["codebase-context"], + "disallowedTools": [ + "native-read", + "native-search", + "native-shell-readonly", + "jcodemunch-repomapper", + "grepai", + "codebase-memory-mcp", + "codegraphcontext" + ], + "setupCommand": "pnpm run build && node dist/index.js --version", + "indexCommand": "npx codebase-context refresh_index ", + "queryCommand": "claude --print < prompt.json", + "versionCommand": "npx codebase-context --version", + "cachePath": "outputs/contextbench/cache/codebase-context/", + "artifactPaths": { + "setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + }, + { + "laneId": "jcodemunch-repomapper", + "displayName": "jCodeMunch RepoMapper", + "phase38Status": "pending_phase39_setup", + "phase39Status": "ready_for_phase40", + "executableInPhase38": false, + "contextTools": ["jcodemunch-repomapper"], + "allowedTools": ["jcodemunch-repomapper"], + "disallowedTools": [ + "native-read", + "native-search", + "native-shell-readonly", + "codebase-context", + "grepai", + "codebase-memory-mcp", + "codegraphcontext" + ], + "setupCommand": "uvx jcodemunch-mcp", + "indexCommand": "MCP index_folder {\"path\":\"\",\"incremental\":false,\"use_ai_summaries\":false,\"follow_symlinks\":false}", + "queryCommand": "MCP search_symbols 
{\"repo\":\"\",\"query\":\"\",\"max_results\":10,\"semantic\":false}", + "versionCommand": "uvx jcodemunch-mcp --help", + "cachePath": "outputs/contextbench/cache/jcodemunch-repomapper/", + "artifactPaths": { + "setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + }, + { + "laneId": "grepai", + "displayName": "GrepAI", + "phase38Status": "pending_phase39_setup", + "phase39Status": "invasive_setup_blocked", + "executableInPhase38": false, + "contextTools": ["grepai"], + "allowedTools": ["grepai"], + "disallowedTools": [ + "native-read", + "native-search", + "native-shell-readonly", + "codebase-context", + "jcodemunch-repomapper", + "codebase-memory-mcp", + "codegraphcontext" + ], + "setupCommand": "grepai init --yes --provider ollama --backend gob --model nomic-embed-text", + "indexCommand": "grepai watch --no-ui", + "queryCommand": "grepai search --json --compact", + "versionCommand": "grepai version", + "cachePath": "outputs/contextbench/cache/grepai/", + "artifactPaths": { + "setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + }, + { + "laneId": "codebase-memory-mcp", + "displayName": "codebase-memory-mcp", + "phase38Status": "pending_phase39_setup", + "phase39Status": "invasive_setup_blocked", + "executableInPhase38": false, + "contextTools": ["codebase-memory-mcp"], + "allowedTools": ["codebase-memory-mcp"], + "disallowedTools": [ + "native-read", + "native-search", + "native-shell-readonly", + "codebase-context", + "jcodemunch-repomapper", + "grepai", + "codegraphcontext" + ], + "setupCommand": "download Windows x86_64 
release archive under outputs/contextbench/tool-cache/codebase-memory-mcp and run the extracted binary without installer auto-config", + "indexCommand": "MCP prompt/tool action: Index this project", + "queryCommand": "MCP structural query through codebase-memory-mcp only", + "versionCommand": "codebase-memory-mcp --version", + "cachePath": "outputs/contextbench/cache/codebase-memory-mcp/", + "artifactPaths": { + "setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + }, + { + "laneId": "codegraphcontext", + "displayName": "CodeGraphContext", + "phase38Status": "pending_phase39_setup", + "phase39Status": "ready_for_phase40", + "executableInPhase38": false, + "contextTools": ["codegraphcontext"], + "allowedTools": ["codegraphcontext"], + "disallowedTools": [ + "native-read", + "native-search", + "native-shell-readonly", + "codebase-context", + "jcodemunch-repomapper", + "grepai", + "codebase-memory-mcp" + ], + "setupCommand": "python -m pip install --target outputs/contextbench/tool-cache/codegraphcontext codegraphcontext kuzu", + "indexCommand": "cgc index ", + "queryCommand": "cgc analyze callers ", + "versionCommand": "cgc help", + "cachePath": "outputs/contextbench/cache/codegraphcontext/", + "artifactPaths": { + "setup": "setup-index.json", + "rawTrace": "raw-trace.json", + "structuredAnswer": "structured-answer.json", + "trajectory": "trajectory.json", + "score": "score.json" + }, + "setupCostReportedSeparately": true, + "indexCostReportedSeparately": true, + "claimBearing": false + } + ] +} From b2fa208a4df0579bfdc41d8ffe2a74b2fae6e93e Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 18:06:53 +0200 Subject: [PATCH 02/11] fix(format): format ContextBench harness sources --- src/eval/contextbench-answer.ts | 12 ++++--- 
src/eval/contextbench-evidence-gate.ts | 49 ++++++++++++++++++++------ 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/src/eval/contextbench-answer.ts b/src/eval/contextbench-answer.ts index 6c9b55d..45d264f 100644 --- a/src/eval/contextbench-answer.ts +++ b/src/eval/contextbench-answer.ts @@ -103,11 +103,11 @@ function isJsonValue(value: unknown): value is JsonValue { export function isValidEvidenceReference(value: unknown): value is ContextBenchEvidenceReference { if (!isRecord(value)) return false; - if (findAdditionalFields(value, evidenceReferenceFields, 'evidence_field').length > 0) return false; + if (findAdditionalFields(value, evidenceReferenceFields, 'evidence_field').length > 0) + return false; const lineRange = value.lineRange; if (!isRecord(lineRange)) return false; - if (findAdditionalFields(lineRange, lineRangeFields, 'line_range_field').length > 0) - return false; + if (findAdditionalFields(lineRange, lineRangeFields, 'line_range_field').length > 0) return false; const start = lineRange.start; const end = lineRange.end; return ( @@ -134,7 +134,11 @@ function validateStructuredAnswer(value: unknown): StructuredAnswerParseResult { if (!(field in value)) errors.push(`missing_${field}`); } errors.push( - ...findAdditionalFields(value, new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS), 'root_field') + ...findAdditionalFields( + value, + new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS), + 'root_field' + ) ); if (!isJsonValue(value.answer)) errors.push('answer_not_json_value'); diff --git a/src/eval/contextbench-evidence-gate.ts b/src/eval/contextbench-evidence-gate.ts index 815616e..79b4f3c 100644 --- a/src/eval/contextbench-evidence-gate.ts +++ b/src/eval/contextbench-evidence-gate.ts @@ -187,8 +187,15 @@ function hasOfficialEvaluatorProof( ); } -function hasDiagnosticFallback(row: ContextBenchRunManifestRow, score: ContextBenchScoreEvidence | undefined): boolean { - return row.scoring.claimBearing === false || 
Boolean(row.scoring.fallbackReason) || score?.mode === 'diagnostic_fallback'; +function hasDiagnosticFallback( + row: ContextBenchRunManifestRow, + score: ContextBenchScoreEvidence | undefined +): boolean { + return ( + row.scoring.claimBearing === false || + Boolean(row.scoring.fallbackReason) || + score?.mode === 'diagnostic_fallback' + ); } function hasLaneIsolationProof( @@ -198,7 +205,8 @@ function hasLaneIsolationProof( ): boolean { if (!isolation?.proven) return false; if (!policy) return false; - if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind)) return false; + if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind)) + return false; if (policy.laneId !== row.lane_id) return false; if (isolation.laneId !== row.lane_id) return false; if (isolation.expectedContextTool !== policy.expectedContextTool) return false; @@ -219,7 +227,8 @@ function hasRunnerProvenance( rawTrace: ContextBenchRawTraceEvidence | undefined, expectedRunnerHash: string | undefined ): boolean { - if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash) return false; + if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash) + return false; return ( rawTrace.executor === row.taskExecution.executor && rawTrace.model === row.taskExecution.model && @@ -228,7 +237,9 @@ function hasRunnerProvenance( ); } -function rowKey(row: Pick): string { +function rowKey( + row: Pick +): string { return `${row.lane_id}\u0000${row.task_id}\u0000${row.repeat_index}`; } @@ -252,7 +263,11 @@ export function evaluateContextBenchEvidenceGate( }); } - if (input.expectedTotalRows <= 0 || input.requiredLaneIds.length === 0 || input.requiredTaskIds.length === 0) { + if ( + input.expectedTotalRows <= 0 || + input.requiredLaneIds.length === 0 || + input.requiredTaskIds.length === 0 + ) { failures.push({ code: 'denominator_contract_missing', message: 'Claim validation 
requires a frozen denominator contract.' @@ -289,7 +304,11 @@ export function evaluateContextBenchEvidenceGate( } if (row.protocol_hash !== input.expectedProtocolHash) { failures.push( - makeFailure(row, 'protocol_hash_mismatch', 'Row protocol hash does not match the frozen protocol hash.') + makeFailure( + row, + 'protocol_hash_mismatch', + 'Row protocol hash does not match the frozen protocol hash.' + ) ); } if (row.task_manifest_hash !== input.expectedTaskManifestHash) { @@ -351,7 +370,9 @@ export function evaluateContextBenchEvidenceGate( const artifacts = input.artifactsByRunId[row.run_id]; if (row.status !== 'completed') { - failures.push(makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.')); + failures.push( + makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.') + ); } if ( @@ -377,11 +398,15 @@ export function evaluateContextBenchEvidenceGate( ); } - if (!hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])) { + if ( + !hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id]) + ) { failures.push( makeFailure( row, - artifacts?.laneIsolation?.violations?.length ? 'lane_isolation_violation' : 'lane_isolation_missing', + artifacts?.laneIsolation?.violations?.length + ? 'lane_isolation_violation' + : 'lane_isolation_missing', 'Lane isolation must be proven by explicit allowed/observed tool evidence.' 
) ); @@ -410,7 +435,9 @@ export function evaluateContextBenchEvidenceGate( } } - const blockingFailures = failures.filter((failure) => failure.code !== 'artifact_verification_missing'); + const blockingFailures = failures.filter( + (failure) => failure.code !== 'artifact_verification_missing' + ); const shapePass = blockingFailures.length === 0; const claimPass = failures.length === 0; return { From 6aed9d1a93f540f0d4a17142ab4527769b97cecb Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 18:25:08 +0200 Subject: [PATCH 03/11] fix(test): isolate ContextBench baseline Git env --- tests/contextbench-baseline-runner.test.ts | 61 +++++++++++++------ .../contextbench-baseline-schema-gate.test.ts | 26 +++++--- tests/contextbench-baseline-snapshot.test.ts | 4 ++ 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts index 41436fd..ab20304 100644 --- a/tests/contextbench-baseline-runner.test.ts +++ b/tests/contextbench-baseline-runner.test.ts @@ -39,6 +39,23 @@ type TaskManifest = { tasks: Array<{ instance_id: string }> }; const manifest = manifestFixture as TaskManifest; vi.setConfig({ testTimeout: 30000 }); +for (const key of Object.keys(process.env)) { + if (key.startsWith('GIT_')) delete process.env[key]; +} + +function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { + const env: NodeJS.ProcessEnv = {}; + for (const [key, value] of Object.entries(process.env)) { + if (!key.startsWith('GIT_')) env[key] = value; + } + return { ...env, ...overrides }; +} + +function ignoreWindowsTempCleanupRace(error: unknown): void { + const code = (error as NodeJS.ErrnoException).code; + if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? 
'')) throw error; +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`)), @@ -60,12 +77,12 @@ function createCleanGitRepo(root: string): string { const repoPath = path.join(root, 'repo'); mkdirSync(repoPath, { recursive: true }); writeFileSync(path.join(repoPath, 'README.md'), '# ContextBench fixture\n', 'utf8'); - execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' }); - execFileSync('git', ['add', 'README.md'], { cwd: repoPath, encoding: 'utf8' }); + execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8', env: childEnv() }); + execFileSync('git', ['add', 'README.md'], { cwd: repoPath, encoding: 'utf8', env: childEnv() }); execFileSync( 'git', ['-c', 'user.name=ContextBench Test', '-c', 'user.email=contextbench@example.invalid', 'commit', '-m', 'fixture'], - { cwd: repoPath, encoding: 'utf8' } + { cwd: repoPath, encoding: 'utf8', env: childEnv() } ); return repoPath; } @@ -141,7 +158,9 @@ describe('ContextBench Phase 40 baseline runner', () => { } finally { rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { recursive: true, - force: true + force: true, + maxRetries: 10, + retryDelay: 200 }); } }); @@ -222,14 +241,13 @@ describe('ContextBench Phase 40 baseline runner', () => { const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); const stubClaude = writeStubClaude(tempRoot); const stubEvaluator = writeStubEvaluator(tempRoot, 0); - const env = { - ...process.env, + const env = childEnv({ CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) - }; + }); try { execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', 
sessionRoot], { encoding: 'utf8', @@ -301,14 +319,13 @@ describe('ContextBench Phase 40 baseline runner', () => { const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); const stubClaude = writeStubClaude(tempRoot); const stubEvaluator = writeStubEvaluator(tempRoot, 0, 'not json'); - const env = { - ...process.env, + const env = childEnv({ CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) - }; + }); try { execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8', @@ -374,14 +391,13 @@ describe('ContextBench Phase 40 baseline runner', () => { const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); const stubClaude = writeStubClaude(tempRoot); const stubEvaluator = writeStubEvaluator(tempRoot, 0, testCase.output); - const env = { - ...process.env, + const env = childEnv({ CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) - }; + }); try { execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8', @@ -426,11 +442,10 @@ describe('ContextBench Phase 40 baseline runner', () => { const payloadPath = writePayloadFile(tempRoot, taskId, repoPath); const stubClaude = writeStubClaude(tempRoot); const stubEvaluator = writeStubEvaluator(tempRoot, 1); - const env = { - ...process.env, + const env = childEnv({ CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: 
JSON.stringify([process.execPath, stubEvaluator]) - }; + }); try { execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8', @@ -1086,10 +1101,16 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(result.stdout).toContain('phase42 verification failed'); expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + try { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true, + maxRetries: 10, + retryDelay: 200 + }); + } catch (error) { + ignoreWindowsTempCleanupRace(error); + } } }); }); diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts index a1b808d..34b3e92 100644 --- a/tests/contextbench-baseline-schema-gate.test.ts +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -24,6 +24,18 @@ type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }> const manifest = manifestFixture as TaskManifest; vi.setConfig({ testTimeout: 30000 }); +for (const key of Object.keys(process.env)) { + if (key.startsWith('GIT_')) delete process.env[key]; +} + +function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { + const env: NodeJS.ProcessEnv = {}; + for (const [key, value] of Object.entries(process.env)) { + if (!key.startsWith('GIT_')) env[key] = value; + } + return { ...env, ...overrides }; +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)), @@ -73,15 +85,14 @@ function createClaudeStub( chmodSync(shellStub, 0o755); return { stubDir, - env: { - ...process.env, + env: childEnv({ PATH: `${stubDir}${path.delimiter}${process.env.PATH ?? 
''}`, Path: `${stubDir}${path.delimiter}${process.env.Path ?? process.env.PATH ?? ''}`, CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubScript]), CLAUDE_STUB_STDOUT: stdout, CLAUDE_STUB_CWD_PATH: capture?.cwdPath, CLAUDE_STUB_STDIN_PATH: capture?.stdinPath - } + }) }; } @@ -99,7 +110,7 @@ function writeTaskPayloads( function createGitCheckout(): string { const repoPath = mkdtempSync(path.join(tmpdir(), 'contextbench-task-repo-')); - execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' }); + execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8', env: childEnv() }); execFileSync( 'git', [ @@ -112,7 +123,7 @@ function createGitCheckout(): string { '-m', 'init' ], - { cwd: repoPath, encoding: 'utf8' } + { cwd: repoPath, encoding: 'utf8', env: childEnv() } ); return repoPath; } @@ -163,8 +174,7 @@ function createAdapterStub( ); return { stubDir, - env: { - ...process.env, + env: childEnv({ [`CONTEXTBENCH_${executor.toUpperCase()}_COMMAND`]: JSON.stringify([ process.execPath, stubScript @@ -172,7 +182,7 @@ function createAdapterStub( ADAPTER_STUB_EXECUTOR: executor, ADAPTER_STUB_CWD_PATH: capture?.cwdPath, ADAPTER_STUB_ARGS_PATH: capture?.argsPath - } + }) }; } diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts index 1061826..6ab133b 100644 --- a/tests/contextbench-baseline-snapshot.test.ts +++ b/tests/contextbench-baseline-snapshot.test.ts @@ -29,6 +29,10 @@ type BaselineSession = { vi.setConfig({ testTimeout: 30000 }); +for (const key of Object.keys(process.env)) { + if (key.startsWith('GIT_')) delete process.env[key]; +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-`)), From 0360cb97d99337438e1922bf52a76833b9d20fd6 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 18:29:24 +0200 Subject: [PATCH 04/11] fix(test): tolerate ContextBench temp cleanup races --- 
.../contextbench-baseline-schema-gate.test.ts | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts index 34b3e92..9513a7b 100644 --- a/tests/contextbench-baseline-schema-gate.test.ts +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -36,6 +36,11 @@ function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { return { ...env, ...overrides }; } +function ignoreWindowsTempCleanupRace(error: unknown): void { + const code = (error as NodeJS.ErrnoException).code; + if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? '')) throw error; +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)), @@ -398,10 +403,16 @@ describe('ContextBench Phase 40 schema gate', () => { }; expect(trajectory.traj_data.pred_files).toContain('src/a.ts'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + try { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true, + maxRetries: 10, + retryDelay: 200 + }); + } catch (error) { + ignoreWindowsTempCleanupRace(error); + } rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); rmSync(stubDir, { recursive: true, force: true }); From cad646d9d940c00ab96baa0ca806070722cced32 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 18:47:38 +0200 Subject: [PATCH 05/11] fix(test): relax slow Windows search timeouts --- tests/search-decision-card.test.ts | 18 ++++++++++-------- tests/search-snippets.test.ts | 14 ++++++++------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/search-decision-card.test.ts b/tests/search-decision-card.test.ts index d99b4c7..c6d77ae 100644 --- 
a/tests/search-decision-card.test.ts +++ b/tests/search-decision-card.test.ts @@ -40,6 +40,8 @@ type ToolCallResponse = { isError?: boolean; }; +const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; + function getToolCallHandler( server: unknown ): (request: ToolCallRequest) => Promise { @@ -153,7 +155,7 @@ export class ProfileService { config: { skipEmbedding: true } }); await indexer.index(); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); afterEach(async () => { if (originalArgv) { @@ -170,7 +172,7 @@ export class ProfileService { await rmWithRetries(tempRoot); tempRoot = null; } - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('intent="edit" with multiple results returns full decision card with ready field', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -207,7 +209,7 @@ export class ProfileService { } expect(preflight.ready).toBeDefined(); expect(typeof preflight.ready).toBe('boolean'); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('decision card has all expected fields when returned', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -259,7 +261,7 @@ export class ProfileService { if (preflight.whatWouldHelp) { expect(Array.isArray(preflight.whatWouldHelp)).toBe(true); } - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('intent="explore" returns lightweight preflight', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -290,7 +292,7 @@ export class ProfileService { expect(typeof preflight.ready).toBe('boolean'); // Should NOT have full decision card fields for explore } - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('includes snippet field when includeSnippets=true', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -321,7 +323,7 @@ export class ProfileService { // At least some results should have a snippet const withSnippets = parsed.results.filter((result) => result.snippet); expect(withSnippets.length).toBeGreaterThan(0); - }, 30000); + }, 
SLOW_WINDOWS_TEST_TIMEOUT_MS); it('does not include snippet field when includeSnippets=false', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -350,7 +352,7 @@ export class ProfileService { parsed.results.forEach((result) => { expect(result.snippet).toBeUndefined(); }); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('scope header starts snippet when includeSnippets=true', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -381,5 +383,5 @@ export class ProfileService { const firstLine = withSnippet.snippet.split('\n')[0].trim(); expect(firstLine).toMatch(/^\/\//); } - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); }); diff --git a/tests/search-snippets.test.ts b/tests/search-snippets.test.ts index 4b387ed..d5cf634 100644 --- a/tests/search-snippets.test.ts +++ b/tests/search-snippets.test.ts @@ -11,6 +11,8 @@ vi.mock('../src/core/reranker.js', () => ({ isAmbiguous: vi.fn(() => false) })); +const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; + describe('Search Snippets with Scope Headers', () => { let tempRoot: string | null = null; @@ -98,7 +100,7 @@ export const VERSION = '1.0.0'; config: { skipEmbedding: true } }); await indexer.index(); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); afterEach(async () => { if (tempRoot) { @@ -106,7 +108,7 @@ export const VERSION = '1.0.0'; tempRoot = null; } delete process.env.CODEBASE_ROOT; - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('returns snippets when includeSnippets=true', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -136,7 +138,7 @@ export const VERSION = '1.0.0'; const withSnippets = parsed.results.filter((r: any) => r.snippet); expect(withSnippets.length).toBeGreaterThan(0); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('scope header is a comment line starting with //', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -167,7 +169,7 @@ export const VERSION = '1.0.0'; // Scope header should be a 
comment line expect(firstLine).toMatch(/^\/\//); } - }); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('does not include snippet when includeSnippets=false', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -195,7 +197,7 @@ export const VERSION = '1.0.0'; parsed.results.forEach((r: any) => { expect(r.snippet).toBeUndefined(); }); - }); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('snippet is a string starting with code or comment', async () => { if (!tempRoot) throw new Error('tempRoot not initialized'); @@ -225,5 +227,5 @@ export const VERSION = '1.0.0'; expect(typeof withSnippet.snippet).toBe('string'); expect(withSnippet.snippet.length).toBeGreaterThan(0); } - }); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); }); From 45139796f4e0cc51854de906b0b40b66beb8b4e3 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 20:08:08 +0200 Subject: [PATCH 06/11] fix(eval): align ContextBench harness evidence contracts --- scripts/contextbench-runner.mjs | 8 +++++- src/eval/contextbench-artifacts.ts | 13 ++------- src/eval/contextbench-scoring.ts | 20 ++++++++++++-- tests/contextbench-baseline-runner.test.ts | 9 +++--- tests/contextbench-runner-contract.test.ts | 13 +++++++++ tests/contextbench-scoring.test.ts | 32 +++++++++++++++++++++- tests/impact-2hop.test.ts | 4 ++- tests/search-compact-mode.test.ts | 4 ++- 8 files changed, 83 insertions(+), 20 deletions(-) diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs index 11285f4..658af9f 100644 --- a/scripts/contextbench-runner.mjs +++ b/scripts/contextbench-runner.mjs @@ -995,6 +995,12 @@ function laneTelemetryOverrides() { function buildLaneIsolationEvidence(laneCard) { const telemetry = laneTelemetryOverrides()[laneCard.laneId]; + const acceptedSourceKinds = new Set(['not_captured', 'env_override', 'transcript', 'proxy']); + const sourceKind = acceptedSourceKinds.has(telemetry?.sourceKind) + ? telemetry.sourceKind + : telemetry?.proofSource + ? 
'env_override' + : 'not_captured'; const observedTools = Array.isArray(telemetry?.observedTools) ? telemetry.observedTools.filter((tool) => typeof tool === 'string') : []; @@ -1010,7 +1016,7 @@ function buildLaneIsolationEvidence(laneCard) { return { laneId: laneCard.laneId, proven, - sourceKind: telemetry?.proofSource ? 'env_override' : 'not_captured', + sourceKind, proofSource: typeof telemetry?.proofSource === 'string' ? telemetry.proofSource : 'not_captured', expectedContextTool, allowedTools: laneCard.allowedTools, diff --git a/src/eval/contextbench-artifacts.ts b/src/eval/contextbench-artifacts.ts index e888b34..bf13a14 100644 --- a/src/eval/contextbench-artifacts.ts +++ b/src/eval/contextbench-artifacts.ts @@ -7,6 +7,7 @@ import type { ContextBenchLaneSetupEvidenceRecord, ContextBenchLaneToolCard, ContextBenchRunManifestRow, + ContextBenchSetupIndexMetadata, ContextBenchTerminalStatus, ContextBenchTaskIdentity } from './contextbench-types.js'; @@ -125,6 +126,7 @@ export function buildManifestRow(params: { startedAt: string; completedAt: string; paths: ArtifactPathSet; + setupIndex: ContextBenchSetupIndexMetadata; hashes: Record; executor: ContextBenchExecutor; model: string; @@ -150,16 +152,7 @@ export function buildManifestRow(params: { setup_index_path: params.paths.setupIndexPath, prompt_path: params.paths.promptPath, lane_tool_card_path: params.paths.laneToolCardPath, - setupIndex: { - setupCommand: params.laneCard.setupCommand, - indexCommand: params.laneCard.indexCommand, - setupDurationMs: 0, - indexDurationMs: 0, - setupLogPath: params.paths.setupIndexPath, - indexLogPath: params.paths.setupIndexPath, - setupStatus: params.laneCard.setupCommand === 'none' ? 'not_required' : 'completed', - indexStatus: params.laneCard.indexCommand === 'none' ? 
'not_required' : 'completed' - }, + setupIndex: params.setupIndex, taskExecution: { model: params.model, timeoutSeconds: params.timeoutSeconds, diff --git a/src/eval/contextbench-scoring.ts b/src/eval/contextbench-scoring.ts index 8df61fb..e20c0f8 100644 --- a/src/eval/contextbench-scoring.ts +++ b/src/eval/contextbench-scoring.ts @@ -21,6 +21,7 @@ export interface OfficialEvaluatorParams { outputPath: string; cachePath?: string; cwd?: string; + claimAllowed?: boolean; runner: ContextBenchProcessRunner; } @@ -32,6 +33,11 @@ export interface ContextBenchScoreResult { stdout: string; stderr: string; exitStatus: number | null; + exitCode: number | null; + officialEvaluatorFirst: boolean; + officialEvaluatorAttempted: boolean; + officialEvaluatorInvoked: boolean; + outputPath: string; fallbackReason?: string; } @@ -67,11 +73,16 @@ export async function scoreWithOfficialEvaluatorFirst( const score = { status: 'completed' as const, mode: 'official_evaluator' as const, - claimBearing: true, + claimBearing: params.claimAllowed === true, command, stdout: result.stdout, stderr: result.stderr, - exitStatus: result.status + exitStatus: result.status, + exitCode: result.status, + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + outputPath: params.outputPath }; writeJson(params.outputPath, score); return score; @@ -85,6 +96,11 @@ export async function scoreWithOfficialEvaluatorFirst( stdout: result.stdout, stderr: result.stderr, exitStatus: result.status, + exitCode: result.status, + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + outputPath: params.outputPath, fallbackReason: 'official_evaluator_failed' }; writeJson(params.outputPath, score); diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts index ab20304..e30557e 100644 --- a/tests/contextbench-baseline-runner.test.ts +++ b/tests/contextbench-baseline-runner.test.ts @@ 
-245,7 +245,7 @@ describe('ContextBench Phase 40 baseline runner', () => { CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ - 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + 'raw-native': { sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) }); try { @@ -299,10 +299,11 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(score.stdoutPath).toBeTruthy(); expect(score.stderrPath).toBeTruthy(); const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? '', 'utf8')) as { - laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] }; + laneIsolation?: { proven: boolean; sourceKind: string; proofSource: string; observedTools: string[] }; }; expect(rawTrace.laneIsolation).toMatchObject({ proven: true, + sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }); @@ -323,7 +324,7 @@ describe('ContextBench Phase 40 baseline runner', () => { CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ - 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + 'raw-native': { sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) }); try { @@ -395,7 +396,7 @@ describe('ContextBench Phase 40 baseline runner', () => { CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]), CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]), CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({ - 'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } + 'raw-native': { 
sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] } }) }); try { diff --git a/tests/contextbench-runner-contract.test.ts b/tests/contextbench-runner-contract.test.ts index 76455ab..930afff 100644 --- a/tests/contextbench-runner-contract.test.ts +++ b/tests/contextbench-runner-contract.test.ts @@ -218,6 +218,16 @@ describe('ContextBench Phase 38 runner contract', () => { writeJsonArtifact(paths.structuredAnswerPath, { answer: 'x' }); writeJsonArtifact(paths.trajectoryPath, { pred_files: [] }); writeJsonArtifact(paths.scorePath, { claimBearing: false }); + const setupIndex = { + setupCommand: laneCard.setupCommand, + indexCommand: laneCard.indexCommand, + setupDurationMs: 12, + indexDurationMs: 34, + setupLogPath: paths.setupIndexPath, + indexLogPath: paths.setupIndexPath, + setupStatus: 'not_required' as const, + indexStatus: 'not_required' as const + }; const row = buildManifestRow({ runId, protocolVersion: protocol.protocolVersion, @@ -230,6 +240,7 @@ describe('ContextBench Phase 38 runner contract', () => { startedAt: '2026-04-27T00:00:00.000Z', completedAt: '2026-04-27T00:00:01.000Z', paths, + setupIndex, hashes: { protocol: hashJson(protocol) }, executor: 'fake', model: 'fake-executor', @@ -247,6 +258,8 @@ describe('ContextBench Phase 38 runner contract', () => { expect(rows).toHaveLength(2); expect(rows[1].status).toBe('invalid_schema'); expect(rows[0].setupIndex.setupCommand).toBe(laneCard.setupCommand); + expect(rows[0].setupIndex.setupDurationMs).toBe(12); + expect(rows[0].setupIndex.indexDurationMs).toBe(34); } finally { rmSync(outDir, { recursive: true, force: true }); } diff --git a/tests/contextbench-scoring.test.ts b/tests/contextbench-scoring.test.ts index 5e4c2a6..60bab4d 100644 --- a/tests/contextbench-scoring.test.ts +++ b/tests/contextbench-scoring.test.ts @@ -27,12 +27,17 @@ describe('ContextBench official-evaluator-first scoring', () => { predictionPath: path.join(outDir, 'trajectory.json'), outputPath: 
path.join(outDir, 'score.json'), cachePath: path.join(outDir, 'cache'), + claimAllowed: true, runner }); expect(result).toMatchObject({ status: 'completed', mode: 'official_evaluator', - claimBearing: true + claimBearing: true, + officialEvaluatorFirst: true, + officialEvaluatorAttempted: true, + officialEvaluatorInvoked: true, + exitCode: 0 }); expect(calls[0].command).toBe('python'); expect(calls[0].args).toEqual( @@ -43,6 +48,29 @@ describe('ContextBench official-evaluator-first scoring', () => { } }); + it('does not mark successful official evaluator output claim-bearing without protocol permission', async () => { + const outDir = tempDir(); + const runner: ContextBenchProcessRunner = async () => ({ status: 0, stdout: 'ok', stderr: '' }); + try { + const result = await scoreWithOfficialEvaluatorFirst({ + goldPath: path.join(outDir, 'gold.parquet'), + predictionPath: path.join(outDir, 'trajectory.json'), + outputPath: path.join(outDir, 'score.json'), + claimAllowed: false, + runner + }); + expect(result).toMatchObject({ + status: 'completed', + mode: 'official_evaluator', + claimBearing: false, + officialEvaluatorInvoked: true, + exitCode: 0 + }); + } finally { + rmSync(outDir, { recursive: true, force: true }); + } + }); + it('writes diagnostic non-claim-bearing fallback metadata when the evaluator fails', async () => { const outDir = tempDir(); const runner: ContextBenchProcessRunner = async () => ({ @@ -61,6 +89,8 @@ describe('ContextBench official-evaluator-first scoring', () => { status: 'judge_failed', mode: 'diagnostic_fallback', claimBearing: false, + officialEvaluatorInvoked: true, + exitCode: 1, fallbackReason: 'official_evaluator_failed' }); expect(result.stderr).toContain('No module named'); diff --git a/tests/impact-2hop.test.ts b/tests/impact-2hop.test.ts index cf1f84f..010499e 100644 --- a/tests/impact-2hop.test.ts +++ b/tests/impact-2hop.test.ts @@ -15,6 +15,8 @@ import { RELATIONSHIPS_FILENAME } from '../src/constants/codebase-context.js'; 
+const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; + vi.mock('../src/core/reranker.js', () => ({ rerank: vi.fn(async (_query: string, results: unknown) => results), getRerankerStatus: vi.fn(() => 'fallback'), @@ -127,5 +129,5 @@ describe('Impact candidates (2-hop)', () => { `Expected hop 2 candidate src/a.ts, got impact.details=${JSON.stringify(details)}` ); } - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); }); diff --git a/tests/search-compact-mode.test.ts b/tests/search-compact-mode.test.ts index 92f0327..c4d573c 100644 --- a/tests/search-compact-mode.test.ts +++ b/tests/search-compact-mode.test.ts @@ -50,6 +50,8 @@ function parseSearchResponse(text: string): SearchResponse { return JSON.parse(text) as SearchResponse; } +const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; + describe('search_codebase compact/full mode', () => { let tempRoot: string | null = null; let originalArgv: string[] | null = null; @@ -572,7 +574,7 @@ describe('search_codebase compact/full mode', () => { expect(results[0].filePath).toBe(actualChunk.filePath); expect(results[0].imports).toEqual(actualChunk.imports); expect(results[0].exports).toEqual(actualChunk.exports); - }, 30000); + }, SLOW_WINDOWS_TEST_TIMEOUT_MS); it('adds a warning only when the final full payload exceeds the compact budget threshold', async () => { const oversizedSummary = 'Token-heavy summary '.repeat(1200); From a155d5646dbb283ffac1e71eef7fb26b8a59fa40 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 20:13:07 +0200 Subject: [PATCH 07/11] fix(test): tolerate ContextBench schema cleanup races --- tests/contextbench-baseline-schema-gate.test.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts index 9513a7b..d5d91bf 100644 --- a/tests/contextbench-baseline-schema-gate.test.ts +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -475,10 +475,16 @@ describe('ContextBench Phase 40 schema 
gate', () => { expect.arrayContaining(['additional_root_field_unexpectedRoot']) ); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + try { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true, + maxRetries: 10, + retryDelay: 200 + }); + } catch (error) { + ignoreWindowsTempCleanupRace(error); + } rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); rmSync(stubDir, { recursive: true, force: true }); From c027703092a81c90b5c19371873858e5a87ec00c Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 20:21:18 +0200 Subject: [PATCH 08/11] fix(test): tolerate ContextBench runner cleanup races --- tests/contextbench-baseline-runner.test.ts | 66 ++++++++-------------- 1 file changed, 22 insertions(+), 44 deletions(-) diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts index e30557e..76b7264 100644 --- a/tests/contextbench-baseline-runner.test.ts +++ b/tests/contextbench-baseline-runner.test.ts @@ -56,6 +56,19 @@ function ignoreWindowsTempCleanupRace(error: unknown): void { if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? 
'')) throw error; } +function cleanupSessionRoot(sessionRoot: string): void { + try { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true, + maxRetries: 10, + retryDelay: 200 + }); + } catch (error) { + ignoreWindowsTempCleanupRace(error); + } +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`)), @@ -156,12 +169,7 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(rows.every((row) => row.scoring.officialEvaluatorInvoked === false)).toBe(true); expect(rows.every((row) => !('taskWallTimeMs' in row.setupIndex))).toBe(true); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true, - maxRetries: 10, - retryDelay: 200 - }); + cleanupSessionRoot(sessionRoot); } }); @@ -226,10 +234,7 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(rawTrace.scriptedAgentDecisions).toBe(false); expect(rawTrace.antiScriptingBoundary).toEqual(expect.arrayContaining(['file_selection'])); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -553,10 +558,7 @@ describe('ContextBench Phase 40 baseline runner', () => { ) as { phase: number }; expect(session.phase).toBe(41); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -617,10 +619,7 @@ describe('ContextBench Phase 40 baseline runner', () => { setupLogPath: measurement.setupLogPath }); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -656,10 +655,7 @@ describe('ContextBench Phase 40 baseline runner', () 
=> { expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement'); expect(attempt?.setupIndex.setupStatus).toBe('setup_failed'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -740,10 +736,7 @@ describe('ContextBench Phase 40 baseline runner', () => { indexLogPath }); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -883,10 +876,7 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(attempt?.status).toBe('setup_failed'); expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -936,10 +926,7 @@ describe('ContextBench Phase 40 baseline runner', () => { ) as { reservations: unknown[] }; expect(reservations.reservations).toHaveLength(20 * 6 * 3); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -1102,16 +1089,7 @@ describe('ContextBench Phase 40 baseline runner', () => { expect(result.stdout).toContain('phase42 verification failed'); expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate'); } finally { - try { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true, - maxRetries: 10, - retryDelay: 200 - }); - } catch (error) { - ignoreWindowsTempCleanupRace(error); - } + cleanupSessionRoot(sessionRoot); } }); }); From 5a5bf68302745f90b1dbdfba3ab06cfff961d4d5 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 20:24:56 +0200 Subject: [PATCH 09/11] fix(test): relax 
zombie guard timeout jitter --- tests/zombie-guard.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zombie-guard.test.ts b/tests/zombie-guard.test.ts index f286d6e..07be992 100644 --- a/tests/zombie-guard.test.ts +++ b/tests/zombie-guard.test.ts @@ -170,8 +170,8 @@ describe('zombie process prevention', () => { expect(result.code).toBe(1); // Should still honor a short timeout (allow CI/Windows process jitter). expect(elapsed).toBeGreaterThan(800); - expect(elapsed).toBeLessThan(8_000); - }, 12_000); + expect(elapsed).toBeLessThan(12_000); + }, 15_000); it('exits after post-initialize idle timeout when the client stays silent', async () => { const rootPath = createIdleTestProjectRoot(); From 867ac700d98ad141ee180f6353784f9dab1f26fc Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 21:04:04 +0200 Subject: [PATCH 10/11] fix(eval): preserve ContextBench executor model provenance --- scripts/contextbench-runner.mjs | 4 ++-- tests/contextbench-baseline-schema-gate.test.ts | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs index 658af9f..332542c 100644 --- a/scripts/contextbench-runner.mjs +++ b/scripts/contextbench-runner.mjs @@ -1987,7 +1987,7 @@ function runOneBaselineAttempt( }; const rawTrace = { executor, - model: executor === 'claude' ? model : 'fake-executor', + model: executor === 'fake' ? 'fake-executor' : model, runnerHash: runnerSourceHash(), claimBearing: false, stdout, @@ -2481,7 +2481,7 @@ function runOneCodebaseContextArmAttempt( writeJson(paths.setupIndex, { ...setupIndex, diagnosticBaselineArm: arm }); writeJson(paths.rawTrace, { executor, - model: executor === 'claude' ? model : 'fake-executor', + model: executor === 'fake' ? 
'fake-executor' : model, runnerHash: runnerSourceHash(), claimBearing: false, baselineArmId: arm.baselineArmId, diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts index d5d91bf..9fa3330 100644 --- a/tests/contextbench-baseline-schema-gate.test.ts +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -17,6 +17,7 @@ type ManifestRow = { structured_answer_path: string; trajectory_path: string; scoring: { claimBearing: boolean }; + taskExecution: { model: string; executor: string }; }; type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }> }; @@ -609,11 +610,15 @@ describe('ContextBench Phase 40 schema gate', () => { expect(row.status).toBe('completed'); const rawTrace = JSON.parse(readFileSync(row.raw_trace_path, 'utf8')) as { executor: string; + model: string; executorSchemaMode: string; executorArgs: string[]; taskContext: { materialized: boolean; verificationStrict: boolean }; structuredAnswerParseErrors: string[]; }; + expect(rawTrace.model).toBe('stub'); + expect(rawTrace.model).toBe(row.taskExecution.model); + expect(rawTrace.executor).toBe(row.taskExecution.executor); expect(rawTrace.taskContext).toMatchObject({ materialized: true, verificationStrict: false From c5a74afb64c65b255a363e31974fa7be6d58242d Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Wed, 29 Apr 2026 21:14:49 +0200 Subject: [PATCH 11/11] fix(test): harden ContextBench schema cleanup --- .../contextbench-baseline-schema-gate.test.ts | 65 ++++++------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts index 9fa3330..ad6fc0a 100644 --- a/tests/contextbench-baseline-schema-gate.test.ts +++ b/tests/contextbench-baseline-schema-gate.test.ts @@ -42,6 +42,19 @@ function ignoreWindowsTempCleanupRace(error: unknown): void { if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? 
'')) throw error; } +function cleanupSessionRoot(sessionRoot: string): void { + try { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true, + maxRetries: 10, + retryDelay: 200 + }); + } catch (error) { + ignoreWindowsTempCleanupRace(error); + } +} + function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { return path.join( mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)), @@ -298,10 +311,7 @@ describe('ContextBench Phase 40 schema gate', () => { }; expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -404,16 +414,7 @@ describe('ContextBench Phase 40 schema gate', () => { }; expect(trajectory.traj_data.pred_files).toContain('src/a.ts'); } finally { - try { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true, - maxRetries: 10, - retryDelay: 200 - }); - } catch (error) { - ignoreWindowsTempCleanupRace(error); - } + cleanupSessionRoot(sessionRoot); rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); rmSync(stubDir, { recursive: true, force: true }); @@ -476,16 +477,7 @@ describe('ContextBench Phase 40 schema gate', () => { expect.arrayContaining(['additional_root_field_unexpectedRoot']) ); } finally { - try { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true, - maxRetries: 10, - retryDelay: 200 - }); - } catch (error) { - ignoreWindowsTempCleanupRace(error); - } + cleanupSessionRoot(sessionRoot); rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); rmSync(stubDir, { recursive: true, force: true }); @@ -539,10 +531,7 @@ 
describe('ContextBench Phase 40 schema gate', () => { ]) ); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); } }); @@ -632,10 +621,7 @@ describe('ContextBench Phase 40 schema gate', () => { } } } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); for (const stubDir of stubs) rmSync(stubDir, { recursive: true, force: true }); @@ -707,10 +693,7 @@ describe('ContextBench Phase 40 schema gate', () => { repoCheckoutPath: repoPath }); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: true, force: true }); rmSync(stubDir, { recursive: true, force: true }); @@ -877,10 +860,7 @@ describe('ContextBench Phase 40 schema gate', () => { expect(rawTrace.taskContext.errors).toContain('repo_checkout_dirty'); expect(rawTrace.taskContext.statusShort).toContain('dirty.txt'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); rmSync(payloadDir, { recursive: true, force: true }); rmSync(dirtyRepo, { recursive: true, force: true }); } @@ -953,10 +933,7 @@ describe('ContextBench Phase 40 schema gate', () => { }; expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer'); } finally { - rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { - recursive: true, - force: true - }); + cleanupSessionRoot(sessionRoot); rmSync(repoPath, { recursive: true, force: true }); rmSync(payloadDir, { recursive: 
true, force: true }); rmSync(stubDir, { recursive: true, force: true });