From ffa7e7313fcf466b7467f9ff21331d37fc9e0c32 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 18:06:03 +0200
Subject: [PATCH 01/11] test(eval): add ContextBench harness core

---
 scripts/contextbench-retrieval-gate.mjs       | 1249 ++++++
 scripts/contextbench-runner.mjs               | 3586 +++++++++++++++++
 src/eval/contextbench-answer.ts               |  229 ++
 src/eval/contextbench-artifacts.ts            |  184 +
 src/eval/contextbench-evidence-gate.ts        |  422 ++
 src/eval/contextbench-scoring.ts              |  107 +
 src/eval/contextbench-trajectory.ts           |   77 +
 src/eval/contextbench-types.ts                |  434 ++
 tests/contextbench-baseline-runner.test.ts    | 1095 +++++
 .../contextbench-baseline-schema-gate.test.ts |  944 +++++
 tests/contextbench-baseline-snapshot.test.ts  |  133 +
 tests/contextbench-lane-setup.test.ts         |  156 +
 ...contextbench-phase42-evidence-gate.test.ts |  372 ++
 tests/contextbench-runner-contract.test.ts    |  321 ++
 tests/contextbench-scoring.test.ts            |   97 +
 tests/contextbench-trajectory.test.ts         |   68 +
 ...tbench-codebase-context-baseline-arms.json |   49 +
 .../contextbench-lane-setup-evidence.json     |  147 +
 .../contextbench-lane-tool-cards.json         |  203 +
 19 files changed, 9873 insertions(+)
 create mode 100644 scripts/contextbench-retrieval-gate.mjs
 create mode 100644 scripts/contextbench-runner.mjs
 create mode 100644 src/eval/contextbench-answer.ts
 create mode 100644 src/eval/contextbench-artifacts.ts
 create mode 100644 src/eval/contextbench-evidence-gate.ts
 create mode 100644 src/eval/contextbench-scoring.ts
 create mode 100644 src/eval/contextbench-trajectory.ts
 create mode 100644 src/eval/contextbench-types.ts
 create mode 100644 tests/contextbench-baseline-runner.test.ts
 create mode 100644 tests/contextbench-baseline-schema-gate.test.ts
 create mode 100644 tests/contextbench-baseline-snapshot.test.ts
 create mode 100644 tests/contextbench-lane-setup.test.ts
 create mode 100644 tests/contextbench-phase42-evidence-gate.test.ts
 create mode 100644 tests/contextbench-runner-contract.test.ts
 create mode 100644 tests/contextbench-scoring.test.ts
 create mode 100644 tests/contextbench-trajectory.test.ts
 create mode 100644 tests/fixtures/contextbench-codebase-context-baseline-arms.json
 create mode 100644 tests/fixtures/contextbench-lane-setup-evidence.json
 create mode 100644 tests/fixtures/contextbench-lane-tool-cards.json

diff --git a/scripts/contextbench-retrieval-gate.mjs b/scripts/contextbench-retrieval-gate.mjs
new file mode 100644
index 0000000..d81b9e5
--- /dev/null
+++ b/scripts/contextbench-retrieval-gate.mjs
@@ -0,0 +1,1249 @@
+#!/usr/bin/env node
+import { createHash } from 'node:crypto';
+import { spawn, spawnSync } from 'node:child_process';
+import {
+  appendFileSync,
+  existsSync,
+  mkdirSync,
+  readdirSync,
+  readFileSync,
+  statSync,
+  writeFileSync
+} from 'node:fs';
+import { dirname, extname, isAbsolute, join, relative, resolve } from 'node:path';
+
+// Default materialized task payload file; parseArgs() uses it unless --task-payloads is given.
+const DEFAULT_PAYLOADS =
+  'benchmark-runs/contextbench/phase40/task-payloads/contextbench-phase40-task-payloads.json';
+// Default task instance id; parseArgs() uses it unless --task-id is given.
+const DEFAULT_TASK_ID = 'Multi-SWE-Bench__c__maintenance__bugfix__5e659108';
+// Default gold file path; parseArgs() uses it unless --gold is given.
+const DEFAULT_GOLD =
+  'benchmark-runs/contextbench/phase40/scoring-inputs/Multi-SWE-Bench__c__maintenance__bugfix__5e659108-gold.json';
+// Lanes selected when --lanes is not given.
+const DEFAULT_LANES = ['raw-native', 'codebase-context'];
+// Lowercase file extensions (with leading dot) that isTextLike() accepts as text files.
+const TEXT_EXTENSIONS = new Set([
+  '.c',
+  '.cc',
+  '.cpp',
+  '.cxx',
+  '.h',
+  '.hpp',
+  '.hh',
+  '.go',
+  '.java',
+  '.js',
+  '.jsx',
+  '.ts',
+  '.tsx',
+  '.py',
+  '.rb',
+  '.rs',
+  '.php',
+  '.swift',
+  '.kt',
+  '.scala',
+  '.cs',
+  '.m',
+  '.mm',
+  '.pony',
+  '.md',
+  '.txt',
+  '.json',
+  '.yaml',
+  '.yml',
+  '.toml',
+  '.xml',
+  '.html',
+  '.css',
+  '.scss',
+  '.sql',
+  '.sh',
+  '.bat',
+  '.ps1'
+]);
+// Directory names that collectFiles() never descends into.
+const EXCLUDED_DIRS = new Set([
+  '.git',
+  '.hg',
+  '.svn',
+  '.codebase-context',
+  'node_modules',
+  'vendor',
+  'dist',
+  'build',
+  'target',
+  '__pycache__',
+  '.pytest_cache',
+  '.mypy_cache',
+  '.gradle',
+  '.idea',
+  '.vscode'
+]);
+
+/**
+ * Prints the command-line usage text of the retrieval gate to stdout.
+ */
+function help() {
+  const usageText = `ContextBench retrieval-only diagnostic gate
+
+Usage:
+  node scripts/contextbench-retrieval-gate.mjs --out benchmark-runs/contextbench/phase40/<session_id>
+  node scripts/contextbench-retrieval-gate.mjs --out <dir> --task-id <id> --lanes raw-native,codebase-context,jcodemunch-repomapper,codegraphcontext --score
+
+Options:
+  --out <dir>             Required output session under benchmark-runs/contextbench/phase40/.
+  --task-payloads <file>  Materialized task payloads JSON. Defaults to Phase 40 payloads.
+  --task-id <id>          Frozen ContextBench instance id. Defaults to the first Phase 40 task.
+  --gold <file>           Scorer-only gold JSON. Used only after trajectory artifacts are written.
+  --lanes <csv>           Lanes to run. Supported: raw-native, codebase-context, jcodemunch-repomapper, codegraphcontext.
+  --limit <n>             Max retrieved files per lane. Default: 6.
+  --window <n>            Line window around lexical hits or parsed result spans. Default: 40.
+  --repeat <n>            Repeat index for manifest/run id. Default: 1.
+  --index-timeout-ms <n>  Per-lane indexing timeout. Default: 300000.
+  --query-timeout-ms <n>  Per-lane query timeout. Default: 180000.
+  --evaluator-cwd <dir>   Optional checkout containing contextbench/evaluate.py.
+  --score                 Run official ContextBench evaluator after writing each trajectory.
+
+This is retrieval-only evidence. It does not run an agent, write a patch, execute tests, or prove task success.
+`;
+  console.log(usageText);
+}
+
+/**
+ * Parses command-line arguments into an options object.
+ *
+ * Unspecified options keep their defaults. A value flag that appears last with no value
+ * falls back to an empty string (path/id flags) or its default (numeric flags).
+ *
+ * @param {string[]} argv - Arguments after the script name.
+ * @returns {object} Parsed options: taskPayloads, taskId, gold, lanes, limit, window, repeat,
+ *   indexTimeoutMs, queryTimeoutMs, score, plus out/evaluatorCwd/help when supplied.
+ * @throws {Error} On an unknown flag, or a numeric flag that is not a finite number >= 0.
+ */
+function parseArgs(argv) {
+  const args = {
+    taskPayloads: DEFAULT_PAYLOADS,
+    taskId: DEFAULT_TASK_ID,
+    gold: DEFAULT_GOLD,
+    // Copy so later mutation of the parsed options cannot alter the shared default list.
+    lanes: [...DEFAULT_LANES],
+    limit: 6,
+    window: 40,
+    repeat: 1,
+    indexTimeoutMs: 300_000,
+    queryTimeoutMs: 180_000,
+    score: false
+  };
+  // NaN or negative counts would silently empty result slices downstream, so reject them here.
+  const readNumber = (flag, raw, fallback) => {
+    const parsed = Number(raw ?? fallback);
+    if (!Number.isFinite(parsed) || parsed < 0) {
+      throw new Error(`${flag} expects a non-negative number, got: ${raw}`);
+    }
+    return parsed;
+  };
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (arg === '--help' || arg === '-h') args.help = true;
+    else if (arg === '--out') args.out = argv[++i] ?? '';
+    else if (arg === '--task-payloads') args.taskPayloads = argv[++i] ?? '';
+    else if (arg === '--task-id') args.taskId = argv[++i] ?? '';
+    else if (arg === '--gold') args.gold = argv[++i] ?? '';
+    else if (arg === '--lanes')
+      args.lanes = String(argv[++i] ?? '')
+        .split(',')
+        .map((lane) => lane.trim())
+        .filter(Boolean);
+    else if (arg === '--limit') args.limit = readNumber(arg, argv[++i], '6');
+    else if (arg === '--window') args.window = readNumber(arg, argv[++i], '40');
+    else if (arg === '--repeat') args.repeat = readNumber(arg, argv[++i], '1');
+    else if (arg === '--index-timeout-ms')
+      args.indexTimeoutMs = readNumber(arg, argv[++i], '300000');
+    else if (arg === '--query-timeout-ms')
+      args.queryTimeoutMs = readNumber(arg, argv[++i], '180000');
+    else if (arg === '--evaluator-cwd') args.evaluatorCwd = argv[++i] ?? '';
+    else if (arg === '--score') args.score = true;
+    else throw new Error(`Unknown argument: ${arg}`);
+  }
+  return args;
+}
+
+/**
+ * Serializes a value to JSON text with object keys emitted in sorted order at every depth,
+ * so equal data yields equal text regardless of key insertion order.
+ *
+ * Members that JSON cannot represent (undefined, functions, symbols) are handled the way
+ * JSON.stringify handles them: written as null inside arrays and omitted from objects.
+ *
+ * @param {*} value - Value to serialize.
+ * @returns {string|undefined} JSON text, or undefined when the top-level value itself has no
+ *   JSON representation.
+ */
+function stableStringify(value) {
+  if (value === null || typeof value !== 'object') return JSON.stringify(value);
+  if (Array.isArray(value)) {
+    // Joining a raw undefined would leave a hole such as "[1,]", which is not valid JSON.
+    return `[${value.map((item) => stableStringify(item) ?? 'null').join(',')}]`;
+  }
+  const members = [];
+  const sortedEntries = Object.entries(value).sort(([a], [b]) => a.localeCompare(b));
+  for (const [key, item] of sortedEntries) {
+    const encoded = stableStringify(item);
+    // Skip unrepresentable members instead of emitting the literal text "undefined".
+    if (encoded !== undefined) members.push(`${JSON.stringify(key)}:${encoded}`);
+  }
+  return `{${members.join(',')}}`;
+}
+
+/**
+ * Computes the SHA-256 digest of a string encoded as UTF-8.
+ *
+ * @param {string} value - Text to hash.
+ * @returns {string} Lowercase hex digest prefixed with `sha256:`.
+ */
+function sha256Text(value) {
+  const hasher = createHash('sha256');
+  hasher.update(value, 'utf8');
+  const hex = hasher.digest('hex');
+  return `sha256:${hex}`;
+}
+
+/**
+ * Computes the SHA-256 digest of a file's raw bytes.
+ *
+ * @param {string} filePath - File to read.
+ * @returns {string} Lowercase hex digest prefixed with `sha256:`.
+ */
+function sha256File(filePath) {
+  const bytes = readFileSync(filePath);
+  const hex = createHash('sha256').update(bytes).digest('hex');
+  return `sha256:${hex}`;
+}
+
+/**
+ * Reads a UTF-8 file and parses its contents as JSON.
+ *
+ * @param {string} filePath - File to read.
+ * @returns {*} The parsed JSON value.
+ */
+function readJson(filePath) {
+  const raw = readFileSync(filePath, 'utf8');
+  return JSON.parse(raw);
+}
+
+/**
+ * Writes a value as two-space-indented JSON followed by a newline, creating the parent
+ * directory first when it does not exist.
+ *
+ * @param {string} filePath - Destination file.
+ * @param {*} value - Value to serialize.
+ */
+function writeJson(filePath, value) {
+  const parentDir = dirname(filePath);
+  mkdirSync(parentDir, { recursive: true });
+  const serialized = JSON.stringify(value, null, 2);
+  writeFileSync(filePath, `${serialized}\n`, 'utf8');
+}
+
+/**
+ * Writes text to a file as UTF-8, creating the parent directory first when it does not exist.
+ *
+ * @param {string} filePath - Destination file.
+ * @param {string} value - Text to write.
+ */
+function writeText(filePath, value) {
+  const parentDir = dirname(filePath);
+  mkdirSync(parentDir, { recursive: true });
+  writeFileSync(filePath, value, 'utf8');
+}
+
+/**
+ * Normalizes a path string for comparison: backslashes become forward slashes, one leading
+ * `./` is removed, and any leading slashes are removed.
+ *
+ * @param {string} filePath - Path to normalize.
+ * @returns {string} The normalized path.
+ */
+function normalizePath(filePath) {
+  const forwardSlashed = filePath.replace(/\\/g, '/');
+  const withoutDotPrefix = forwardSlashed.replace(/^\.\//, '');
+  return withoutDotPrefix.replace(/^\/+/, '');
+}
+
+/**
+ * Normalizes a path and, when it lies under the repository root, makes it repo-relative.
+ * The root prefix is matched case-insensitively.
+ *
+ * @param {string} repoRoot - Repository checkout root.
+ * @param {string} filePath - Absolute or relative file path.
+ * @returns {string} Normalized path with the root prefix removed when present.
+ */
+function normalizeRepoPath(repoRoot, filePath) {
+  const candidate = normalizePath(filePath);
+  const rootPrefix = `${normalizePath(repoRoot).replace(/\/$/, '')}/`;
+  const isInsideRoot = candidate.toLowerCase().startsWith(rootPrefix.toLowerCase());
+  return isInsideRoot ? candidate.slice(rootPrefix.length) : candidate;
+}
+
+/**
+ * Validates the session output directory and creates it.
+ *
+ * The resolved path must contain `/benchmark-runs/contextbench/phase40/` and must not
+ * contain `/outputs/`.
+ *
+ * @param {string} outDir - Value of --out.
+ * @returns {string} The absolute output directory, created if missing.
+ * @throws {Error} When outDir is empty or violates either location rule.
+ */
+function ensurePhase40Out(outDir) {
+  if (!outDir) throw new Error('--out is required');
+  const absoluteOut = resolve(outDir);
+  const comparable = normalizePath(absoluteOut);
+  const underPhase40 = comparable.includes('/benchmark-runs/contextbench/phase40/');
+  if (!underPhase40) {
+    throw new Error(
+      'retrieval gate output must be under benchmark-runs/contextbench/phase40/<session_id>'
+    );
+  }
+  const underOutputs = comparable.includes('/outputs/');
+  if (underOutputs) {
+    throw new Error('retrieval gate output must not be under outputs/');
+  }
+  mkdirSync(absoluteOut, { recursive: true });
+  return absoluteOut;
+}
+
+/**
+ * Reduces a string to letters, digits, `_`, `.` and `-`: every run of other characters
+ * becomes a single dash, repeated dashes collapse, and one leading and one trailing dash
+ * are stripped.
+ *
+ * @param {string} value - Text to sanitize.
+ * @returns {string} The sanitized text.
+ */
+function sanitize(value) {
+  const dashed = value.replace(/[^a-zA-Z0-9_.-]+/g, '-');
+  const collapsed = dashed.replace(/-+/g, '-');
+  return collapsed.replace(/^-|-$/g, '');
+}
+
+/**
+ * Loads one task from the payload file and resolves its checkout directory.
+ *
+ * Tasks are read from `payload.tasks` when it is an array, otherwise from the
+ * `payload.tasksById` map (each key becomes the task's `instance_id`). A relative
+ * `repo_checkout_path` is resolved against the payload file's directory.
+ *
+ * @param {string} payloadPath - Path to the task payloads JSON.
+ * @param {string} taskId - Instance id to look up.
+ * @returns {object} The task with `repo_checkout_path` replaced by the resolved checkout path.
+ * @throws {Error} When the task is missing, lacks `problem_statement` or
+ *   `repo_checkout_path`, or its checkout directory does not exist.
+ */
+function loadTask(payloadPath, taskId) {
+  const payloadFile = resolve(payloadPath);
+  const payload = readJson(payloadFile);
+  let tasks;
+  if (Array.isArray(payload.tasks)) {
+    tasks = payload.tasks;
+  } else {
+    tasks = Object.entries(payload.tasksById ?? {}).map(([instanceId, value]) => ({
+      instance_id: instanceId,
+      ...value
+    }));
+  }
+  const task = tasks.find((candidate) => candidate.instance_id === taskId);
+  if (!task) throw new Error(`task id not found in payloads: ${taskId}`);
+  const isMaterialized = Boolean(task.problem_statement && task.repo_checkout_path);
+  if (!isMaterialized) {
+    throw new Error(
+      `task ${taskId} is not materialized with problem_statement and repo_checkout_path`
+    );
+  }
+  let checkout = task.repo_checkout_path;
+  if (!isAbsolute(checkout)) {
+    checkout = resolve(dirname(payloadFile), checkout);
+  }
+  if (!existsSync(checkout)) throw new Error(`task checkout does not exist: ${checkout}`);
+  return { ...task, repo_checkout_path: checkout };
+}
+
+/**
+ * Extracts the distinct search tokens of a query, in order of first appearance.
+ *
+ * The query is lowercased; a token is either an identifier-like word of three or more
+ * characters (`[a-z_][a-z0-9_]{2,}`) or an issue-style reference such as `#123`.
+ * Stop words are discarded.
+ *
+ * @param {*} query - Query text (coerced with String()).
+ * @returns {string[]} Unique tokens.
+ */
+function tokenize(query) {
+  const stopWords = new Set([
+    'the',
+    'and',
+    'for',
+    'from',
+    'with',
+    'this',
+    'that',
+    'when',
+    'into',
+    'are',
+    'not',
+    'but',
+    'should',
+    'would',
+    'could',
+    'have',
+    'has',
+    'had',
+    'body',
+    'bodies',
+    'method',
+    'methods'
+  ]);
+  const candidates = String(query).toLowerCase().match(/[a-z_][a-z0-9_]{2,}|#[0-9]+/g) ?? [];
+  const unique = new Set();
+  for (const candidate of candidates) {
+    if (!stopWords.has(candidate)) unique.add(candidate);
+  }
+  return Array.from(unique);
+}
+
+/**
+ * Decides whether a file should be scanned as text, judging by size and extension.
+ *
+ * Files larger than 1,000,000 bytes are rejected. Otherwise a file qualifies when its
+ * lowercased extension is in TEXT_EXTENSIONS, or when it has no extension and is smaller
+ * than 200,000 bytes.
+ *
+ * @param {string} filePath - File path (only the extension is inspected).
+ * @param {{size: number}} stats - Stat result supplying the byte size.
+ * @returns {boolean} True when the file is treated as text.
+ */
+function isTextLike(filePath, stats) {
+  const { size } = stats;
+  if (size > 1_000_000) return false;
+  const extension = extname(filePath).toLowerCase();
+  return TEXT_EXTENSIONS.has(extension) || (extension === '' && size < 200_000);
+}
+
+/**
+ * Walks a directory tree and returns the paths of all text-like regular files.
+ *
+ * Directories named in EXCLUDED_DIRS are not entered. Entries that are neither directories
+ * nor regular files are ignored. Traversal uses an explicit stack.
+ *
+ * @param {string} root - Directory to scan.
+ * @returns {string[]} Paths of files accepted by isTextLike().
+ */
+function collectFiles(root) {
+  const found = [];
+  const pending = [root];
+  while (pending.length > 0) {
+    const currentDir = pending.pop();
+    const entries = readdirSync(currentDir, { withFileTypes: true });
+    for (const entry of entries) {
+      const entryPath = join(currentDir, entry.name);
+      if (entry.isDirectory()) {
+        if (EXCLUDED_DIRS.has(entry.name)) continue;
+        pending.push(entryPath);
+        continue;
+      }
+      if (!entry.isFile()) continue;
+      if (isTextLike(entryPath, statSync(entryPath))) found.push(entryPath);
+    }
+  }
+  return found;
+}
+
+/**
+ * Counts non-overlapping occurrences of a token in a text, scanning left to right.
+ *
+ * @param {string} text - Text to search.
+ * @param {string} token - Substring to count.
+ * @returns {number} Number of occurrences; 0 for an empty token.
+ */
+function countOccurrences(text, token) {
+  // An empty token matches at every position without advancing, which would loop forever.
+  if (token.length === 0) return 0;
+  let count = 0;
+  let index = text.indexOf(token);
+  while (index !== -1) {
+    count += 1;
+    // Resume after the whole match so occurrences never overlap.
+    index = text.indexOf(token, index + token.length);
+  }
+  return count;
+}
+
+/**
+ * Builds a line span centred on a line, clamped to the range 1..totalLines.
+ *
+ * The span reaches `floor(window / 2)` lines (at least one) to each side of the centre.
+ *
+ * @param {number} lineNumber - 1-based centre line.
+ * @param {number} totalLines - Upper bound for the span end.
+ * @param {number} window - Desired total window size in lines.
+ * @returns {{start: number, end: number, full_file: boolean}} The clamped span.
+ */
+function spanAround(lineNumber, totalLines, window) {
+  const halfWindow = Math.floor(window / 2);
+  const radius = Math.max(1, halfWindow);
+  const firstLine = Math.max(1, lineNumber - radius);
+  const lastLine = Math.min(totalLines, lineNumber + radius);
+  return { start: firstLine, end: lastLine, full_file: false };
+}
+
+/**
+ * Sorts spans by start (then end) and merges neighbours that overlap or touch.
+ *
+ * A span is never merged when it or its predecessor is a full-file span or has a null end.
+ * The input array and its span objects are left untouched; the result holds fresh objects.
+ *
+ * @param {Array<{start: number, end: number|null, full_file: boolean}>} spans - Spans of one file.
+ * @returns {Array<{start: number, end: number|null, full_file: boolean}>} Sorted, merged spans.
+ */
+function mergeSpans(spans) {
+  // Copy first: sorting in place and widening `end` would otherwise alter spans that the
+  // caller still reads elsewhere.
+  const ordered = spans
+    .map((span) => ({ ...span }))
+    .sort(
+      (a, b) =>
+        a.start - b.start || (a.end ?? Number.MAX_SAFE_INTEGER) - (b.end ?? Number.MAX_SAFE_INTEGER)
+    );
+  const merged = [];
+  for (const span of ordered) {
+    const previous = merged[merged.length - 1];
+    const mergeable =
+      previous !== undefined &&
+      !previous.full_file &&
+      !span.full_file &&
+      previous.end !== null &&
+      span.end !== null;
+    // `+ 1` also joins spans that are directly adjacent, not only overlapping ones.
+    if (mergeable && span.start <= previous.end + 1) {
+      previous.end = Math.max(previous.end, span.end);
+    } else {
+      merged.push(span);
+    }
+  }
+  return merged;
+}
+
+/**
+ * Builds the trajectory record for one lane's retrieval result.
+ *
+ * Retrieved spans are grouped by repo-relative file path and merged per file. The record
+ * carries a single predicted step and an empty `model_patch`.
+ *
+ * @param {object} task - Task with instance_id, repo_url, base_commit and repo_checkout_path.
+ * @param {{items: Array<{file: string, spans: object[]}>}} retrieval - Lane retrieval result.
+ * @returns {object} Trajectory with instance_id, repo_url, commit, traj_data and model_patch.
+ */
+function buildTrajectory(task, retrieval) {
+  const spansByFile = {};
+  retrieval.items.forEach((item) => {
+    const repoRelative = normalizeRepoPath(task.repo_checkout_path, item.file);
+    const known = spansByFile[repoRelative] ?? [];
+    spansByFile[repoRelative] = mergeSpans([...known, ...item.spans]);
+  });
+  const sortedFiles = Object.keys(spansByFile).sort();
+  const step = { files: sortedFiles, spans: spansByFile };
+  return {
+    instance_id: task.instance_id,
+    repo_url: task.repo_url,
+    commit: task.base_commit,
+    traj_data: {
+      pred_steps: [step],
+      pred_files: sortedFiles,
+      pred_spans: spansByFile
+    },
+    model_patch: ''
+  };
+}
+
+/**
+ * Builds the structured answer record for a retrieval-only lane result.
+ *
+ * Each retrieved span becomes one evidence entry; a span with a null end is reported with
+ * its start as the end. Confidence is 'medium' when anything was retrieved, else 'low'.
+ *
+ * @param {object} task - Task supplying repo_checkout_path.
+ * @param {{laneId: string, method: string, items: object[]}} retrieval - Lane retrieval result.
+ * @returns {object} Answer with evidence, filesReferenced and fixed diagnostic flags.
+ */
+function buildStructuredAnswer(task, retrieval) {
+  const { items } = retrieval;
+  const toRepoPath = (file) => normalizeRepoPath(task.repo_checkout_path, file);
+  const evidence = [];
+  for (const item of items) {
+    for (const span of item.spans) {
+      evidence.push({
+        file: toRepoPath(item.file),
+        lineRange: { start: span.start, end: span.end ?? span.start },
+        reason: item.reason
+      });
+    }
+  }
+  const hasItems = items.length > 0;
+  return {
+    answer: {
+      diagnosticRetrievalOnly: true,
+      laneId: retrieval.laneId,
+      method: retrieval.method,
+      itemCount: items.length
+    },
+    confidence: hasItems ? 'medium' : 'low',
+    evidence,
+    filesReferenced: items.map((item) => toRepoPath(item.file)),
+    symbolsReferenced: [],
+    unsupportedClaims: ['retrieval_only_diagnostic_not_task_success'],
+    readyToEdit: false
+  };
+}
+
+/**
+ * Runs the 'raw-native' lane: a lexical scan of the checkout for problem-statement tokens.
+ *
+ * Every file returned by collectFiles() is scored by counting token occurrences; the
+ * top `options.limit` files are returned, each with one span around its best-matching line.
+ *
+ * @param {object} task - Task supplying problem_statement and repo_checkout_path.
+ * @param {{limit: number, window: number}} options - Result cap and span window size.
+ * @returns {object} Lane result with laneId, method, status ('completed' or 'no_answer'),
+ *   setup, trace and items.
+ */
+function runRawNative(task, options) {
+  const tokens = tokenize(task.problem_statement);
+  const files = collectFiles(task.repo_checkout_path);
+  const scored = [];
+  // The clock starts after tokenizing and file collection, so durationMs covers scoring only.
+  const startedAt = Date.now();
+  for (const filePath of files) {
+    let content;
+    try {
+      content = readFileSync(filePath, 'utf8');
+    } catch {
+      // Files that cannot be read are left out of the ranking.
+      continue;
+    }
+    const relativePath = normalizePath(relative(task.repo_checkout_path, filePath));
+    const lowerPath = relativePath.toLowerCase();
+    const lines = content.split(/\r?\n/);
+    let score = 0;
+    // A token hit in the file path counts eight times as much as a hit in the content.
+    for (const token of tokens) {
+      score += countOccurrences(lowerPath, token) * 8;
+    }
+    let bestLine = 1;
+    let bestLineScore = 0;
+    for (let index = 0; index < lines.length; index += 1) {
+      const lowerLine = lines[index].toLowerCase();
+      let lineScore = 0;
+      for (const token of tokens) lineScore += countOccurrences(lowerLine, token);
+      // Strict comparison keeps the earliest line among equally scored lines.
+      if (lineScore > bestLineScore) {
+        bestLineScore = lineScore;
+        bestLine = index + 1;
+      }
+      score += lineScore;
+    }
+    if (score > 0) {
+      scored.push({
+        file: relativePath,
+        score,
+        bestLine,
+        totalLines: lines.length,
+        reason: `lexical token match: ${tokens.join(', ')}`
+      });
+    }
+  }
+  // Highest score first; ties are broken by path so the ranking is deterministic.
+  scored.sort((a, b) => b.score - a.score || a.file.localeCompare(b.file));
+  const items = scored.slice(0, options.limit).map((item) => ({
+    file: item.file,
+    score: item.score,
+    spans: [spanAround(item.bestLine, item.totalLines, options.window)],
+    reason: item.reason
+  }));
+  return {
+    laneId: 'raw-native',
+    method: 'deterministic lexical repository scan over problem statement tokens',
+    status: items.length > 0 ? 'completed' : 'no_answer',
+    // This lane has no setup or index step, so both are reported as not required.
+    setup: {
+      setupCommand: 'none',
+      indexCommand: 'none',
+      setupStatus: 'not_required',
+      indexStatus: 'not_required',
+      setupDurationMs: 0,
+      indexDurationMs: 0
+    },
+    trace: {
+      tokens,
+      filesScanned: files.length,
+      scoredFiles: scored.length,
+      durationMs: Date.now() - startedAt,
+      topScores: scored.slice(0, options.limit).map(({ file, score, bestLine }) => ({
+        file,
+        score,
+        bestLine
+      }))
+    },
+    items
+  };
+}
+
+/**
+ * Runs a command synchronously with empty stdin and captures its outcome.
+ *
+ * @param {string} command - Executable to run.
+ * @param {string[]} args - Command arguments.
+ * @param {{cwd?: string, env?: object, timeoutMs?: number}} options - Working directory,
+ *   environment and timeout (120000 ms when timeoutMs is null or undefined).
+ * @returns {object} command, args, cwd, numeric exit `status` (or null), `signal`, `error`
+ *   message (or null), captured `stdout` and `stderr`, and `durationMs`.
+ */
+function runCommand(command, args, options) {
+  const began = Date.now();
+  const child = spawnSync(command, args, {
+    cwd: options.cwd,
+    env: options.env,
+    encoding: 'utf8',
+    input: '',
+    timeout: options.timeoutMs ?? 120_000
+  });
+  const exitCode = typeof child.status === 'number' ? child.status : null;
+  const signal = child.signal ?? null;
+  const error = child.error?.message ?? null;
+  const stdout = child.stdout ?? '';
+  const stderr = child.stderr ?? '';
+  return {
+    command,
+    args,
+    cwd: options.cwd,
+    status: exitCode,
+    signal,
+    error,
+    stdout,
+    stderr,
+    durationMs: Date.now() - began
+  };
+}
+
+/**
+ * Runs a batch of MCP `tools/call` requests against a freshly spawned jCodeMunch stdio server.
+ *
+ * Sends `initialize` (id 1); once its response arrives, sends `notifications/initialized`
+ * and one `tools/call` per entry of `calls` (ids 2, 3, ...). The promise resolves when every
+ * call has a response, the child closes, spawning or writing fails, or `timeoutMs` elapses.
+ * Failures are reported through the `error` field of the resolved value.
+ *
+ * @param {Array<{name: string, arguments: object}>} calls - Tool invocations to issue.
+ * @param {number} timeoutMs - Deadline for the whole exchange, in milliseconds.
+ * @returns {Promise<object>} command, args, cwd, exit `status`, `error`, captured stdout and
+ *   stderr, `durationMs`, and `messages` (the response per call in order, null when missing).
+ */
+function runJCodeMunchMcpCalls(calls, timeoutMs) {
+  // The executor parameter is named `settle` so it does not shadow `resolve` from node:path.
+  return new Promise((settle) => {
+    const startedAt = Date.now();
+    const command = 'python';
+    const args = [
+      '-m',
+      'jcodemunch_mcp.server',
+      'serve',
+      '--transport',
+      'stdio',
+      '--log-level',
+      'ERROR'
+    ];
+    const child = spawn(command, args, {
+      cwd: process.cwd(),
+      env: { ...process.env, JCODEMUNCH_USE_AI_SUMMARIES: 'false' },
+      stdio: ['pipe', 'pipe', 'pipe']
+    });
+    const messages = new Map();
+    let stdout = '';
+    let stderr = '';
+    let lineBuffer = '';
+    let settled = false;
+    let timer = null;
+    const finish = (status, error = null) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      if (!child.killed) child.kill();
+      settle({
+        command,
+        args,
+        cwd: process.cwd(),
+        status,
+        error,
+        stdout,
+        stderr,
+        durationMs: Date.now() - startedAt,
+        messages: calls.map((call, index) => messages.get(index + 2) ?? null)
+      });
+    };
+    const send = (message) => {
+      // Skip writes once finished or when the pipe is gone; the close/error/timeout paths
+      // then decide the outcome.
+      if (settled || !child.stdin?.writable) return;
+      child.stdin.write(`${JSON.stringify(message)}\n`);
+    };
+    const maybeComplete = () => {
+      if (calls.every((_, index) => messages.has(index + 2))) finish(0);
+    };
+    const handleMessage = (message) => {
+      if (message.id === 1) {
+        send({ jsonrpc: '2.0', method: 'notifications/initialized', params: {} });
+        calls.forEach((call, index) => {
+          send({
+            jsonrpc: '2.0',
+            id: index + 2,
+            method: 'tools/call',
+            params: { name: call.name, arguments: call.arguments }
+          });
+        });
+        // With no calls there is nothing to wait for, so finish now instead of timing out.
+        maybeComplete();
+        return;
+      }
+      if (typeof message.id === 'number' && message.id >= 2) {
+        messages.set(message.id, message);
+        maybeComplete();
+      }
+    };
+    // Decode at the stream level so a multi-byte character split across two chunks is
+    // reassembled rather than corrupted.
+    child.stdout?.setEncoding('utf8');
+    child.stderr?.setEncoding('utf8');
+    child.stdout?.on('data', (chunk) => {
+      const text = chunk.toString();
+      stdout += text;
+      lineBuffer += text;
+      const lines = lineBuffer.split(/\r?\n/);
+      // The last piece may be an incomplete line; keep it for the next chunk.
+      lineBuffer = lines.pop() ?? '';
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        try {
+          handleMessage(JSON.parse(line));
+        } catch (error) {
+          stderr += `\nfailed to parse MCP stdout line: ${error instanceof Error ? error.message : String(error)}`;
+        }
+      }
+    });
+    child.stderr?.on('data', (chunk) => {
+      stderr += chunk.toString();
+    });
+    // Without a listener, a failed write to a dead server surfaces as an unhandled stream
+    // error that terminates the whole process.
+    child.stdin?.on('error', (error) => finish(null, error.message));
+    child.on('error', (error) => finish(null, error.message));
+    child.on('close', (status) => {
+      if (!settled) finish(status);
+    });
+    timer = setTimeout(
+      () => finish(null, `jCodeMunch MCP timed out after ${timeoutMs}ms`),
+      timeoutMs
+    );
+    send({
+      jsonrpc: '2.0',
+      id: 1,
+      method: 'initialize',
+      params: {
+        protocolVersion: '2024-11-05',
+        capabilities: {},
+        clientInfo: { name: 'contextbench-retrieval-gate', version: '0.0.0' }
+      }
+    });
+  });
+}
+
+/**
+ * Chooses how to invoke the codebase-context CLI.
+ *
+ * Uses the local build (`dist/index.js` under the current working directory, run with the
+ * current Node executable) when it exists; otherwise falls back to `npm exec`.
+ *
+ * @returns {{command: string, prefixArgs: string[], source: string}} Executable, leading
+ *   arguments, and which variant was chosen ('local-dist' or 'npm-exec').
+ */
+function codebaseContextBaseCommand() {
+  const localEntry = resolve('dist/index.js');
+  if (!existsSync(localEntry)) {
+    const isWindows = process.platform === 'win32';
+    return {
+      command: isWindows ? 'npm.cmd' : 'npm',
+      prefixArgs: ['exec', '--', 'codebase-context'],
+      source: 'npm-exec'
+    };
+  }
+  return { command: process.execPath, prefixArgs: [localEntry], source: 'local-dist' };
+}
+
+/**
+ * Parses the trimmed stdout of a command result as JSON without throwing.
+ *
+ * @param {{stdout: string}} commandResult - Result carrying the captured stdout.
+ * @returns {{value: *, error: string|null}} The parsed value with a null error, or a null
+ *   value with 'empty_stdout' or the parser's message.
+ */
+function parseJsonOutput(commandResult) {
+  const body = commandResult.stdout.trim();
+  if (body.length === 0) return { value: null, error: 'empty_stdout' };
+  let value;
+  try {
+    value = JSON.parse(body);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    return { value: null, error: reason };
+  }
+  return { value, error: null };
+}
+
+/**
+ * Splits a search-result location into a file path and a line span.
+ *
+ * Recognised forms are `path:start-end`, `path:line`, and a bare `path`, which yields a
+ * full-file span with a null end.
+ *
+ * @param {*} value - Location text (null and undefined are treated as an empty string).
+ * @returns {{file: string, span: {start: number, end: number|null, full_file: boolean}}}
+ */
+function parseSearchFile(value) {
+  const location = normalizePath(String(value ?? ''));
+  const ranged = /^(.*):(\d+)-(\d+)$/.exec(location);
+  if (ranged !== null) {
+    const [, file, first, last] = ranged;
+    return { file, span: { start: Number(first), end: Number(last), full_file: false } };
+  }
+  const single = /^(.*):(\d+)$/.exec(location);
+  if (single !== null) {
+    const [, file, lineText] = single;
+    const line = Number(lineText);
+    return { file, span: { start: line, end: line, full_file: false } };
+  }
+  return { file: location, span: { start: 1, end: null, full_file: true } };
+}
+
+/**
+ * Widens a line span by `floor(window / 2)` lines on each side, never starting before line 1.
+ * Full-file spans and spans with a null end are returned unchanged.
+ *
+ * @param {{start: number, end: number|null, full_file: boolean}} span - Span to widen.
+ * @param {number} window - Desired total window size in lines.
+ * @returns {{start: number, end: number|null, full_file: boolean}} The widened span.
+ */
+function expandSpan(span, window) {
+  const isOpenEnded = span.full_file || span.end === null;
+  if (isOpenEnded) return span;
+  const radius = Math.max(0, Math.floor(window / 2));
+  const { start, end } = span;
+  const widenedStart = Math.max(1, start - radius);
+  const widenedEnd = Math.max(end, end + radius);
+  return { start: widenedStart, end: widenedEnd, full_file: false };
+}
+
+/**
+ * Clamps a span's end to the number of lines in the file it refers to.
+ *
+ * Full-file spans, spans with a null end, and spans whose file cannot be read are returned
+ * unchanged.
+ *
+ * @param {string} repoRoot - Repository checkout root.
+ * @param {string} file - File path relative to repoRoot.
+ * @param {{start: number, end: number|null, full_file: boolean}} span - Span to clamp.
+ * @returns {{start: number, end: number|null, full_file: boolean}} The clamped span.
+ */
+function capSpanToFile(repoRoot, file, span) {
+  if (span.full_file || span.end === null) return span;
+  let content;
+  try {
+    content = readFileSync(join(repoRoot, file), 'utf8');
+  } catch {
+    // Best effort: an unreadable or missing file leaves the span as it was.
+    return span;
+  }
+  const pieces = content.split(/\r?\n/);
+  // A file ending in a newline splits into one extra empty piece that is not a real line;
+  // counting it would let the span end one line past the end of the file.
+  const hasTrailingNewline = pieces.length > 1 && pieces[pieces.length - 1] === '';
+  const lineCount = hasTrailingNewline ? pieces.length - 1 : pieces.length;
+  return { ...span, end: Math.min(lineCount, span.end) };
+}
+
+/**
+ * Runs the 'codebase-context' lane: reindexes the checkout with the codebase-context CLI,
+ * then searches it with the task's problem statement as the query.
+ *
+ * Both CLI calls run from the current working directory with CODEBASE_ROOT pointing at the
+ * checkout. Search results are grouped per file; each group keeps its highest score and
+ * collects the spans of all its results.
+ *
+ * @param {object} task - Task supplying problem_statement and repo_checkout_path.
+ * @param {{limit: number, window: number, indexTimeoutMs: number, queryTimeoutMs: number}} options
+ *   Result cap, span window, and timeouts for the reindex and search commands.
+ * @returns {object} Lane result with laneId, method, status ('completed', 'no_answer',
+ *   'index_failed', 'tool_error' or 'invalid_schema'), setup, trace and items.
+ */
+function runCodebaseContext(task, options) {
+  const base = codebaseContextBaseCommand();
+  const env = {
+    ...process.env,
+    CODEBASE_ROOT: task.repo_checkout_path,
+    CODEBASE_CONTEXT_ASCII: '1'
+  };
+  const reindex = runCommand(base.command, [...base.prefixArgs, 'reindex', '--json'], {
+    cwd: process.cwd(),
+    env,
+    timeoutMs: options.indexTimeoutMs
+  });
+  // Any outcome other than exit code 0 (including a null status) counts as an index failure.
+  if (reindex.status !== 0) {
+    return {
+      laneId: 'codebase-context',
+      method: 'codebase-context CLI reindex/search JSON output',
+      status: 'index_failed',
+      setup: {
+        setupCommand: base.source,
+        indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`,
+        setupStatus: 'completed',
+        indexStatus: 'index_failed',
+        setupDurationMs: 0,
+        indexDurationMs: reindex.durationMs
+      },
+      trace: { commandSource: base.source, reindex, search: null, parseError: null },
+      items: []
+    };
+  }
+  const searchArgs = [
+    ...base.prefixArgs,
+    'search',
+    '--query',
+    task.problem_statement,
+    '--intent',
+    'explore',
+    '--limit',
+    String(options.limit),
+    '--json'
+  ];
+  const search = runCommand(base.command, searchArgs, {
+    cwd: process.cwd(),
+    env,
+    timeoutMs: options.queryTimeoutMs
+  });
+  const parsed = parseJsonOutput(search);
+  if (search.status !== 0 || parsed.error) {
+    return {
+      laneId: 'codebase-context',
+      method: 'codebase-context CLI reindex/search JSON output',
+      // Exit code 0 with unparseable stdout is a schema problem; anything else is a tool error.
+      status: search.status === 0 ? 'invalid_schema' : 'tool_error',
+      setup: {
+        setupCommand: base.source,
+        indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`,
+        setupStatus: 'completed',
+        indexStatus: 'completed',
+        setupDurationMs: 0,
+        indexDurationMs: reindex.durationMs
+      },
+      trace: { commandSource: base.source, reindex, search, parseError: parsed.error },
+      items: []
+    };
+  }
+  // A missing or non-array `results` field is treated as an empty result list.
+  const rawResults = Array.isArray(parsed.value?.results) ? parsed.value.results : [];
+  const itemsByFile = new Map();
+  for (const result of rawResults.slice(0, options.limit)) {
+    const parsedFile = parseSearchFile(result.file);
+    if (!parsedFile.file) continue;
+    // The first result seen for a file supplies the reason text for the whole group.
+    const existing = itemsByFile.get(parsedFile.file) ?? {
+      file: parsedFile.file,
+      score: Number(result.score ?? 0),
+      spans: [],
+      reason: result.relevanceReason || result.summary || 'codebase-context search result'
+    };
+    existing.score = Math.max(existing.score, Number(result.score ?? 0));
+    // Widen the reported span by the window, then clamp its end to the file's length.
+    existing.spans.push(
+      capSpanToFile(
+        task.repo_checkout_path,
+        parsedFile.file,
+        expandSpan(parsedFile.span, options.window)
+      )
+    );
+    itemsByFile.set(parsedFile.file, existing);
+  }
+  // Highest score first; ties are broken by path so the ordering is deterministic.
+  const items = [...itemsByFile.values()].sort(
+    (a, b) => b.score - a.score || a.file.localeCompare(b.file)
+  );
+  return {
+    laneId: 'codebase-context',
+    method: 'codebase-context CLI reindex/search JSON output',
+    status: items.length > 0 ? 'completed' : 'no_answer',
+    setup: {
+      setupCommand: base.source,
+      indexCommand: `${base.command} ${[...base.prefixArgs, 'reindex', '--json'].join(' ')}`,
+      setupStatus: 'completed',
+      indexStatus: 'completed',
+      setupDurationMs: 0,
+      indexDurationMs: reindex.durationMs
+    },
+    trace: {
+      commandSource: base.source,
+      reindex,
+      search,
+      parseError: null,
+      searchQuality: parsed.value?.searchQuality ?? null,
+      totalResults: parsed.value?.totalResults ?? rawResults.length,
+      rawResultFiles: rawResults.map((result) => result.file)
+    },
+    items
+  };
+}
+
+/**
+ * Extracts and parses the JSON carried by the first text part of an MCP tool response.
+ *
+ * @param {object|null|undefined} message - Response message; its `result.content` is searched
+ *   for a part whose `type` is 'text'.
+ * @returns {{value: *, error: string|null}} The parsed value with a null error, or a null
+ *   value with 'missing_text_content' or the parser's message.
+ */
+function parseJCodeMunchToolJson(message) {
+  const textPart = message?.result?.content?.find?.((part) => part?.type === 'text');
+  const text = textPart?.text;
+  if (!text) return { value: null, error: 'missing_text_content' };
+  let value;
+  try {
+    value = JSON.parse(text);
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    return { value: null, error: reason };
+  }
+  return { value, error: null };
+}
+
+/**
+ * Runs the 'jcodemunch-repomapper' lane: indexes the checkout through the jCodeMunch MCP
+ * server, then issues one `search_symbols` call per query from codeGraphContextQueries().
+ *
+ * Indexing and searching each use their own runJCodeMunchMcpCalls() exchange. Matches are
+ * grouped per file, scored by query and result rank, and the top `options.limit` files are
+ * returned.
+ *
+ * @param {object} task - Task supplying problem_statement and repo_checkout_path.
+ * @param {{limit: number, window: number, indexTimeoutMs: number, queryTimeoutMs: number}} options
+ *   Result cap, span window, and timeouts for the index and search exchanges.
+ * @returns {Promise<object>} Lane result with laneId, method, status ('completed',
+ *   'no_answer' or 'index_failed'), setup, trace and items.
+ */
+async function runJCodeMunch(task, options) {
+  const indexCall = {
+    name: 'index_folder',
+    arguments: {
+      path: task.repo_checkout_path,
+      use_ai_summaries: false,
+      incremental: true,
+      follow_symlinks: false,
+      extra_ignore_patterns: ['.codebase-context/**']
+    }
+  };
+  const index = await runJCodeMunchMcpCalls([indexCall], options.indexTimeoutMs);
+  const indexMessage = index.messages[0];
+  const indexParsed = parseJCodeMunchToolJson(indexMessage);
+  // The `repo` value from the index response is passed to every search call below.
+  const repo = indexParsed.value?.repo;
+  if (index.status !== 0 || index.error || !repo) {
+    return {
+      laneId: 'jcodemunch-repomapper',
+      method: 'jCodeMunch MCP index_folder plus search_symbols over deterministic problem tokens',
+      status: 'index_failed',
+      // NOTE(review): setupCommand is recorded as text only; this function does not run it.
+      setup: {
+        setupCommand: 'python -m jcodemunch_mcp.server --version',
+        indexCommand: 'MCP index_folder',
+        setupStatus: 'completed',
+        indexStatus: 'index_failed',
+        setupDurationMs: 0,
+        indexDurationMs: index.durationMs
+      },
+      trace: { index, indexParseError: indexParsed.error, repo: repo ?? null, searches: [] },
+      items: []
+    };
+  }
+
+  const searchCalls = codeGraphContextQueries(task.problem_statement).map((query) => ({
+    name: 'search_symbols',
+    arguments: {
+      repo,
+      query,
+      max_results: options.limit,
+      detail_level: 'compact',
+      semantic: false
+    }
+  }));
+  const search = await runJCodeMunchMcpCalls(searchCalls, options.queryTimeoutMs);
+  const itemsByFile = new Map();
+  const searches = search.messages.map((message, index) => {
+    const parsed = parseJCodeMunchToolJson(message);
+    const query = searchCalls[index]?.arguments?.query ?? '';
+    const results = Array.isArray(parsed.value?.results) ? parsed.value.results : [];
+    for (const [resultIndex, result] of results.entries()) {
+      const file = normalizeRepoPath(task.repo_checkout_path, String(result.file ?? ''));
+      const line = Number(result.line ?? 1);
+      // Skip results with no file, a path starting with '..', or an unusable line number.
+      if (!file || file.startsWith('..') || !Number.isFinite(line)) continue;
+      // The first match seen for a file supplies the reason text for the whole group.
+      const existing = itemsByFile.get(file) ?? {
+        file,
+        score: 0,
+        spans: [],
+        reason: `jCodeMunch search_symbols match for problem-derived query "${query}"`
+      };
+      // Earlier queries weigh more; the result's rank within a query is a small tie-breaker.
+      existing.score += 1 / (index + 1 + resultIndex / 100);
+      // The file length is unknown here, so the span is left unclamped until capSpanToFile().
+      existing.spans.push(spanAround(line, Number.MAX_SAFE_INTEGER, options.window));
+      itemsByFile.set(file, existing);
+    }
+    return { query, message, parsed, resultCount: results.length };
+  });
+  const items = [...itemsByFile.values()]
+    .map((item) => ({
+      ...item,
+      spans: item.spans.map((span) => capSpanToFile(task.repo_checkout_path, item.file, span))
+    }))
+    .sort((a, b) => b.score - a.score || a.file.localeCompare(b.file))
+    .slice(0, options.limit);
+  return {
+    laneId: 'jcodemunch-repomapper',
+    method: 'jCodeMunch MCP index_folder plus search_symbols over deterministic problem tokens',
+    status: items.length > 0 ? 'completed' : 'no_answer',
+    setup: {
+      setupCommand: 'python -m jcodemunch_mcp.server --version',
+      indexCommand: 'MCP index_folder',
+      setupStatus: 'completed',
+      indexStatus: 'completed',
+      setupDurationMs: 0,
+      indexDurationMs: index.durationMs
+    },
+    trace: {
+      repo,
+      index,
+      indexSummary: indexParsed.value,
+      search,
+      queryCount: searches.length,
+      searches,
+      // These are the files kept after ranking and truncation, not every file the tool returned.
+      rawResultFiles: items.map((item) => item.file)
+    },
+    items
+  };
+}
+
+/**
+ * Parses a box-drawn table (cells separated by `│`) into name/type/file/line records.
+ *
+ * The first three cells of a row are read as name, type and location. A row with an empty
+ * name continues the previous row's location, which may be wrapped over several lines.
+ * Header rows are skipped, as are rows whose joined location does not end in `:<line>`.
+ *
+ * @param {*} stdout - Raw tool output (null and undefined are treated as an empty string).
+ * @returns {Array<{name: string, type: string, file: string, line: number}>} Parsed rows.
+ */
+function parseCodeGraphContextTable(stdout) {
+  const rows = [];
+  let current = null;
+  const tableLines = String(stdout ?? '')
+    .split(/\r?\n/)
+    .filter((line) => line.includes('│'));
+  for (const tableLine of tableLines) {
+    // Drop the pieces outside the outer borders.
+    const cells = tableLine.split('│').slice(1, -1);
+    if (cells.length < 3) continue;
+    const [name, type, locationPart] = cells.slice(0, 3).map((cell) => cell.trim());
+    const isHeader = name === 'Name' || type === 'Type' || locationPart === 'Location';
+    if (isHeader) continue;
+    if (name) {
+      if (current) rows.push(current);
+      current = { name, type, locationParts: locationPart ? [locationPart] : [] };
+      continue;
+    }
+    if (current && locationPart) current.locationParts.push(locationPart);
+  }
+  if (current) rows.push(current);
+  const parsed = [];
+  for (const row of rows) {
+    const match = row.locationParts.join('').match(/^(.*):(\d+)$/);
+    if (!match) continue;
+    parsed.push({
+      name: row.name,
+      type: row.type,
+      file: normalizePath(match[1]),
+      line: Number(match[2])
+    });
+  }
+  return parsed;
+}
+
+/**
+ * Derives the ordered list of search queries for a problem statement.
+ *
+ * The first query is the whole statement with whitespace collapsed; the rest are its
+ * tokens (excluding `#123`-style references), longest first and alphabetical among equal
+ * lengths. Empty entries are dropped.
+ *
+ * @param {string} problemStatement - Task problem statement.
+ * @returns {string[]} Queries in the order they should be tried.
+ */
+function codeGraphContextQueries(problemStatement) {
+  const wordTokens = tokenize(problemStatement).filter((token) => !token.startsWith('#'));
+  const longestFirst = wordTokens
+    .slice()
+    .sort((a, b) => b.length - a.length || a.localeCompare(b));
+  const collapsedStatement = problemStatement.replace(/\s+/g, ' ').trim();
+  return [collapsedStatement, ...longestFirst].filter(Boolean);
+}
+
+function runCodeGraphContext(task, options) {
+  const env = {
+    ...process.env,
+    PYTHONUTF8: '1',
+    PYTHONIOENCODING: 'utf-8'
+  };
+  const index = runCommand('python', ['-m', 'codegraphcontext', 'index', task.repo_checkout_path], {
+    cwd: process.cwd(),
+    env,
+    timeoutMs: options.indexTimeoutMs
+  });
+  if (index.status !== 0) {
+    return {
+      laneId: 'codegraphcontext',
+      method: 'CodeGraphContext CLI index plus find content over deterministic problem tokens',
+      status: 'index_failed',
+      setup: {
+        setupCommand: 'python -m codegraphcontext --version',
+        indexCommand: `python -m codegraphcontext index ${task.repo_checkout_path}`,
+        setupStatus: 'completed',
+        indexStatus: 'index_failed',
+        setupDurationMs: 0,
+        indexDurationMs: index.durationMs
+      },
+      trace: { index, queries: [], parseError: null },
+      items: []
+    };
+  }
+
+  const queries = [];
+  const itemsByFile = new Map();
+  for (const query of codeGraphContextQueries(task.problem_statement)) {
+    if (itemsByFile.size >= options.limit) break;
+    const queryResult = runCommand('python', ['-m', 'codegraphcontext', 'find', 'content', query], {
+      cwd: process.cwd(),
+      env,
+      timeoutMs: options.queryTimeoutMs
+    });
+    const parsedRows =
+      queryResult.status === 0
+        ? parseCodeGraphContextTable(`${queryResult.stdout}\n${queryResult.stderr}`)
+        : [];
+    queries.push({ query, result: queryResult, parsedRows });
+    for (const [indexInResult, row] of parsedRows.entries()) {
+      const file = normalizeRepoPath(task.repo_checkout_path, row.file);
+      if (!file || file.startsWith('..')) continue;
+      const existing = itemsByFile.get(file) ?? {
+        file,
+        score: 0,
+        spans: [],
+        reason: `CodeGraphContext content match for problem-derived query "${query}"`
+      };
+      existing.score += 1 / (queries.length + indexInResult / 100);
+      existing.spans.push(spanAround(row.line, Number.MAX_SAFE_INTEGER, options.window));
+      itemsByFile.set(file, existing);
+      if (itemsByFile.size >= options.limit) break;
+    }
+  }
+
+  const items = [...itemsByFile.values()]
+    .map((item) => ({
+      ...item,
+      spans: item.spans.map((span) => capSpanToFile(task.repo_checkout_path, item.file, span))
+    }))
+    .sort((a, b) => b.score - a.score || a.file.localeCompare(b.file))
+    .slice(0, options.limit);
+  return {
+    laneId: 'codegraphcontext',
+    method: 'CodeGraphContext CLI index plus find content over deterministic problem tokens',
+    status: items.length > 0 ? 'completed' : 'no_answer',
+    setup: {
+      setupCommand: 'python -m codegraphcontext --version',
+      indexCommand: `python -m codegraphcontext index ${task.repo_checkout_path}`,
+      setupStatus: 'completed',
+      indexStatus: 'completed',
+      setupDurationMs: 0,
+      indexDurationMs: index.durationMs
+    },
+    trace: {
+      index,
+      queryCount: queries.length,
+      queries,
+      rawResultFiles: items.map((item) => item.file)
+    },
+    items
+  };
+}
+
+async function runLane(lane, task, options) {
+  if (lane === 'raw-native') return runRawNative(task, options);
+  if (lane === 'codebase-context') return runCodebaseContext(task, options);
+  if (lane === 'jcodemunch-repomapper') return runJCodeMunch(task, options);
+  if (lane === 'codegraphcontext') return runCodeGraphContext(task, options);
+  throw new Error(`unsupported retrieval lane: ${lane}`);
+}
+
+function hasOfficialEvaluator(cwd) {
+  return existsSync(join(cwd, 'contextbench', 'evaluate.py'));
+}
+
+function resolveEvaluatorCwd(args) {
+  if (args.evaluatorCwd) {
+    const resolved = resolve(args.evaluatorCwd);
+    if (!hasOfficialEvaluator(resolved)) {
+      throw new Error(`--evaluator-cwd does not contain contextbench/evaluate.py: ${resolved}`);
+    }
+    return resolved;
+  }
+  const moduleCheck = runCommand('python', ['-m', 'contextbench.evaluate', '--help'], {
+    cwd: process.cwd(),
+    env: process.env,
+    timeoutMs: 30_000
+  });
+  if (moduleCheck.status === 0) return process.cwd();
+  const candidates = [
+    'benchmark-runs/contextbench/phase40/evaluator-probe-20260427/ContextBench-official'
+  ];
+  for (const candidate of candidates) {
+    const resolved = resolve(candidate);
+    if (hasOfficialEvaluator(resolved)) return resolved;
+  }
+  throw new Error(
+    'official evaluator unavailable; pass --evaluator-cwd <ContextBench checkout> or run the evaluator probe first'
+  );
+}
+
+function scoreTrajectory(goldPath, trajectoryPath, outputPath, evaluatorCwd, cachePath) {
+  const args = [
+    '-m',
+    'contextbench.evaluate',
+    '--gold',
+    goldPath,
+    '--pred',
+    trajectoryPath,
+    '--cache',
+    cachePath,
+    '--out',
+    outputPath
+  ];
+  const result = runCommand('python', args, {
+    cwd: evaluatorCwd,
+    env: process.env,
+    timeoutMs: 120_000
+  });
+  let metrics = null;
+  if (result.status === 0 && existsSync(outputPath)) {
+    const firstLine = readFileSync(outputPath, 'utf8').trim().split('\n')[0];
+    if (firstLine) {
+      try {
+        metrics = JSON.parse(firstLine);
+      } catch {
+        metrics = null;
+      }
+    }
+  }
+  return {
+    status: result.status === 0 ? 'completed' : 'judge_failed',
+    mode: result.status === 0 ? 'official_evaluator' : 'official_evaluator_failed',
+    claimBearing: false,
+    retrievalOnly: true,
+    command: `python ${args.join(' ')}`,
+    evaluatorCwd,
+    exitStatus: result.status,
+    stdout: result.stdout,
+    stderr: result.stderr,
+    outputPath,
+    metrics
+  };
+}
+
+function buildRunPaths(sessionRoot, runId) {
+  const runDir = join(sessionRoot, 'runs', runId);
+  return {
+    runDir,
+    prompt: join(runDir, 'retrieval-query.txt'),
+    setupIndex: join(runDir, 'setup-index.json'),
+    rawTrace: join(runDir, 'raw-trace.json'),
+    structuredAnswer: join(runDir, 'structured-answer.json'),
+    trajectory: join(runDir, 'trajectory.json'),
+    score: join(runDir, 'score.json'),
+    officialResults: join(runDir, 'official-results.jsonl')
+  };
+}
+
+function appendManifest(sessionRoot, row) {
+  appendFileSync(join(sessionRoot, 'run-manifest.jsonl'), `${JSON.stringify(row)}\n`, 'utf8');
+}
+
+function artifactHashIfPresent(filePath) {
+  return existsSync(filePath) ? sha256File(filePath) : null;
+}
+
+function writeSessionScratchpad(sessionRoot, task, args) {
+  const scratchpadPath = join(sessionRoot, 'RETRIEVAL-GATE-SCRATCHPAD.json');
+  const value = {
+    createdAt: new Date().toISOString(),
+    claimBearing: false,
+    evidenceType: 'retrieval_only_diagnostic',
+    claimLimits: [
+      'No agent patch was produced.',
+      'No tests were run in the target repository.',
+      'Official evaluator scores measure retrieved context overlap only.',
+      'Scorer-only gold is used after trajectories are materialized, never during retrieval.'
+    ],
+    task: {
+      instance_id: task.instance_id,
+      repo_url: task.repo_url,
+      base_commit: task.base_commit,
+      repo_checkout_path: task.repo_checkout_path,
+      problem_statement_hash: task.problem_statement_hash
+    },
+    args: {
+      lanes: args.lanes,
+      limit: args.limit,
+      window: args.window,
+      repeat: args.repeat,
+      score: args.score,
+      gold: args.score ? resolve(args.gold) : null
+    }
+  };
+  writeJson(scratchpadPath, value);
+}
+
+async function main() {
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help) {
+    help();
+    return;
+  }
+  if (!Number.isInteger(args.limit) || args.limit < 1)
+    throw new Error('--limit must be a positive integer');
+  if (!Number.isInteger(args.window) || args.window < 1)
+    throw new Error('--window must be a positive integer');
+  if (!Number.isInteger(args.repeat) || args.repeat < 1)
+    throw new Error('--repeat must be a positive integer');
+  if (!Number.isInteger(args.indexTimeoutMs) || args.indexTimeoutMs < 1)
+    throw new Error('--index-timeout-ms must be a positive integer');
+  if (!Number.isInteger(args.queryTimeoutMs) || args.queryTimeoutMs < 1)
+    throw new Error('--query-timeout-ms must be a positive integer');
+  const sessionRoot = ensurePhase40Out(args.out);
+  const task = loadTask(args.taskPayloads, args.taskId);
+  const goldPath = resolve(args.gold);
+  if (args.score && !existsSync(goldPath)) throw new Error(`gold file missing: ${goldPath}`);
+  const evaluatorCwd = args.score ? resolveEvaluatorCwd(args) : null;
+  writeSessionScratchpad(sessionRoot, task, args);
+
+  const rows = [];
+  for (const lane of args.lanes) {
+    const runId = sanitize(`${lane}-${task.instance_id}-${args.repeat}-retrieval`);
+    const paths = buildRunPaths(sessionRoot, runId);
+    if (existsSync(paths.runDir))
+      throw new Error(`run directory already exists; refusing overwrite: ${paths.runDir}`);
+    const startedAt = new Date().toISOString();
+    const retrieval = await runLane(lane, task, {
+      limit: args.limit,
+      window: args.window,
+      indexTimeoutMs: args.indexTimeoutMs,
+      queryTimeoutMs: args.queryTimeoutMs
+    });
+    const trajectory = buildTrajectory(task, retrieval);
+    const answer = buildStructuredAnswer(task, retrieval);
+    writeText(paths.prompt, task.problem_statement);
+    writeJson(paths.setupIndex, retrieval.setup);
+    writeJson(paths.rawTrace, {
+      laneId: lane,
+      claimBearing: false,
+      retrievalOnly: true,
+      notAgentTaskSuccess: true,
+      workingDirectory: task.repo_checkout_path,
+      task: {
+        instance_id: task.instance_id,
+        repo_url: task.repo_url,
+        base_commit: task.base_commit,
+        problem_statement_hash: task.problem_statement_hash
+      },
+      method: retrieval.method,
+      status: retrieval.status,
+      trace: retrieval.trace,
+      retrievedItems: retrieval.items,
+      scriptedAgentDecisions: true,
+      scorerGoldReadDuringRetrieval: false
+    });
+    writeJson(paths.structuredAnswer, answer);
+    writeJson(paths.trajectory, trajectory);
+    const score = args.score
+      ? scoreTrajectory(
+          goldPath,
+          paths.trajectory,
+          paths.officialResults,
+          evaluatorCwd,
+          join(sessionRoot, 'score-cache')
+        )
+      : {
+          status: 'not_scored',
+          mode: 'not_requested',
+          claimBearing: false,
+          retrievalOnly: true,
+          fallbackReason: 'run_without_score_flag'
+        };
+    writeJson(paths.score, score);
+    const completedAt = new Date().toISOString();
+    const row = {
+      run_id: runId,
+      lane_id: lane,
+      task_id: task.instance_id,
+      repeat_index: args.repeat,
+      status: score.status === 'completed' ? retrieval.status : score.status,
+      started_at: startedAt,
+      completed_at: completedAt,
+      raw_trace_path: paths.rawTrace,
+      structured_answer_path: paths.structuredAnswer,
+      trajectory_path: paths.trajectory,
+      score_path: paths.score,
+      setup_index_path: paths.setupIndex,
+      prompt_path: paths.prompt,
+      setupIndex: retrieval.setup,
+      taskExecution: {
+        executor: 'retrieval-script',
+        retrievalOnly: true,
+        taskWallTimeMs: new Date(completedAt).getTime() - new Date(startedAt).getTime()
+      },
+      scoring: {
+        officialEvaluatorFirst: Boolean(args.score),
+        claimBearing: false,
+        retrievalOnly: true,
+        officialResultsPath: args.score ? paths.officialResults : null
+      },
+      hashes: {
+        prompt: sha256Text(task.problem_statement),
+        rawTrace: artifactHashIfPresent(paths.rawTrace),
+        structuredAnswer: artifactHashIfPresent(paths.structuredAnswer),
+        trajectory: artifactHashIfPresent(paths.trajectory),
+        score: artifactHashIfPresent(paths.score),
+        officialResults: artifactHashIfPresent(paths.officialResults)
+      }
+    };
+    appendManifest(sessionRoot, row);
+    rows.push(row);
+  }
+  writeJson(join(sessionRoot, 'RETRIEVAL-GATE-SUMMARY.json'), {
+    completedAt: new Date().toISOString(),
+    claimBearing: false,
+    retrievalOnly: true,
+    taskId: task.instance_id,
+    rows
+  });
+  console.log(`retrieval gate wrote ${join(sessionRoot, 'RETRIEVAL-GATE-SUMMARY.json')}`);
+}
+
+try {
+  await main();
+} catch (error) {
+  console.error(error instanceof Error ? error.message : String(error));
+  process.exit(1);
+}
diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs
new file mode 100644
index 0000000..11285f4
--- /dev/null
+++ b/scripts/contextbench-runner.mjs
@@ -0,0 +1,3586 @@
+#!/usr/bin/env node
+import { createHash } from 'node:crypto';
+import { execFileSync, spawnSync } from 'node:child_process';
+import {
+  appendFileSync,
+  existsSync,
+  mkdirSync,
+  readdirSync,
+  readFileSync,
+  statSync,
+  writeFileSync
+} from 'node:fs';
+import { dirname, isAbsolute, join, relative, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import os from 'node:os';
+
+const RUNNER_SOURCE_PATH = fileURLToPath(import.meta.url); // Filesystem path of this script; hashed by runnerSourceHash().
+
+const FIXTURES = { // Relative fixture paths; readJson() resolves them against the process working directory.
+  protocol: 'tests/fixtures/contextbench-benchmark-protocol.json',
+  lanes: 'tests/fixtures/contextbench-lanes.json',
+  corrections: 'tests/fixtures/contextbench-corrections.json',
+  manifest: 'tests/fixtures/contextbench-task-manifest.json',
+  laneToolCards: 'tests/fixtures/contextbench-lane-tool-cards.json',
+  laneSetupEvidence: 'tests/fixtures/contextbench-lane-setup-evidence.json',
+  codebaseContextBaselineArms: 'tests/fixtures/contextbench-codebase-context-baseline-arms.json' // Not read by loadFixtures().
+};
+
+const TERMINAL_LANE_SETUP_STATUSES = new Set([ // Readiness statuses that validateLaneSetupEvidence() accepts as terminal.
+  'ready_for_phase40',
+  'setup_failed',
+  'index_failed',
+  'tool_error',
+  'invasive_setup_blocked'
+]);
+
+const CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS = [ // Top-level keys listed as `required` by the answer schema below.
+  'answer',
+  'confidence',
+  'evidence',
+  'filesReferenced',
+  'symbolsReferenced',
+  'unsupportedClaims',
+  'readyToEdit'
+];
+
+const CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA = { // JSON Schema for a structured answer; unknown keys are rejected at every object level.
+  type: 'object',
+  additionalProperties: false,
+  required: CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS,
+  properties: {
+    answer: { type: ['object', 'array', 'string', 'number', 'boolean', 'null'] }, // Any JSON value is allowed.
+    confidence: { type: 'string', enum: ['low', 'medium', 'high'] },
+    evidence: {
+      type: 'array',
+      items: {
+        type: 'object',
+        additionalProperties: false,
+        required: ['file', 'lineRange', 'reason'],
+        properties: {
+          file: { type: 'string', minLength: 1 },
+          lineRange: {
+            type: 'object',
+            additionalProperties: false,
+            required: ['start', 'end'],
+            properties: {
+              start: { type: 'integer', minimum: 1 }, // The schema does not itself enforce start <= end.
+              end: { type: 'integer', minimum: 1 }
+            }
+          },
+          reason: { type: 'string', minLength: 1 }
+        }
+      }
+    },
+    filesReferenced: { type: 'array', items: { type: 'string' } },
+    symbolsReferenced: { type: 'array', items: { type: 'string' } },
+    unsupportedClaims: { type: 'array', items: { type: 'string' } },
+    readyToEdit: { type: 'boolean' }
+  }
+};
+
+const EVIDENCE_REFERENCE_FIELDS = ['file', 'lineRange', 'reason']; // Same keys as the schema's evidence-item `required` list.
+const LINE_RANGE_FIELDS = ['start', 'end']; // Same keys as the schema's lineRange `required` list.
+
+function diagnosticFallbackScoring(fixtures, fallbackReason, extra = {}) {
+  return {
+    officialEvaluatorFirst: false,
+    officialEvaluatorAttempted: false,
+    officialEvaluatorInvoked: false,
+    command: fixtures.protocol.benchmarkTarget.officialEvaluatorCommand,
+    claimBearing: false,
+    fallbackReason,
+    ...extra
+  };
+}
+
+function officialEvaluatorCommandParts() {
+  const override = process.env.CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND;
+  if (!override) return { command: 'python', prefixArgs: ['-m', 'contextbench.evaluate'] };
+  let parts;
+  try {
+    parts = JSON.parse(override);
+  } catch {
+    throw new Error('CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND must be a JSON array');
+  }
+  if (
+    !Array.isArray(parts) ||
+    parts.length === 0 ||
+    parts.some((part) => typeof part !== 'string')
+  ) {
+    throw new Error('CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND must be a non-empty JSON string array');
+  }
+  return { command: parts[0], prefixArgs: parts.slice(1) };
+}
+
+const BLOCKED_LANE_SETUP_STATUSES = new Set([ // Terminal statuses other than 'ready_for_phase40'; these trigger validateTerminalBlockedEvidence().
+  'setup_failed',
+  'index_failed',
+  'tool_error',
+  'invasive_setup_blocked'
+]);
+
+function help() { // Print the runner's usage lines, mode descriptions, and boundary notes to stdout.
+  console.log(`ContextBench Phase 38/39/40 runner
+
+Usage:
+  node scripts/contextbench-runner.mjs --help
+  node scripts/contextbench-runner.mjs --validate-fixtures
+  node scripts/contextbench-runner.mjs --validate-lane-setup
+  node scripts/contextbench-runner.mjs --baseline-snapshot --out benchmark-runs/contextbench/phase40/<session_id>
+  node scripts/contextbench-runner.mjs --baseline-snapshot --out benchmark-runs/contextbench/phase41/<session_id>
+  node scripts/contextbench-runner.mjs --baseline-run --session benchmark-runs/contextbench/phase40/<session_id> --executor fake --lane <lane-id> --task-id <instance-id> --repeat <n>
+  node scripts/contextbench-runner.mjs --baseline-run --session benchmark-runs/contextbench/phase41/<session_id> --executor claude --task-payloads <task-payloads.json> --lane <lane-id> --task-id <instance-id> --repeat <n>
+  node scripts/contextbench-runner.mjs --baseline-refresh --session benchmark-runs/contextbench/phase41/<session_id>
+  node scripts/contextbench-runner.mjs --baseline-validate --session benchmark-runs/contextbench/phase41/<session_id>
+  node scripts/contextbench-runner.mjs --baseline-seal --session benchmark-runs/contextbench/phase41/<session_id>
+  node scripts/contextbench-runner.mjs --phase42-verify --session benchmark-runs/contextbench/phase41/<session_id> [--out report.json] [--quiet]
+  node scripts/contextbench-runner.mjs --setup-index-measure --session benchmark-runs/contextbench/phase41/<session_id> --lane raw-native
+  node scripts/contextbench-runner.mjs --setup-index-import --session benchmark-runs/contextbench/phase41/<session_id> --lane <lane-id> --input setup-index.json
+  node scripts/contextbench-runner.mjs --baseline-validate-arms tests/fixtures/contextbench-codebase-context-baseline-arms.json
+  node scripts/contextbench-runner.mjs --print-claude-args --model haiku
+  node scripts/contextbench-runner.mjs --print-answer-schema
+  node scripts/contextbench-runner.mjs --dry-run --executor fake --lane <lane-id> --task-id <instance-id> --repeat <n> --out <dir>
+  node scripts/contextbench-runner.mjs --score-probe --out <dir>
+
+Modes:
+  --validate-fixtures  Validate frozen protocol, task manifest, lane governance, and lane tool cards.
+  --validate-lane-setup  Validate Phase 39 setup/index readiness or terminal blocker evidence only.
+  --baseline-snapshot  Capture dirty-worktree state before any Phase 40 baseline attempt.
+  --baseline-run       Write a baseline attempt row and artifacts. Fake executor is test-only; live executors require task payloads and materialized checkouts.
+  --baseline-refresh   Re-hash an interrupted Phase 40/41 session without running live agents.
+  --baseline-validate  Validate a Phase 40/41 session root, hashes, reservations, rows, and artifact paths.
+  --baseline-seal      Seal only after terminal evidence and the Phase 42 evidence gate both pass.
+  --phase42-verify     Read-only Phase 42 evidence gate over a Phase 40/41 session; exits non-zero unless claim-pass.
+  --quiet              With --phase42-verify, write only the concise pass/fail line to stdout.
+  --setup-index-measure  Capture safe setup/index measurement artifacts before task execution.
+  --setup-index-import   Import pre-captured setup/index evidence without running setup commands.
+  --baseline-validate-arms  Validate diagnostic codebase-context baseline arm metadata.
+  --print-claude-args  Print the Claude CLI args used for schema-gated live attempts.
+  --print-answer-schema  Print the structured answer JSON Schema used by live attempts.
+  --dry-run            Write non-claim-bearing fake-executor smoke artifacts and one append-only manifest row.
+  --score-probe        Write a synthetic non-claim-bearing diagnostic fallback artifact without live Claude.
+
+Phase 39 boundary:
+  Lane setup validation and probes are readiness/blocker evidence only, always claimBearing=false.
+  Phase 40 owns dirty-worktree baseline capture, task x repeat execution, and non-claim-bearing baseline artifacts while claimAllowed=false.
+
+Anti-scripting boundary:
+  This runner standardizes prompt, lane card, budgets, traces, structured answer JSON, trajectory, and score artifacts.
+  It must not script agent decisions, file selection, query rewrites, answer content, or evidence selection.
+`);
+}
+
+function parseArgs(argv) {
+  const args = { repeat: 1 };
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (arg === '--help' || arg === '-h') args.help = true;
+    else if (arg === '--validate-fixtures') args.validateFixtures = true;
+    else if (arg === '--validate-lane-setup') args.validateLaneSetup = true;
+    else if (arg === '--baseline-snapshot') args.baselineSnapshot = true;
+    else if (arg === '--baseline-run') args.baselineRun = true;
+    else if (arg === '--baseline-refresh') args.baselineRefresh = true;
+    else if (arg === '--baseline-validate') args.baselineValidate = true;
+    else if (arg === '--baseline-seal') args.baselineSeal = true;
+    else if (arg === '--phase42-verify') args.phase42Verify = true;
+    else if (arg === '--setup-index-measure') args.setupIndexMeasure = true;
+    else if (arg === '--setup-index-import') args.setupIndexImport = true;
+    else if (arg === '--quiet') args.quiet = true;
+    else if (arg === '--baseline-validate-arms') args.baselineValidateArms = argv[++i] ?? '';
+    else if (arg === '--baseline-run-codebase-context-arms')
+      args.baselineRunCodebaseContextArms = true;
+    else if (arg === '--print-claude-args') args.printClaudeArgs = true;
+    else if (arg === '--print-answer-schema') args.printAnswerSchema = true;
+    else if (arg === '--dry-run') args.dryRun = true;
+    else if (arg === '--score-probe') args.scoreProbe = true;
+    else if (arg === '--executor') args.executor = argv[++i] ?? '';
+    else if (arg === '--model') args.model = argv[++i] ?? '';
+    else if (arg === '--lane') args.lane = argv[++i] ?? '';
+    else if (arg === '--task-id') args.taskId = argv[++i] ?? '';
+    else if (arg === '--repeat') args.repeat = Number(argv[++i] ?? '1');
+    else if (arg === '--repeats') args.repeats = Number(argv[++i] ?? '1');
+    else if (arg === '--max-attempts') args.maxAttempts = Number(argv[++i] ?? '0');
+    else if (arg === '--timeout-ms') args.timeoutMs = Number(argv[++i] ?? '0');
+    else if (arg === '--task-payloads') args.taskPayloads = argv[++i] ?? '';
+    else if (arg === '--input') args.input = argv[++i] ?? '';
+    else if (arg === '--fake-answer-mode') args.fakeAnswerMode = argv[++i] ?? 'valid';
+    else if (arg === '--all-ready-lanes') args.allReadyLanes = true;
+    else if (arg === '--out') args.out = argv[++i] ?? '';
+    else if (arg === '--session') args.session = argv[++i] ?? '';
+    else throw new Error(`Unknown argument: ${arg}`);
+  }
+  return args;
+}
+
+function stableStringify(value) {
+  if (value === null || typeof value !== 'object') return JSON.stringify(value);
+  if (Array.isArray(value)) return `[${value.map((item) => stableStringify(item)).join(',')}]`;
+  const entries = Object.entries(value).sort(([a], [b]) => a.localeCompare(b));
+  return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`).join(',')}}`;
+}
+
+function sha256(value) {
+  return `sha256:${createHash('sha256').update(value, 'utf8').digest('hex')}`;
+}
+
+/**
+ * Reduce a dataset field to a canonical string suitable for hashing.
+ *
+ * `undefined` and `null` map to their literal names. Non-strings are serialized
+ * with `stableStringify`. Strings get CR/CRLF normalized to LF; when the trimmed
+ * text starts with `{` or `[` and parses as JSON, the key-sorted serialization
+ * of the parsed value is returned instead.
+ *
+ * @param {*} value Field value.
+ * @returns {string} Canonical text.
+ */
+function canonicalizeDatasetField(value) {
+  if (value === undefined) return 'undefined';
+  if (value === null) return 'null';
+  if (typeof value !== 'string') return stableStringify(value).replace(/\r\n?/g, '\n');
+
+  const unixText = value.replace(/\r\n?/g, '\n');
+  const body = unixText.trim();
+  const looksLikeJson = body.startsWith('{') || body.startsWith('[');
+  if (!looksLikeJson) return unixText;
+  try {
+    return stableStringify(JSON.parse(body));
+  } catch {
+    // Not actually JSON: keep the newline-normalized original text.
+    return unixText;
+  }
+}
+
+function sha256Buffer(value) {
+  return `sha256:${createHash('sha256').update(value).digest('hex')}`;
+}
+
+function hashFile(filePath) {
+  return sha256Buffer(readFileSync(filePath));
+}
+
+function runnerSourceHash() {
+  return hashFile(RUNNER_SOURCE_PATH);
+}
+
+function hashObject(value) {
+  return sha256(stableStringify(value));
+}
+
+function readJson(filePath) {
+  return JSON.parse(readFileSync(filePath, 'utf8'));
+}
+
+function readTaskPayloads(payloadPath) {
+  if (!payloadPath) return new Map();
+  const absolutePath = isAbsolute(payloadPath) ? payloadPath : resolve(process.cwd(), payloadPath);
+  const payload = readJson(absolutePath);
+  const entries = Array.isArray(payload?.tasks)
+    ? payload.tasks
+    : Object.entries(payload?.tasksById ?? payload ?? {}).map(([instanceId, value]) => ({
+        instance_id: instanceId,
+        ...value
+      }));
+  return new Map(
+    entries
+      .filter((entry) => entry && typeof entry.instance_id === 'string')
+      .map((entry) => [entry.instance_id, entry])
+  );
+}
+
+function executorCommandIsOverridden(executor) {
+  if (executor === 'claude') return Boolean(process.env.CONTEXTBENCH_CLAUDE_COMMAND);
+  if (executor === 'codex') return Boolean(process.env.CONTEXTBENCH_CODEX_COMMAND);
+  if (executor === 'gemini') return Boolean(process.env.CONTEXTBENCH_GEMINI_COMMAND);
+  if (executor === 'opencode') return Boolean(process.env.CONTEXTBENCH_OPENCODE_COMMAND);
+  return false;
+}
+
+function gitOutput(cwd, args) {
+  try {
+    return execFileSync('git', ['-c', 'core.longpaths=true', '-c', 'core.autocrlf=false', ...args], {
+      cwd,
+      encoding: 'utf8',
+      input: '',
+      stdio: ['pipe', 'pipe', 'ignore']
+    }).trim();
+  } catch {
+    return null;
+  }
+}
+
+function resolveTaskContext(task, payloads, executor) { // Check a task's payload and checkout before a live run; returns the findings plus an `errors` list.
+  if (executor === 'fake') return { materialized: false, errors: [] }; // The fake executor skips payload and checkout verification.
+  const payload = payloads.get(task.instance_id);
+  const errors = [];
+  if (!payload) errors.push('missing_task_payload');
+  const problemStatement =
+    typeof payload?.problem_statement === 'string' ? payload.problem_statement : '';
+  if (!problemStatement.trim()) errors.push('missing_problem_statement');
+  const repoCheckoutPath =
+    typeof payload?.repo_checkout_path === 'string' ? payload.repo_checkout_path : '';
+  if (!repoCheckoutPath.trim()) errors.push('missing_repo_checkout_path');
+  const absoluteCheckoutPath = repoCheckoutPath // Relative checkout paths resolve against the process working directory.
+    ? isAbsolute(repoCheckoutPath)
+      ? repoCheckoutPath
+      : resolve(process.cwd(), repoCheckoutPath)
+    : '';
+  if (absoluteCheckoutPath && !existsSync(absoluteCheckoutPath))
+    errors.push('repo_checkout_missing');
+  const actualHead = absoluteCheckoutPath // gitOutput() yields null when git fails, e.g. outside a repository.
+    ? gitOutput(absoluteCheckoutPath, ['rev-parse', 'HEAD'])
+    : null;
+  const statusShort = absoluteCheckoutPath
+    ? gitOutput(absoluteCheckoutPath, ['status', '--short'])
+    : null;
+  const remoteUrl = absoluteCheckoutPath
+    ? gitOutput(absoluteCheckoutPath, ['remote', 'get-url', 'origin'])
+    : null;
+  if (absoluteCheckoutPath && !actualHead) errors.push('repo_checkout_not_git');
+  if (actualHead && statusShort) errors.push('repo_checkout_dirty'); // Any `git status --short` output counts as dirty.
+  const problemStatementHash = problemStatement
+    ? sha256(canonicalizeDatasetField(problemStatement))
+    : null;
+  const overridden = executorCommandIsOverridden(executor); // With an override command set, hash/commit mismatches are recorded but not reported as errors.
+  const problemStatementHashVerified = problemStatementHash === task.problem_statement_hash;
+  const baseCommitVerified = actualHead === task.base_commit;
+  if (!overridden && problemStatement && !problemStatementHashVerified)
+    errors.push('problem_statement_hash_mismatch');
+  if (!overridden && actualHead && !baseCommitVerified) errors.push('base_commit_mismatch');
+  return {
+    materialized: errors.length === 0, // True only when no check above failed.
+    errors,
+    problemStatement,
+    problemStatementHash,
+    problemStatementHashVerified,
+    repoCheckoutPath: absoluteCheckoutPath || null,
+    actualHead,
+    statusShort,
+    baseCommitVerified,
+    remoteUrl,
+    verificationStrict: !overridden
+  };
+}
+
+function writeJson(filePath, value) {
+  mkdirSync(dirname(filePath), { recursive: true });
+  writeFileSync(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8');
+}
+
+function artifactEntry(filePath, rootDir) {
+  const stats = statSync(filePath);
+  return {
+    path: normalizePath(relative(rootDir, filePath)),
+    hash: hashFile(filePath),
+    bytes: stats.size
+  };
+}
+
+function writeTextArtifact(filePath, value) {
+  mkdirSync(dirname(filePath), { recursive: true });
+  writeFileSync(filePath, value, 'utf8');
+}
+
+function sanitize(value) {
+  return value
+    .replace(/[^a-zA-Z0-9._-]+/g, '-')
+    .replace(/^-+|-+$/g, '')
+    .slice(0, 160);
+}
+
+function loadFixtures() {
+  return {
+    protocol: readJson(FIXTURES.protocol),
+    lanes: readJson(FIXTURES.lanes),
+    corrections: readJson(FIXTURES.corrections),
+    manifest: readJson(FIXTURES.manifest),
+    laneToolCards: readJson(FIXTURES.laneToolCards),
+    laneSetupEvidence: readJson(FIXTURES.laneSetupEvidence)
+  };
+}
+
+function hasPendingPhase39Placeholder(card) {
+  return [card.setupCommand, card.indexCommand, card.queryCommand, card.versionCommand].some(
+    (command) => String(command).toLowerCase().includes('pending phase 39')
+  );
+}
+
+function validateCommandEvidence(record, errors) {
+  const commandKinds = new Set(record.commands?.map((command) => command.kind));
+  for (const kind of ['setup', 'index', 'query', 'version']) {
+    if (!commandKinds.has(kind))
+      errors.push(`lane ${record.laneId} missing ${kind} command evidence`);
+  }
+  for (const command of record.commands ?? []) {
+    if (!command.command || !command.cwd || !command.status) {
+      errors.push(
+        `lane ${record.laneId} has incomplete ${command.kind ?? 'unknown'} command evidence`
+      );
+    }
+    if (
+      command.durationMs !== null &&
+      (!Number.isFinite(command.durationMs) || command.durationMs < 0)
+    ) {
+      errors.push(`lane ${record.laneId} has invalid ${command.kind} duration`);
+    }
+  }
+}
+
+function validateTerminalBlockedEvidence(record, errors) {
+  if (!record.logReference && !(record.commands ?? []).some((command) => command.outputHash)) {
+    errors.push(
+      `lane ${record.laneId} blocked/failed evidence needs a log reference or output hash`
+    );
+  }
+  if (!record.nextHumanAction || record.nextHumanAction.length < 20) {
+    errors.push(`lane ${record.laneId} blocked/failed evidence needs next human action`);
+  }
+  const hasBlockedCommand = (record.commands ?? []).some((command) =>
+    ['blocked', 'failed'].includes(command.status)
+  );
+  if (!hasBlockedCommand)
+    errors.push(`lane ${record.laneId} blocked/failed evidence needs blocked or failed command`);
+}
+
+/**
+ * Cross-checks every lane against its tool card and its setup-evidence record.
+ *
+ * Problems are accumulated instead of thrown one by one, so a single run reports every
+ * failing lane at once.
+ *
+ * @param {object} [fixtures] Fixture bundle to check; loaded via loadFixtures() when omitted.
+ * @returns {object} The fixtures that were checked, when no problem was found.
+ * @throws {Error} When any check fails; the message lists each failure on its own line.
+ */
+function validateLaneSetupEvidence(fixtures = loadFixtures()) {
+  const errors = [];
+  const setupEvidence = fixtures.laneSetupEvidence;
+  if (setupEvidence.claimBearing !== false) {
+    errors.push('lane setup evidence must be non-claim-bearing');
+  }
+  const outputsPolicy = String(setupEvidence.generatedOutputsPolicy ?? '');
+  if (!outputsPolicy.includes('not Phase 40 baseline artifacts')) {
+    errors.push(
+      'lane setup evidence must keep generated outputs outside Phase 40 baseline artifacts'
+    );
+  }
+
+  const cardsByLane = new Map();
+  for (const card of fixtures.laneToolCards.cards) cardsByLane.set(card.laneId, card);
+  const evidenceByLane = new Map();
+  for (const record of setupEvidence.records) evidenceByLane.set(record.laneId, record);
+
+  // null means "not measured"; any other value must be a finite, non-negative number.
+  const isBadDuration = (value) => value !== null && !(Number.isFinite(value) && value >= 0);
+
+  for (const lane of fixtures.lanes.lanes) {
+    const card = cardsByLane.get(lane.laneId);
+    if (!card) {
+      errors.push(`missing lane tool card for ${lane.laneId}`);
+      continue;
+    }
+    const record = evidenceByLane.get(lane.laneId);
+    if (!record) {
+      errors.push(`missing lane setup evidence for ${lane.laneId}`);
+      continue;
+    }
+
+    const readiness = record.readinessStatus;
+    const isBlocked = BLOCKED_LANE_SETUP_STATUSES.has(readiness);
+    const isContextLane = lane.laneId !== 'raw-native';
+
+    if (readiness === 'pending') {
+      errors.push(`lane ${lane.laneId} remains pending`);
+    }
+    if (!TERMINAL_LANE_SETUP_STATUSES.has(readiness)) {
+      errors.push(`lane ${lane.laneId} has non-terminal setup status ${record.readinessStatus}`);
+    }
+    if (card.phase39Status !== readiness) {
+      errors.push(`lane ${lane.laneId} card/evidence status mismatch`);
+    }
+
+    // A leftover placeholder is always reported; it is reported a second time (with a
+    // different message) when the lane is not backed by blocked/failed evidence.
+    const stillHasPlaceholder = hasPendingPhase39Placeholder(card);
+    if (stillHasPlaceholder && !isBlocked) {
+      errors.push(
+        `lane ${lane.laneId} has unresolved pending Phase 39 command without terminal blocker evidence`
+      );
+    }
+    if (stillHasPlaceholder) {
+      errors.push(`lane ${lane.laneId} still has pending Phase 39 command text`);
+    }
+    if (record.claimBearing !== false) {
+      errors.push(`lane ${lane.laneId} setup evidence must be non-claim-bearing`);
+    }
+
+    if (isContextLane) {
+      if (card.contextTools.length !== 1 || card.allowedTools.length !== 1) {
+        errors.push(`lane ${lane.laneId} must expose exactly one context tool`);
+      }
+      for (const nativeTool of ['native-read', 'native-search', 'native-shell-readonly']) {
+        if (!card.disallowedTools.includes(nativeTool)) {
+          errors.push(`lane ${lane.laneId} must disallow ${nativeTool}`);
+        }
+      }
+    }
+
+    const costsSeparated =
+      card.setupCostReportedSeparately === true && card.indexCostReportedSeparately === true;
+    if (!costsSeparated) {
+      errors.push(`lane ${lane.laneId} must separate setup/index cost`);
+    }
+    if ('taskWallTimeMs' in record) {
+      errors.push(`lane ${lane.laneId} setup evidence must not include task wall time`);
+    }
+    if (isBadDuration(record.setupDurationMs)) {
+      errors.push(`lane ${lane.laneId} has invalid setup duration`);
+    }
+    if (isBadDuration(record.indexDurationMs)) {
+      errors.push(`lane ${lane.laneId} has invalid index duration`);
+    }
+
+    validateCommandEvidence(record, errors);
+    if (isBlocked) {
+      validateTerminalBlockedEvidence(record, errors);
+    }
+  }
+
+  if (errors.length > 0) {
+    throw new Error(`lane setup validation failed:\n- ${errors.join('\n- ')}`);
+  }
+  return fixtures;
+}
+
+/**
+ * Loads the fixture bundle and verifies it: manifest hash and task count, protocol flags,
+ * lane tool cards, and the failure taxonomy. Lane setup evidence is validated afterwards.
+ *
+ * @returns {object} The loaded fixtures when every check passes.
+ * @throws {Error} When any fixture check fails (all failures are listed in the message),
+ *   or when validateLaneSetupEvidence() rejects the bundle.
+ */
+function validateFixtures() {
+  const fixtures = loadFixtures();
+  const errors = [];
+
+  // The stored hash must match the hash of the manifest with the hash field removed.
+  const { manifest_hash: declaredManifestHash, ...manifestWithoutHash } = fixtures.manifest;
+  if (declaredManifestHash !== hashObject(manifestWithoutHash)) {
+    errors.push('task manifest hash mismatch');
+  }
+  if (fixtures.manifest.tasks.length !== 20) {
+    errors.push('task manifest must contain exactly 20 tasks');
+  }
+
+  const { protocol } = fixtures;
+  if (protocol.claimAllowed !== false) {
+    errors.push('protocol claimAllowed must remain false');
+  }
+  if (!protocol.benchmarkTarget.officialEvaluatorFirst) {
+    errors.push('official evaluator must be first');
+  }
+  if (!protocol.budgets.setupAndIndexingReportedSeparately) {
+    errors.push('setup/indexing must be separate');
+  }
+
+  const cardsByLane = new Map();
+  for (const card of fixtures.laneToolCards.cards) cardsByLane.set(card.laneId, card);
+  for (const laneId of fixtures.lanes.broadClaimLaneSet) {
+    if (!cardsByLane.has(laneId)) {
+      errors.push(`missing lane tool card for ${laneId}`);
+    }
+  }
+
+  for (const lane of fixtures.lanes.lanes) {
+    const card = cardsByLane.get(lane.laneId);
+    // Lanes without a card are skipped here.
+    if (!card) continue;
+
+    for (const field of fixtures.lanes.laneToolCardRequiredFields) {
+      const value = card[field];
+      if (value === undefined || value === '') {
+        errors.push(`lane ${lane.laneId} missing ${field}`);
+      }
+    }
+    const costsSeparated =
+      card.setupCostReportedSeparately === true && card.indexCostReportedSeparately === true;
+    if (!costsSeparated) {
+      errors.push(`lane ${lane.laneId} must separate setup/index cost`);
+    }
+    if (card.disallowedTools.includes(lane.contextTool)) {
+      errors.push(`lane ${lane.laneId} disallows its own context tool`);
+    }
+    if (lane.laneId !== 'raw-native') {
+      if (card.contextTools.length !== 1) {
+        errors.push(`lane ${lane.laneId} must expose one context tool`);
+      }
+      if (card.allowedTools.length !== 1) {
+        errors.push(`lane ${lane.laneId} must allow only its context tool`);
+      }
+    }
+    if (lane.phase36Status === 'deferred_to_phase39' && card.executableInPhase38) {
+      errors.push(`lane ${lane.laneId} must remain pending Phase 39`);
+    }
+  }
+
+  const terminalStatuses = protocol.runManifestSchema.terminalStatuses;
+  for (const status of protocol.failureTaxonomy) {
+    if (!terminalStatuses.includes(status)) {
+      errors.push(`failure status ${status} missing from terminal statuses`);
+    }
+  }
+
+  if (errors.length > 0) {
+    throw new Error(`fixture validation failed:\n- ${errors.join('\n- ')}`);
+  }
+  validateLaneSetupEvidence(fixtures);
+  return fixtures;
+}
+
+/**
+ * Converts a path to a forward-slash form with no leading "./" and no leading slashes.
+ *
+ * @param {string} filePath Path using either separator style.
+ * @returns {string} The normalized path.
+ */
+function normalizePath(filePath) {
+  const forwardSlashed = filePath.replace(/\\/g, '/');
+  const withoutDotPrefix = forwardSlashed.replace(/^\.\//, '');
+  return withoutDotPrefix.replace(/^\/+/, '');
+}
+
+/**
+ * Reports whether candidatePath is parentPath itself or lies somewhere beneath it.
+ *
+ * @param {string} parentPath Directory that must contain the candidate.
+ * @param {string} candidatePath Path to test.
+ * @returns {boolean} True when the candidate does not escape the parent.
+ */
+function isPathInside(parentPath, candidatePath) {
+  const relativePath = relative(parentPath, candidatePath);
+  if (relativePath === '') return true;
+  // Only a leading ".." *segment* escapes the parent. A plain startsWith('..') would also
+  // reject legitimate children such as "..cache/file", whose name merely begins with dots.
+  const escapesParent = /^\.\.(?:[\\/]|$)/.test(relativePath);
+  return !escapesParent && !isAbsolute(relativePath);
+}
+
+/**
+ * Builds a trajectory record for a task from a structured answer.
+ *
+ * Each evidence item contributes a line-range span for its file; each referenced file
+ * that has no evidence span is recorded as a single full-file span.
+ *
+ * @param {object} task Task entry; instance_id, repo_url and base_commit are read.
+ * @param {object} answer Answer with `evidence` ({ file, lineRange }) and `filesReferenced`.
+ * @returns {object} Trajectory with sorted predicted files, spans, and an empty model_patch.
+ */
+function buildTrajectory(task, answer) {
+  const spans = {};
+  const files = new Set();
+
+  for (const evidence of answer.evidence) {
+    const file = normalizePath(evidence.file);
+    files.add(file);
+    const { start, end } = evidence.lineRange;
+    const existing = spans[file] ?? [];
+    spans[file] = existing.concat({ start, end, full_file: false });
+  }
+
+  for (const fileRef of answer.filesReferenced) {
+    const file = normalizePath(fileRef);
+    files.add(file);
+    // Do not overwrite precise evidence spans with a full-file span.
+    if (spans[file]) continue;
+    spans[file] = [{ start: 1, end: null, full_file: true }];
+  }
+
+  const predFiles = Array.from(files).sort();
+  const trajData = {
+    pred_steps: [{ files: predFiles, spans }],
+    pred_files: predFiles,
+    pred_spans: spans
+  };
+  return {
+    instance_id: task.instance_id,
+    repo_url: task.repo_url,
+    commit: task.base_commit,
+    traj_data: trajData,
+    model_patch: ''
+  };
+}
+
+/**
+ * Extracts the phase number (40 or 41) from a baseline session root path.
+ *
+ * @param {string} sessionRoot Session directory; resolved before it is inspected.
+ * @returns {number} 40 or 41.
+ * @throws {Error} When the path is not under benchmark-runs/contextbench/phase40|41/,
+ *   or when it contains an /outputs/ segment.
+ */
+function baselineSessionPhase(sessionRoot) {
+  const normalized = normalizePath(resolve(sessionRoot));
+  const phaseMatch = /\/benchmark-runs\/contextbench\/phase(40|41)\//.exec(normalized);
+  if (phaseMatch === null) {
+    throw new Error(
+      'Phase 40/41 baseline artifacts must be written under benchmark-runs/contextbench/phase40/<session_id> or benchmark-runs/contextbench/phase41/<session_id>'
+    );
+  }
+  if (normalized.includes('/outputs/')) {
+    throw new Error('Phase 40/41 baseline artifacts must not be written under outputs/');
+  }
+  const [, phaseDigits] = phaseMatch;
+  return Number(phaseDigits);
+}
+
+/**
+ * Resolves a session root and confirms it is an acceptable baseline location.
+ *
+ * @param {string} sessionRoot Candidate session directory.
+ * @returns {string} The resolved session root.
+ * @throws {Error} Whatever baselineSessionPhase() throws for an unacceptable location.
+ */
+function ensureBaselineSessionRoot(sessionRoot) {
+  const absoluteRoot = resolve(sessionRoot);
+  // Called only for its validation; the phase number is not needed here.
+  baselineSessionPhase(absoluteRoot);
+  return absoluteRoot;
+}
+
+/**
+ * Renders a command and its arguments as one space-separated string.
+ *
+ * @param {string} command Executable name.
+ * @param {string[]} [args] Arguments, in order.
+ * @returns {string} The joined label.
+ */
+function commandLabel(command, args = []) {
+  const parts = [command];
+  for (const arg of args) {
+    parts.push(arg);
+  }
+  return parts.join(' ');
+}
+
+/**
+ * Runs a command and returns its trimmed stdout without ever throwing.
+ *
+ * @param {string} command Executable to run.
+ * @param {string[]} [args] Arguments passed to the executable.
+ * @returns {string} Trimmed stdout on success; on failure the trimmed stderr text, or
+ *   'unavailable' when no stderr text exists (for example when the binary is missing).
+ */
+function safeExec(command, args = []) {
+  try {
+    return execFileSync(command, args, {
+      encoding: 'utf8',
+      stdio: ['ignore', 'pipe', 'pipe']
+    }).trim();
+  } catch (error) {
+    // When the process could not be spawned at all, the error carries `stderr: null`.
+    // Stringifying that would produce the literal text "null", so only real string
+    // output is accepted and everything else falls back to 'unavailable'.
+    const stderr = typeof error?.stderr === 'string' ? error.stderr : '';
+    return stderr.trim() || 'unavailable';
+  }
+}
+
+/**
+ * Runs a command synchronously with empty stdin and stores its stdout and stderr as
+ * `<label>.stdout.log` and `<label>.stderr.log` inside logsDir.
+ *
+ * @param {string} command Executable to run.
+ * @param {string[]} args Arguments for the executable.
+ * @param {string} cwd Working directory for the child process.
+ * @param {string} logsDir Directory that receives the two log files.
+ * @param {string} label Base name of the log files.
+ * @returns {object} Command label, cwd, exit code (null when no numeric status exists),
+ *   elapsed milliseconds, both log paths, and a hash over stdout and stderr.
+ */
+function captureCommand(command, args, cwd, logsDir, label) {
+  const startedAt = Date.now();
+  const { stdout, stderr, status } = spawnSync(command, args, {
+    cwd,
+    encoding: 'utf8',
+    input: ''
+  });
+  const durationMs = Date.now() - startedAt;
+
+  const stdoutText = stdout ?? '';
+  const stderrText = stderr ?? '';
+  const stdoutPath = join(logsDir, `${label}.stdout.log`);
+  const stderrPath = join(logsDir, `${label}.stderr.log`);
+  writeTextArtifact(stdoutPath, stdoutText);
+  writeTextArtifact(stderrPath, stderrText);
+
+  const exitCode = typeof status === 'number' ? status : null;
+  return {
+    command: commandLabel(command, args),
+    cwd,
+    exitCode,
+    durationMs,
+    stdoutPath,
+    stderrPath,
+    outputHash: sha256(`${stdoutText}\n${stderrText}`)
+  };
+}
+
+/**
+ * Hashes every fixture file listed in FIXTURES that currently exists on disk.
+ *
+ * @returns {object} Map of fixture name to file hash; missing files are left out.
+ */
+function fixtureHashes() {
+  const hashedPairs = [];
+  for (const [name, filePath] of Object.entries(FIXTURES)) {
+    if (!existsSync(filePath)) continue;
+    hashedPairs.push([name, hashFile(filePath)]);
+  }
+  return Object.fromEntries(hashedPairs);
+}
+
+/**
+ * Lists the names (never the values) of environment variables whose name matches a
+ * sensitive-looking pattern.
+ *
+ * @returns {string[]} Matching variable names in sorted order.
+ */
+function redactedEnvVarNames() {
+  const sensitiveName = /TOKEN|KEY|SECRET|PASSWORD|AUTH|OPENAI|ANTHROPIC|CLAUDE/i;
+  const matches = [];
+  for (const name of Object.keys(process.env)) {
+    if (sensitiveName.test(name)) matches.push(name);
+  }
+  matches.sort();
+  return matches;
+}
+
+/**
+ * Collects platform details and the `--version` output of the external tools queried
+ * below.
+ *
+ * @returns {object} os, arch, shell and node fields followed by one field per tool, each
+ *   holding whatever safeExec() returned for `<tool> --version`.
+ */
+function versionSnapshot() {
+  const snapshot = {
+    os: `${os.platform()} ${os.release()}`,
+    arch: os.arch(),
+    shell: process.env.SHELL ?? process.env.ComSpec ?? 'unknown',
+    node: process.version
+  };
+  // Queried one after another, in the order the fields appear in the result.
+  const tools = ['npm', 'pnpm', 'git', 'python', 'uv', 'claude'];
+  for (const tool of tools) {
+    snapshot[tool] = safeExec(tool, ['--version']);
+  }
+  return snapshot;
+}
+
+/**
+ * Returns the path of every non-directory entry beneath rootDir, depth first.
+ *
+ * @param {string} rootDir Directory to walk.
+ * @returns {string[]} Entry paths joined onto rootDir; empty when rootDir does not exist.
+ */
+function listFilesRecursive(rootDir) {
+  if (!existsSync(rootDir)) {
+    return [];
+  }
+  return readdirSync(rootDir).flatMap((name) => {
+    const entryPath = join(rootDir, name);
+    return statSync(entryPath).isDirectory() ? listFilesRecursive(entryPath) : [entryPath];
+  });
+}
+
+/**
+ * Decides whether an untracked file is left out of hashing and, if so, why.
+ *
+ * @param {string} filePath Path of the untracked file.
+ * @param {number} bytes Size of the file in bytes.
+ * @returns {string|null} An exclusion reason, or null when no rule matches.
+ */
+function shouldExcludeUntracked(filePath, bytes) {
+  const normalized = normalizePath(filePath);
+
+  // These two rules match only at the start of the path.
+  const rootOnlyRules = [
+    ['benchmark-runs/', 'generated_phase40_or_benchmark_output'],
+    ['outputs/', 'generated_output_path']
+  ];
+  for (const [prefix, reason] of rootOnlyRules) {
+    if (normalized.startsWith(prefix)) return reason;
+  }
+
+  // These directories are excluded at the start of the path or at any depth.
+  const anyDepthRules = [
+    ['node_modules/', '/node_modules/', 'dependency_cache'],
+    ['.pnpm-store/', '/.pnpm-store/', 'dependency_cache'],
+    ['.git/', '/.git/', 'git_internal'],
+    ['.playwright-mcp/', '/.playwright-mcp/', 'tool_cache']
+  ];
+  for (const [prefix, infix, reason] of anyDepthRules) {
+    if (normalized.startsWith(prefix) || normalized.includes(infix)) return reason;
+  }
+
+  // Files larger than 256 KiB are excluded.
+  return bytes > 256 * 1024 ? 'large_untracked_file' : null;
+}
+
+/**
+ * Extracts the paths from the lines of status text that start with "? ".
+ *
+ * @param {string} statusText Newline-separated status output.
+ * @returns {string[]} Trimmed, non-empty paths in their original order.
+ */
+function parseUntrackedFromStatus(statusText) {
+  const untracked = [];
+  for (const line of statusText.split('\n')) {
+    if (!line.startsWith('? ')) continue;
+    const filePath = line.slice(2).trim();
+    if (filePath) untracked.push(filePath);
+  }
+  return untracked;
+}
+
+/**
+ * Builds one snapshot entry per untracked file found in the status text.
+ *
+ * A file is hashed only when it still exists, is a regular file, and no exclusion rule
+ * applies; otherwise the entry records why it was excluded.
+ *
+ * @param {string} statusText Status output to parse for untracked paths.
+ * @param {string} repoRoot Directory the untracked paths are resolved against.
+ * @returns {object[]} Entries with path, bytes, mtimeMs, hash, disposition, exclusionReason.
+ */
+function captureUntrackedEntries(statusText, repoRoot) {
+  return parseUntrackedFromStatus(statusText).map((filePath) => {
+    const path = normalizePath(filePath);
+    const absolutePath = resolve(repoRoot, filePath);
+    const excluded = (bytes, mtimeMs, exclusionReason) => ({
+      path,
+      bytes,
+      mtimeMs,
+      hash: null,
+      disposition: 'excluded',
+      exclusionReason
+    });
+
+    if (!existsSync(absolutePath)) {
+      return excluded(null, null, 'missing_at_snapshot_time');
+    }
+    const stats = statSync(absolutePath);
+    if (!stats.isFile()) {
+      return excluded(stats.size, stats.mtimeMs, 'not_regular_file');
+    }
+    const exclusionReason = shouldExcludeUntracked(filePath, stats.size);
+    if (exclusionReason) {
+      return excluded(stats.size, stats.mtimeMs, exclusionReason);
+    }
+    return {
+      path,
+      bytes: stats.size,
+      mtimeMs: stats.mtimeMs,
+      hash: hashFile(absolutePath),
+      disposition: 'hashed',
+      exclusionReason: null
+    };
+  });
+}
+
+/**
+ * Describes each known package-manager lockfile that exists in the repository root.
+ *
+ * @param {string} repoRoot Directory searched for lockfiles.
+ * @param {string} sessionRoot Directory the reported paths are made relative to.
+ * @returns {object[]} One { path, hash, bytes } entry per lockfile that exists.
+ */
+function lockfileArtifacts(repoRoot, sessionRoot) {
+  const artifacts = [];
+  for (const name of ['pnpm-lock.yaml', 'package-lock.json', 'yarn.lock', 'bun.lockb']) {
+    const lockfilePath = resolve(repoRoot, name);
+    if (!existsSync(lockfilePath)) continue;
+    artifacts.push({
+      path: normalizePath(relative(sessionRoot, lockfilePath)),
+      hash: hashFile(lockfilePath),
+      bytes: statSync(lockfilePath).size
+    });
+  }
+  return artifacts;
+}
+
+/**
+ * Runs `git` with the given arguments through captureCommand() and attaches the stdout
+ * text, read back from the log file that was written.
+ *
+ * @param {string[]} args Arguments passed to git.
+ * @param {string} repoRoot Working directory for git.
+ * @param {string} logsDir Directory that receives the log files.
+ * @param {string} label Base name of the log files.
+ * @returns {object} The captureCommand() result plus a `stdout` string.
+ */
+function runGitCapture(args, repoRoot, logsDir, label) {
+  const capture = captureCommand('git', args, repoRoot, logsDir, label);
+  return {
+    ...capture,
+    stdout: readFileSync(capture.stdoutPath, 'utf8')
+  };
+}
+
+/**
+ * Creates one slot per (task, lane, repeat) combination for the broad-claim lane set.
+ *
+ * Slots of lanes whose setup evidence has a blocked status are created already terminal
+ * (`terminal_missing_evidence` / `setup_failed`); all other slots are `reserved`.
+ *
+ * @param {object} fixtures Validated fixture bundle.
+ * @returns {object[]} Slots ordered by task, then lane, then repeat index (starting at 1).
+ */
+function createReservations(fixtures) {
+  const { protocol } = fixtures;
+  // Repeat count: run policy first, then thresholds, then a default of 3.
+  const repeats =
+    protocol.runPolicy?.claimBearingRunsPerTaskLane ??
+    protocol.thresholds?.claimBearingRunsPerTaskLane ??
+    3;
+
+  const evidenceByLane = new Map();
+  for (const record of fixtures.laneSetupEvidence.records) {
+    evidenceByLane.set(record.laneId, record);
+  }
+
+  const reservations = [];
+  for (const { instance_id: taskId } of fixtures.manifest.tasks) {
+    for (const laneId of fixtures.lanes.broadClaimLaneSet) {
+      const evidence = evidenceByLane.get(laneId);
+      const blocked =
+        Boolean(evidence) && BLOCKED_LANE_SETUP_STATUSES.has(evidence.readinessStatus);
+      const outcome = blocked
+        ? {
+            status: 'terminal_missing_evidence',
+            terminalStatus: 'setup_failed',
+            reason: evidence.readinessStatus
+          }
+        : { status: 'reserved', terminalStatus: null, reason: null };
+      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
+        reservations.push({ laneId, taskId, repeatIndex, ...outcome });
+      }
+    }
+  }
+  return reservations;
+}
+
+/**
+ * Computes the artifact locations for a single run.
+ *
+ * @param {string} sessionRoot Session directory.
+ * @param {string} runId Run identifier; used as the directory name under `runs/`.
+ * @returns {object} The run directory, one path per per-run artifact, and the path of the
+ *   session-level run manifest.
+ */
+function buildRunPaths(sessionRoot, runId) {
+  const runDir = join(sessionRoot, 'runs', runId);
+  const inRunDir = (fileName) => join(runDir, fileName);
+  return {
+    runDir,
+    prompt: inRunDir('prompt.txt'),
+    laneCard: inRunDir('lane-card.json'),
+    setupIndex: inRunDir('setup-index.json'),
+    rawTrace: inRunDir('raw-trace.json'),
+    structuredAnswer: inRunDir('structured-answer.json'),
+    trajectory: inRunDir('trajectory.json'),
+    score: inRunDir('score.json'),
+    // The manifest is shared by all runs, so it lives in the session root.
+    manifest: join(sessionRoot, 'run-manifest.jsonl')
+  };
+}
+
+/**
+ * Computes where a lane's setup/index measurement artifact and its logs are stored.
+ *
+ * @param {string} sessionRoot Session directory.
+ * @param {string} laneId Lane identifier; used as the directory name under `setup-index/`.
+ * @returns {object} Root and logs directories, the artifact path, and the four log paths.
+ */
+function buildSetupIndexMeasurementPaths(sessionRoot, laneId) {
+  const root = join(sessionRoot, 'setup-index', laneId);
+  const logs = join(root, 'logs');
+  const logFile = (fileName) => join(logs, fileName);
+  return {
+    root,
+    logs,
+    artifact: join(root, 'setup-index.json'),
+    setupStdout: logFile('setup.stdout.log'),
+    setupStderr: logFile('setup.stderr.log'),
+    indexStdout: logFile('index.stdout.log'),
+    indexStderr: logFile('index.stderr.log')
+  };
+}
+
+/**
+ * Hashes every per-run artifact and adds the hash of the runner source.
+ *
+ * @param {object} paths Run paths as produced by buildRunPaths().
+ * @returns {object} One hash per artifact plus `runnerSourceHash`.
+ */
+function artifactHashesForPaths(paths) {
+  // [result key, key in `paths`]; the lane card is reported under a different name.
+  const hashedArtifacts = [
+    ['prompt', 'prompt'],
+    ['laneToolCard', 'laneCard'],
+    ['setupIndex', 'setupIndex'],
+    ['rawTrace', 'rawTrace'],
+    ['structuredAnswer', 'structuredAnswer'],
+    ['trajectory', 'trajectory'],
+    ['score', 'score']
+  ];
+  const hashes = {};
+  for (const [resultKey, pathKey] of hashedArtifacts) {
+    hashes[resultKey] = hashFile(paths[pathKey]);
+  }
+  hashes.runnerSourceHash = runnerSourceHash();
+  return hashes;
+}
+
+/**
+ * Hashes a file if it exists.
+ *
+ * @param {string} filePath File to hash.
+ * @returns {*} The hashFile() result, or null when the file does not exist.
+ */
+function optionalHashFile(filePath) {
+  if (!existsSync(filePath)) {
+    return null;
+  }
+  return hashFile(filePath);
+}
+
+/**
+ * Finds the first command entry of a given kind in an evidence record.
+ *
+ * @param {object|null|undefined} record Evidence record with an optional `commands` array.
+ * @param {string} kind Value to match against each command's `kind`.
+ * @returns {object|null} The first matching command, or null when there is none.
+ */
+function commandEvidenceForKind(record, kind) {
+  const commands = record?.commands ?? [];
+  for (const command of commands) {
+    if (command.kind === kind) return command;
+  }
+  return null;
+}
+
+/**
+ * Hashes the contents of a stdout log and a stderr log, joined by a newline.
+ *
+ * @param {string} stdoutPath Path of the stdout log.
+ * @param {string} stderrPath Path of the stderr log.
+ * @returns {*} The sha256() result for the combined text.
+ */
+function outputHashForLogs(stdoutPath, stderrPath) {
+  const stdoutText = readFileSync(stdoutPath, 'utf8');
+  const stderrText = readFileSync(stderrPath, 'utf8');
+  return sha256(`${stdoutText}\n${stderrText}`);
+}
+
+/**
+ * Resolves a measurement log path against the session root.
+ *
+ * @param {string} sessionRoot Session directory used for relative paths.
+ * @param {string|null|undefined} filePath Log path from a measurement.
+ * @returns {string|null} Absolute paths unchanged, relative paths joined onto sessionRoot,
+ *   or null when filePath is empty or missing.
+ */
+function normalizeMeasurementLogPath(sessionRoot, filePath) {
+  if (!filePath) {
+    return null;
+  }
+  if (isAbsolute(filePath)) {
+    return filePath;
+  }
+  return join(sessionRoot, filePath);
+}
+
+/**
+ * Checks a setup/index measurement for a lane and returns every problem found.
+ *
+ * @param {string} sessionRoot Session directory that the log paths must stay inside.
+ * @param {object} laneCard Lane tool card; only `laneId` is read.
+ * @param {*} measurement Parsed measurement to check.
+ * @returns {string[]} Problem descriptions; empty when the measurement is acceptable.
+ */
+function validateMeasuredSetupIndex(sessionRoot, laneCard, measurement) {
+  const errors = [];
+  const isObject = Boolean(measurement) && typeof measurement === 'object';
+  if (!isObject) errors.push('measurement must be an object');
+  if (measurement?.laneId !== laneCard.laneId) errors.push('measurement laneId mismatch');
+  if (measurement?.claimBearing !== false) errors.push('measurement must be non-claim-bearing');
+
+  const setupStatus = measurement?.setupStatus;
+  const indexStatus = measurement?.indexStatus;
+  const allowedSetupStatuses = new Set(['completed', 'not_required', 'setup_failed']);
+  const allowedIndexStatuses = new Set(['completed', 'not_required', 'index_failed']);
+  if (!allowedSetupStatuses.has(setupStatus)) errors.push('invalid setupStatus');
+  if (!allowedIndexStatuses.has(indexStatus)) errors.push('invalid indexStatus');
+
+  const durationChecks = [
+    ['setupDurationMs', setupStatus],
+    ['indexDurationMs', indexStatus]
+  ];
+  for (const [field, status] of durationChecks) {
+    const duration = measurement?.[field];
+    const isUsableNumber =
+      typeof duration === 'number' && Number.isFinite(duration) && duration >= 0;
+    if (!isUsableNumber) {
+      errors.push(`${field} must be a finite non-negative number`);
+    }
+    // A step reported as completed must have taken a measurable amount of time.
+    if (status === 'completed' && duration <= 0) {
+      errors.push(`${field} must be positive when completed`);
+    }
+  }
+
+  for (const field of ['setupLogPath', 'indexLogPath']) {
+    const logPath = normalizeMeasurementLogPath(sessionRoot, measurement?.[field]);
+    if (!logPath) {
+      errors.push(`${field} missing`);
+    } else if (!isPathInside(sessionRoot, logPath)) {
+      errors.push(`${field} must stay inside session root`);
+    } else if (!existsSync(logPath)) {
+      errors.push(`${field} missing artifact`);
+    }
+  }
+  return errors;
+}
+
+/**
+ * Projects a measurement onto the setup/index fields stored in a manifest row.
+ *
+ * @param {object} measurement Setup/index measurement.
+ * @returns {object} The eight setup/index fields copied from the measurement; fields the
+ *   measurement lacks are present with the value undefined.
+ */
+function rowSetupIndexFromMeasurement(measurement) {
+  const rowFields = [
+    'setupCommand',
+    'indexCommand',
+    'setupDurationMs',
+    'indexDurationMs',
+    'setupLogPath',
+    'indexLogPath',
+    'setupStatus',
+    'indexStatus'
+  ];
+  return Object.fromEntries(rowFields.map((field) => [field, measurement[field]]));
+}
+
+/**
+ * Loads and validates a lane's stored setup/index measurement, if one exists.
+ *
+ * @param {string} sessionRoot Session directory.
+ * @param {object} laneCard Lane tool card identifying the lane.
+ * @returns {object|null} The manifest-row projection of the measurement, or null when no
+ *   measurement artifact exists.
+ * @throws {Error} When the stored measurement fails validation.
+ */
+function readMeasuredSetupIndex(sessionRoot, laneCard) {
+  const { artifact } = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
+  if (!existsSync(artifact)) {
+    return null;
+  }
+  const measurement = readJson(artifact);
+  const errors = validateMeasuredSetupIndex(sessionRoot, laneCard, measurement);
+  if (errors.length > 0) {
+    throw new Error(`setup/index measurement invalid for ${laneCard.laneId}:\n- ${errors.join('\n- ')}`);
+  }
+  return rowSetupIndexFromMeasurement(measurement);
+}
+
+/**
+ * Produces a "not required" setup/index measurement and writes the placeholder log
+ * files it refers to.
+ *
+ * @param {string} sessionRoot Session directory that receives the log files.
+ * @param {object} laneCard Lane tool card; laneId, setupCommand and indexCommand are read.
+ * @returns {object} Non-claim-bearing measurement with zero durations and two command
+ *   entries marked as not executed.
+ */
+function defaultRawNativeSetupIndex(sessionRoot, laneCard) {
+  const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
+  mkdirSync(paths.logs, { recursive: true });
+
+  const placeholderLogs = [
+    [paths.setupStdout, 'raw-native setup not required\n'],
+    [paths.setupStderr, ''],
+    [paths.indexStdout, 'raw-native index not required\n'],
+    [paths.indexStderr, '']
+  ];
+  for (const [logPath, text] of placeholderLogs) {
+    writeTextArtifact(logPath, text);
+  }
+
+  // Describes a command that was deliberately not run; its hash covers the placeholder logs.
+  const skippedCommand = (kind, command, stdoutLogPath, stderrLogPath) => ({
+    kind,
+    command,
+    executed: false,
+    exitCode: 0,
+    durationMs: 0,
+    stdoutLogPath,
+    stderrLogPath,
+    outputHash: outputHashForLogs(stdoutLogPath, stderrLogPath)
+  });
+
+  return {
+    laneId: laneCard.laneId,
+    claimBearing: false,
+    measuredAt: new Date().toISOString(),
+    measurementMode: 'not_required',
+    setupCommand: laneCard.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0,
+    indexDurationMs: 0,
+    setupLogPath: paths.setupStdout,
+    indexLogPath: paths.indexStdout,
+    setupStatus: 'not_required',
+    indexStatus: 'not_required',
+    commands: [
+      skippedCommand('setup', laneCard.setupCommand, paths.setupStdout, paths.setupStderr),
+      skippedCommand('index', laneCard.indexCommand, paths.indexStdout, paths.indexStderr)
+    ]
+  };
+}
+
+/**
+ * Reads lane telemetry overrides from the CONTEXTBENCH_LANE_TELEMETRY_JSON variable.
+ *
+ * @returns {object} The parsed object; an empty object when the variable is unset or
+ *   empty, or when the JSON value is not a plain (non-array) object.
+ * @throws {Error} When the variable holds text that is not valid JSON.
+ */
+function laneTelemetryOverrides() {
+  const raw = process.env.CONTEXTBENCH_LANE_TELEMETRY_JSON;
+  if (!raw) {
+    return {};
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    throw new Error('CONTEXTBENCH_LANE_TELEMETRY_JSON must be a JSON object');
+  }
+  const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
+  return isPlainObject ? parsed : {};
+}
+
+/**
+ * Summarizes whether the observed tool usage for a lane matches its tool card, based on
+ * the telemetry override for that lane (if any).
+ *
+ * Isolation counts as proven only when a proof source is present, the observed tools are
+ * the expected ones, and no violation was recorded.
+ *
+ * @param {object} laneCard Lane tool card (laneId, allowedTools, disallowedTools, contextTools).
+ * @returns {object} Evidence record with the proven flag, proof source, tool lists, and
+ *   one `unexpected_tool_<name>` violation per offending observation.
+ */
+function buildLaneIsolationEvidence(laneCard) {
+  const overrides = laneTelemetryOverrides();
+  const telemetry = overrides[laneCard.laneId];
+
+  let observedTools = [];
+  if (Array.isArray(telemetry?.observedTools)) {
+    observedTools = telemetry.observedTools.filter((tool) => typeof tool === 'string');
+  }
+  const disallowedObserved = observedTools.filter((tool) =>
+    laneCard.disallowedTools.includes(tool)
+  );
+  const unknownObserved = observedTools.filter((tool) => !laneCard.allowedTools.includes(tool));
+  const violations = [...disallowedObserved, ...unknownObserved].map(
+    (tool) => `unexpected_tool_${tool}`
+  );
+
+  const expectedContextTool = laneCard.contextTools[0] ?? laneCard.laneId;
+  let expectedObserved;
+  if (laneCard.laneId === 'raw-native') {
+    // raw-native may use several tools, as long as each one is allowed.
+    expectedObserved = observedTools.length > 0 && unknownObserved.length === 0;
+  } else {
+    // Every other lane must show exactly its single context tool.
+    expectedObserved = observedTools.length === 1 && observedTools[0] === expectedContextTool;
+  }
+
+  const declaredProofSource = telemetry?.proofSource;
+  const proven = Boolean(declaredProofSource) && expectedObserved && violations.length === 0;
+  return {
+    laneId: laneCard.laneId,
+    proven,
+    sourceKind: declaredProofSource ? 'env_override' : 'not_captured',
+    proofSource: typeof declaredProofSource === 'string' ? declaredProofSource : 'not_captured',
+    expectedContextTool,
+    allowedTools: laneCard.allowedTools,
+    disallowedTools: laneCard.disallowedTools,
+    observedTools,
+    violations
+  };
+}
+
+/**
+ * Scores one attempt with the official evaluator, or returns diagnostic fallback scoring
+ * when the evaluator is not run.
+ *
+ * The evaluator is skipped for the fake executor, for attempts that did not complete, and
+ * for overridden executors when CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND is not set.
+ * Otherwise the gold input is written, the evaluator is spawned in the run directory, and
+ * its stdout/stderr are stored next to the other run artifacts.
+ *
+ * @param {object} fixtures Validated fixture bundle.
+ * @param {object} paths Run paths as produced by buildRunPaths().
+ * @param {object} task Task entry being scored.
+ * @param {string} executor Executor identifier of the attempt.
+ * @param {string} status Status of the attempt.
+ * @returns {object} Scoring record: `official_evaluator` mode when the evaluator exited
+ *   with 0 and its output passed validation, `diagnostic_fallback` mode otherwise.
+ */
+function runOfficialEvaluatorForAttempt(fixtures, paths, task, executor, status) {
+  const fallback = (reason) => ({
+    status,
+    mode: 'diagnostic_fallback',
+    ...diagnosticFallbackScoring(fixtures, reason)
+  });
+  if (executor === 'fake') {
+    return fallback('fake_executor_smoke_only');
+  }
+  if (status !== 'completed') {
+    return fallback('agent_attempt_not_completed');
+  }
+  if (executorCommandIsOverridden(executor) && !process.env.CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND) {
+    return fallback('overridden_executor_smoke_no_official_evaluator');
+  }
+
+  const inRunDir = (fileName) => join(paths.runDir, fileName);
+  const officialGoldPath = inRunDir('official-gold-input.json');
+  const officialOutputPath = inRunDir('official-results.jsonl');
+  const stdoutPath = inRunDir('official-evaluator.stdout.log');
+  const stderrPath = inRunDir('official-evaluator.stderr.log');
+  writeJson(officialGoldPath, {
+    instance_id: task.instance_id,
+    gold_context_ref: task.gold_context_ref,
+    gold_context_hash: task.gold_context_hash,
+    hash_canonicalization_version: task.hash_canonicalization_version
+  });
+
+  const evaluator = officialEvaluatorCommandParts();
+  const evaluatorArgs = evaluator.prefixArgs.concat([
+    '--gold',
+    officialGoldPath,
+    '--pred',
+    paths.trajectory,
+    '--out',
+    officialOutputPath
+  ]);
+  const result = spawnSync(evaluator.command, evaluatorArgs, {
+    encoding: 'utf8',
+    cwd: paths.runDir,
+    timeout: fixtures.protocol.budgets.defaults.timeoutSeconds * 1000
+  });
+  writeTextArtifact(stdoutPath, result.stdout ?? '');
+  writeTextArtifact(stderrPath, result.stderr ?? '');
+
+  const exitStatus = typeof result.status === 'number' ? result.status : null;
+  const command = `${evaluator.command} ${evaluatorArgs.join(' ')}`;
+  const outputValidation = validateOfficialEvaluatorOutputEnvelope(officialOutputPath, task);
+
+  // Fields shared by the success and failure records, kept in their reporting order.
+  const invocation = {
+    officialEvaluatorFirst: true,
+    officialEvaluatorAttempted: true,
+    officialEvaluatorInvoked: true,
+    command
+  };
+  const artifactPaths = { stdoutPath, stderrPath, outputPath: officialOutputPath };
+
+  if (exitStatus === 0 && outputValidation.valid) {
+    return {
+      status: 'completed',
+      mode: 'official_evaluator',
+      ...invocation,
+      claimBearing: fixtures.protocol.claimAllowed === true,
+      ...artifactPaths,
+      outputHash: hashFile(officialOutputPath),
+      stdoutHash: hashFile(stdoutPath),
+      stderrHash: hashFile(stderrPath),
+      exitCode: exitStatus,
+      exitStatus
+    };
+  }
+  return {
+    status: 'judge_failed',
+    mode: 'diagnostic_fallback',
+    ...invocation,
+    claimBearing: false,
+    fallbackReason: outputValidation.reason ?? 'official_evaluator_failed',
+    ...artifactPaths,
+    // The output file may not exist when the evaluator failed.
+    outputHash: optionalHashFile(officialOutputPath),
+    stdoutHash: hashFile(stdoutPath),
+    stderrHash: hashFile(stderrPath),
+    exitCode: exitStatus,
+    exitStatus,
+    spawnError: result.error?.message ?? null
+  };
+}
+
+/**
+ * Checks that an evaluator output file is non-empty JSONL made of objects that do not
+ * name a different task.
+ *
+ * @param {string} outputPath Path of the evaluator output file.
+ * @param {object} task Task entry; instance_id and original_inst_id are the accepted ids.
+ * @returns {{valid: boolean, reason: (string|null)}} `valid: true` with a null reason, or
+ *   `valid: false` with the reason for the first problem found.
+ */
+function validateOfficialEvaluatorOutputEnvelope(outputPath, task) {
+  const invalid = (reason) => ({ valid: false, reason });
+  if (!existsSync(outputPath)) return invalid('official_evaluator_missing_output');
+
+  const content = readFileSync(outputPath, 'utf8');
+  if (!content.trim()) return invalid('official_evaluator_empty_output');
+
+  const expectedTaskIds = new Set([task.instance_id, task.original_inst_id].filter(Boolean));
+  for (const line of content.split(/\r?\n/)) {
+    // Blank lines are not records.
+    if (line.trim().length === 0) continue;
+
+    let parsed;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      return invalid('official_evaluator_malformed_jsonl');
+    }
+    const isRecord = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
+    if (!isRecord) return invalid('official_evaluator_non_object_jsonl');
+
+    // A record is only rejected when it declares a string id that is not an expected one.
+    const declaredTaskId = parsed.instance_id ?? parsed.task_id ?? parsed.taskId ?? parsed.id;
+    const namesOtherTask =
+      typeof declaredTaskId === 'string' &&
+      expectedTaskIds.size > 0 &&
+      !expectedTaskIds.has(declaredTaskId);
+    if (namesOtherTask) return invalid('official_evaluator_task_mismatch');
+  }
+  return { valid: true, reason: null };
+}
+
+/**
+ * Appends one row, serialized as a single JSON line, to the session's run manifest.
+ *
+ * @param {string} sessionRoot Session directory containing `run-manifest.jsonl`.
+ * @param {object} row Manifest row to append.
+ * @returns {void}
+ */
+function appendRunManifestRow(sessionRoot, row) {
+  const manifestPath = join(sessionRoot, 'run-manifest.jsonl');
+  const serializedRow = `${JSON.stringify(row)}\n`;
+  appendFileSync(manifestPath, serializedRow, 'utf8');
+}
+
+/**
+ * Assembles the run-manifest row for a run whose artifacts are already on disk.
+ *
+ * @param {object} params Row inputs: runId, fixtures, laneCard, task, repeatIndex, status,
+ *   startedAt, completedAt, paths, setupIndex, model, executor, scoring.
+ * @returns {object} Manifest row with identifiers, artifact paths, execution budget and
+ *   timing, scoring, and the hashes of the artifacts found at `paths`.
+ */
+function buildManifestRowForArtifacts(params) {
+  const { fixtures, paths, startedAt, completedAt } = params;
+  const budgetDefaults = fixtures.protocol.budgets.defaults;
+  const taskWallTimeMs = new Date(completedAt).getTime() - new Date(startedAt).getTime();
+  return {
+    run_id: params.runId,
+    protocol_version: fixtures.protocol.protocolVersion,
+    protocol_hash: hashObject(fixtures.protocol),
+    task_manifest_hash: fixtures.manifest.manifest_hash,
+    lane_id: params.laneCard.laneId,
+    task_id: params.task.instance_id,
+    repeat_index: params.repeatIndex,
+    status: params.status,
+    started_at: startedAt,
+    completed_at: completedAt,
+    raw_trace_path: paths.rawTrace,
+    structured_answer_path: paths.structuredAnswer,
+    trajectory_path: paths.trajectory,
+    score_path: paths.score,
+    setup_index_path: paths.setupIndex,
+    prompt_path: paths.prompt,
+    lane_tool_card_path: paths.laneCard,
+    setupIndex: params.setupIndex,
+    taskExecution: {
+      model: params.model,
+      timeoutSeconds: budgetDefaults.timeoutSeconds,
+      maxContextTokens: budgetDefaults.maxContextTokens,
+      maxAnswerTokens: budgetDefaults.maxAnswerTokens,
+      startedAt,
+      completedAt,
+      taskWallTimeMs,
+      executor: params.executor
+    },
+    scoring: params.scoring,
+    hashes: artifactHashesForPaths(paths)
+  };
+}
+
+/**
+ * Writes placeholder artifacts and a `setup_failed` manifest row for every reservation
+ * whose status is `terminal_missing_evidence`.
+ *
+ * Reservations that refer to an unknown lane card, task, or evidence record are skipped.
+ *
+ * @param {string} sessionRoot Session directory that receives the run artifacts.
+ * @param {object} fixtures Validated fixture bundle.
+ * @param {object[]} reservations Slots as produced by createReservations().
+ * @returns {void}
+ */
+function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
+  const cardsByLane = new Map();
+  for (const card of fixtures.laneToolCards.cards) cardsByLane.set(card.laneId, card);
+  const evidenceByLane = new Map();
+  for (const record of fixtures.laneSetupEvidence.records) {
+    evidenceByLane.set(record.laneId, record);
+  }
+  const tasksById = new Map();
+  for (const task of fixtures.manifest.tasks) tasksById.set(task.instance_id, task);
+
+  const blockedSlots = reservations.filter((slot) => slot.status === 'terminal_missing_evidence');
+  for (const reservation of blockedSlots) {
+    const laneCard = cardsByLane.get(reservation.laneId);
+    const task = tasksById.get(reservation.taskId);
+    const evidence = evidenceByLane.get(reservation.laneId);
+    if (!(laneCard && task && evidence)) continue;
+
+    const runId = sanitize(
+      `${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence`
+    );
+    const paths = buildRunPaths(sessionRoot, runId);
+    // Nothing was executed, so the run starts and completes at the same instant.
+    const startedAt = new Date().toISOString();
+    const completedAt = startedAt;
+    // Without a log reference, the setup-index artifact itself is used as the log path.
+    const logPath = evidence.logReference ?? paths.setupIndex;
+    const setupIndex = {
+      setupCommand: laneCard.setupCommand,
+      indexCommand: laneCard.indexCommand,
+      setupDurationMs: evidence.setupDurationMs ?? 0,
+      indexDurationMs: evidence.indexDurationMs ?? 0,
+      setupLogPath: logPath,
+      indexLogPath: logPath,
+      setupStatus: 'setup_failed',
+      indexStatus: evidence.readinessStatus === 'index_failed' ? 'index_failed' : 'not_required'
+    };
+
+    writeTextArtifact(
+      paths.prompt,
+      `Terminal missing evidence for ${task.instance_id} in ${laneCard.laneId}; no agent task prompt executed.`
+    );
+    writeJson(paths.laneCard, laneCard);
+    writeJson(paths.setupIndex, { ...setupIndex, evidence });
+    writeJson(paths.rawTrace, {
+      executor: 'none',
+      runnerHash: runnerSourceHash(),
+      claimBearing: false,
+      status: 'setup_failed',
+      laneReadinessStatus: evidence.readinessStatus,
+      reason: reservation.reason,
+      laneIsolation: buildLaneIsolationEvidence(laneCard),
+      scriptedAgentDecisions: false
+    });
+    writeJson(paths.structuredAnswer, {
+      status: 'not_attempted_missing_evidence',
+      claimBearing: false
+    });
+    writeJson(paths.trajectory, { status: 'not_attempted_missing_evidence', pred_files: [] });
+    writeJson(paths.score, {
+      status: 'setup_failed',
+      mode: 'missing_evidence',
+      claimBearing: false,
+      reason: reservation.reason
+    });
+
+    // The row is built only after every artifact exists, because it hashes them.
+    const scoring = diagnosticFallbackScoring(
+      fixtures,
+      `terminal_missing_evidence:${reservation.reason}`
+    );
+    const row = buildManifestRowForArtifacts({
+      runId,
+      fixtures,
+      laneCard,
+      task,
+      repeatIndex: reservation.repeatIndex,
+      status: 'setup_failed',
+      startedAt,
+      completedAt,
+      paths,
+      setupIndex,
+      executor: 'fake',
+      model: 'not-run-missing-evidence',
+      scoring
+    });
+    appendRunManifestRow(sessionRoot, row);
+  }
+}
+
+/**
+ * Hashes a session with its `sessionHash` field blanked, so the stored hash never
+ * depends on its own value.
+ *
+ * @param {object} session Session record.
+ * @returns {*} The hashObject() result for the blanked copy.
+ */
+function computeSessionHash(session) {
+  const hashable = { ...session };
+  hashable.sessionHash = '';
+  return hashObject(hashable);
+}
+
+/**
+ * Stamps a copy of the session with a fresh `updatedAt` and `sessionHash`, then writes
+ * it to BASELINE-SESSION.json in the session root.
+ *
+ * @param {string} sessionRoot Session directory.
+ * @param {object} session Session record; it is not modified.
+ * @returns {object} The stamped copy that was written.
+ */
+function writeSession(sessionRoot, session) {
+  const stamped = { ...session };
+  stamped.updatedAt = new Date().toISOString();
+  // The hash must be computed after updatedAt is set, because it covers that field.
+  stamped.sessionHash = computeSessionHash(stamped);
+  const sessionFile = join(sessionRoot, 'BASELINE-SESSION.json');
+  writeJson(sessionFile, stamped);
+  return stamped;
+}
+
+/**
+ * Reads BASELINE-SESSION.json from a session root.
+ *
+ * @param {string} sessionRoot Session directory.
+ * @returns {*} Whatever readJson() returns for the session file.
+ */
+function readSession(sessionRoot) {
+  const sessionFile = join(sessionRoot, 'BASELINE-SESSION.json');
+  return readJson(sessionFile);
+}
+
+/**
+ * Builds the artifact index for a session: one entry per file under the session root,
+ * excluding BASELINE-SESSION.json, sorted by entry path.
+ *
+ * @param {string} sessionRoot Session directory to index.
+ * @returns {object[]} artifactEntry() results ordered by `path`.
+ */
+function refreshArtifactIndex(sessionRoot) {
+  const index = [];
+  for (const filePath of listFilesRecursive(sessionRoot)) {
+    if (filePath.endsWith('BASELINE-SESSION.json')) continue;
+    index.push(artifactEntry(filePath, sessionRoot));
+  }
+  index.sort((left, right) => left.path.localeCompare(right.path));
+  return index;
+}
+
+/**
+ * Creates a baseline session under `args.out`: captures the current git
+ * state of the working directory, writes slot reservations and blocked run
+ * rows, and persists BASELINE-SESSION.json with a hashed snapshot.
+ *
+ * @param {object} args - Parsed CLI arguments; `args.out` is the session root.
+ * @throws {Error} When `args.out` is missing.
+ */
+function createBaselineSnapshot(args) {
+  if (!args.out) throw new Error('--baseline-snapshot requires --out <session-root>');
+  const fixtures = validateFixtures();
+  const repoRoot = process.cwd();
+  const sessionRoot = ensureBaselineSessionRoot(args.out);
+  const phase = baselineSessionPhase(sessionRoot);
+  // The session id is the last path segment of the session root (either separator).
+  const sessionId = sessionRoot.split(/[\\/]/).filter(Boolean).at(-1) ?? 'phase40-session';
+  const snapshotDir = join(sessionRoot, 'snapshot');
+  const gitDir = join(snapshotDir, 'git');
+  const logsDir = join(snapshotDir, 'commands');
+  mkdirSync(gitDir, { recursive: true });
+  mkdirSync(logsDir, { recursive: true });
+
+  // Capture working-tree state: status, unstaged diff, staged diff, and diff stat.
+  const status = runGitCapture(
+    ['status', '--porcelain=v2', '--branch', '--untracked-files=all'],
+    repoRoot,
+    logsDir,
+    'git-status'
+  );
+  const trackedDiff = runGitCapture(['diff', '--no-ext-diff'], repoRoot, logsDir, 'git-diff');
+  const stagedDiff = runGitCapture(
+    ['diff', '--cached', '--no-ext-diff'],
+    repoRoot,
+    logsDir,
+    'git-diff-staged'
+  );
+  const diffStat = runGitCapture(['diff', '--stat'], repoRoot, logsDir, 'git-diff-stat');
+  const statusPath = join(gitDir, 'status-porcelain-v2.txt');
+  const trackedDiffPath = join(gitDir, 'tracked.diff');
+  const stagedDiffPath = join(gitDir, 'staged.diff');
+  const diffStatPath = join(gitDir, 'diff-stat.txt');
+  writeTextArtifact(statusPath, status.stdout);
+  writeTextArtifact(trackedDiffPath, trackedDiff.stdout);
+  writeTextArtifact(stagedDiffPath, stagedDiff.stdout);
+  writeTextArtifact(diffStatPath, diffStat.stdout);
+
+  const reservations = createReservations(fixtures);
+  const reservationsPath = join(sessionRoot, 'slot-reservations.json');
+  writeJson(reservationsPath, { claimBearing: false, reservations });
+  writeBlockedRunRows(sessionRoot, fixtures, reservations);
+
+  // Built without the hash first so `snapshotHash` covers exactly these fields.
+  const snapshotWithoutHash = {
+    branch: safeExec('git', ['rev-parse', '--abbrev-ref', 'HEAD']),
+    head: safeExec('git', ['rev-parse', 'HEAD']),
+    // Divergence from main is recorded as unavailable rather than computed.
+    divergence: {
+      status: 'unavailable',
+      reason:
+        'Phase 40 plan records main as unavailable locally; divergence is captured as unavailable instead of inferred.'
+    },
+    // Artifact paths are stored relative to the session root.
+    gitStatusPath: normalizePath(relative(sessionRoot, statusPath)),
+    trackedDiffPath: normalizePath(relative(sessionRoot, trackedDiffPath)),
+    stagedDiffPath: normalizePath(relative(sessionRoot, stagedDiffPath)),
+    diffStatPath: normalizePath(relative(sessionRoot, diffStatPath)),
+    untracked: captureUntrackedEntries(status.stdout, repoRoot),
+    lockfiles: lockfileArtifacts(repoRoot, sessionRoot),
+    redactedEnvVarNames: redactedEnvVarNames(),
+    versions: versionSnapshot(),
+    fixtureHashes: fixtureHashes(),
+    commandTranscript: [status, trackedDiff, stagedDiff, diffStat].map((entry) => ({
+      command: entry.command,
+      cwd: entry.cwd,
+      exitCode: entry.exitCode,
+      stdoutPath: normalizePath(relative(sessionRoot, entry.stdoutPath)),
+      stderrPath: normalizePath(relative(sessionRoot, entry.stderrPath)),
+      outputHash: entry.outputHash
+    }))
+  };
+  const snapshot = { ...snapshotWithoutHash, snapshotHash: hashObject(snapshotWithoutHash) };
+  let session = {
+    sessionId,
+    phase,
+    createdAt: new Date().toISOString(),
+    updatedAt: new Date().toISOString(),
+    sessionRoot: normalizePath(sessionRoot),
+    claimBearing: false,
+    sealed: false,
+    snapshot,
+    reservationsPath: normalizePath(relative(sessionRoot, reservationsPath)),
+    runManifestPath: 'run-manifest.jsonl',
+    artifactIndex: [],
+    sessionHash: ''
+  };
+  // Index is built after all artifacts above are on disk; writeSession fills in the hash.
+  session.artifactIndex = refreshArtifactIndex(sessionRoot);
+  session = writeSession(sessionRoot, session);
+  console.log(`baseline snapshot wrote ${join(sessionRoot, 'BASELINE-SESSION.json')}`);
+}
+
+/**
+ * Validates a baseline-arms fixture file and logs success.
+ *
+ * Collects every violation (fixture-level first, then per arm in file order)
+ * and throws a single error listing all of them.
+ *
+ * @param {string} filePath - Path of the fixture JSON to validate.
+ * @throws {Error} When `filePath` is missing or any rule is violated.
+ */
+function validateBaselineArms(filePath) {
+  if (!filePath) throw new Error('--baseline-validate-arms requires a fixture path');
+  const fixture = readJson(filePath);
+  const problems = [];
+
+  if (fixture.phase !== 40) {
+    problems.push('baseline arms fixture must be Phase 40 metadata');
+  }
+  if (fixture.claimBearing !== false) {
+    problems.push('baseline arms must be non-claim-bearing');
+  }
+  const denominatorPolicy = String(fixture.denominatorPolicy ?? '');
+  if (!denominatorPolicy.includes('separate')) {
+    problems.push('baseline arms must stay separate from required competitor denominators');
+  }
+
+  const seenArmIds = new Set();
+  for (const arm of fixture.arms ?? []) {
+    const armId = arm.baselineArmId;
+    if (!armId || seenArmIds.has(armId)) {
+      problems.push(`invalid duplicate baseline arm ${armId}`);
+    }
+    seenArmIds.add(armId);
+
+    if (arm.laneId !== 'codebase-context') {
+      problems.push(`arm ${armId} must stay under codebase-context`);
+    }
+    if (arm.claimBearing !== false) {
+      problems.push(`arm ${armId} must be non-claim-bearing`);
+    }
+    const hasToolSurfaces =
+      Array.isArray(arm.allowedToolSurfaces) && arm.allowedToolSurfaces.length > 0;
+    if (!hasToolSurfaces) {
+      problems.push(`arm ${armId} needs existing tool surfaces`);
+    }
+    if (arm.failurePolicy !== 'record_terminal_diagnostic_failure') {
+      problems.push(`arm ${armId} must record failures instead of patching products`);
+    }
+  }
+
+  if (problems.length > 0) {
+    throw new Error(`baseline arm validation failed:\n- ${problems.join('\n- ')}`);
+  }
+  console.log('baseline arm validation passed');
+}
+
+/**
+ * Writes setup/index measurement artifacts into an existing baseline session.
+ *
+ * Only the `raw-native` lane is measured here. With `args.allReadyLanes`
+ * every other lane is skipped; without it, requesting any other lane throws.
+ * The session's artifact index is refreshed and re-saved afterwards.
+ *
+ * @param {object} args - Parsed CLI arguments (`session`, `lane`, `allReadyLanes`).
+ * @throws {Error} When the session is missing, a lane is unknown, or a
+ *   single non-raw-native lane is requested.
+ */
+function runSetupIndexMeasure(args) {
+  if (!args.session) throw new Error('--setup-index-measure requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  const sessionFile = join(sessionRoot, 'BASELINE-SESSION.json');
+  if (!existsSync(sessionFile)) throw new Error('baseline session snapshot missing');
+
+  const fixtures = validateFixtures();
+  const cardsByLane = new Map();
+  for (const card of fixtures.laneToolCards.cards) {
+    cardsByLane.set(card.laneId, card);
+  }
+  const requestedLanes = args.allReadyLanes ? fixtures.lanes.broadClaimLaneSet : [args.lane];
+
+  let measuredCount = 0;
+  for (const laneId of requestedLanes) {
+    const laneCard = cardsByLane.get(laneId);
+    if (!laneCard) throw new Error(`unknown lane: ${laneId}`);
+
+    const isRawNative = laneCard.laneId === 'raw-native';
+    if (!isRawNative && args.allReadyLanes) continue;
+    if (!isRawNative) {
+      throw new Error(
+        `setup/index measurement for ${laneCard.laneId} requires --setup-index-import until safe isolated command execution is implemented`
+      );
+    }
+
+    const { artifact } = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
+    writeJson(artifact, defaultRawNativeSetupIndex(sessionRoot, laneCard));
+    measuredCount += 1;
+  }
+
+  const session = readSession(sessionRoot);
+  session.artifactIndex = refreshArtifactIndex(sessionRoot);
+  writeSession(sessionRoot, session);
+  console.log(`setup/index measurement wrote ${measuredCount} lane artifact(s)`);
+}
+
+/**
+ * Imports an externally produced setup/index measurement for one lane into
+ * an existing baseline session.
+ *
+ * The imported JSON must name the same lane and be non-claim-bearing. It is
+ * completed with the lane card's commands where absent, its log paths are
+ * normalized, and it is validated before being written as the lane's
+ * measurement artifact. The session's artifact index is then refreshed.
+ *
+ * @param {object} args - Parsed CLI arguments (`session`, `lane`, `input`).
+ * @throws {Error} On missing arguments, missing session, unknown lane,
+ *   lane mismatch, claim-bearing input, or validation errors.
+ */
+function runSetupIndexImport(args) {
+  const hasAllArgs = Boolean(args.session && args.lane && args.input);
+  if (!hasAllArgs) {
+    throw new Error('--setup-index-import requires --session, --lane, and --input');
+  }
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json'))) {
+    throw new Error('baseline session snapshot missing');
+  }
+
+  const fixtures = validateFixtures();
+  const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === args.lane);
+  if (!laneCard) throw new Error(`unknown lane: ${args.lane}`);
+
+  // Relative input paths are resolved against the current working directory.
+  const inputPath = isAbsolute(args.input) ? args.input : resolve(process.cwd(), args.input);
+  const imported = readJson(inputPath);
+  if (imported.laneId !== laneCard.laneId) {
+    throw new Error(`setup/index import laneId mismatch: expected ${laneCard.laneId}`);
+  }
+  if (imported.claimBearing !== false) {
+    throw new Error('setup/index import must be non-claim-bearing');
+  }
+
+  const measurement = {
+    ...imported,
+    setupCommand: imported.setupCommand ?? laneCard.setupCommand,
+    indexCommand: imported.indexCommand ?? laneCard.indexCommand,
+    setupLogPath: normalizeMeasurementLogPath(sessionRoot, imported.setupLogPath),
+    indexLogPath: normalizeMeasurementLogPath(sessionRoot, imported.indexLogPath),
+    importedFrom: normalizePath(inputPath),
+    importedAt: new Date().toISOString()
+  };
+  const problems = validateMeasuredSetupIndex(sessionRoot, laneCard, measurement);
+  if (problems.length > 0) {
+    throw new Error(
+      `setup/index import invalid for ${laneCard.laneId}:\n- ${problems.join('\n- ')}`
+    );
+  }
+
+  const measurementPaths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
+  writeJson(measurementPaths.artifact, measurement);
+
+  const session = readSession(sessionRoot);
+  session.artifactIndex = refreshArtifactIndex(sessionRoot);
+  writeSession(sessionRoot, session);
+  console.log(`setup/index import wrote ${measurementPaths.artifact}`);
+}
+
+/**
+ * Builds the newline-joined solver prompt for one task in one lane.
+ *
+ * @param {object} task - Task record (`instance_id`, `repo_url`, `base_commit`,
+ *   `problem_statement_hash`, `problem_statement_ref`).
+ * @param {object} laneCard - Lane card (`laneId`, `allowedTools`, `disallowedTools`).
+ * @param {object|null} [taskContext] - Optional materialized context. A
+ *   `repoCheckoutPath` adds a "Local checkout" line after the base commit; a
+ *   `problemStatement` switches the reference line to the hash form and
+ *   appends the statement text at the end.
+ * @returns {string} The prompt text.
+ */
+function makePrompt(task, laneCard, taskContext = null) {
+  const checkoutPath = taskContext?.repoCheckoutPath;
+  const statement = taskContext?.problemStatement;
+
+  const prompt = [
+    `ContextBench task: ${task.instance_id}`,
+    `Repository: ${task.repo_url}`,
+    `Base commit: ${task.base_commit}`
+  ];
+  if (checkoutPath) {
+    prompt.push(`Local checkout: ${checkoutPath}`);
+  }
+  if (statement) {
+    prompt.push(`Problem statement hash: ${task.problem_statement_hash}`);
+  } else {
+    prompt.push(`Problem statement reference: ${task.problem_statement_ref}`);
+  }
+  prompt.push(
+    `Gold context reference is hidden from the solver; do not infer from fixture answers.`,
+    `Lane: ${laneCard.laneId}`,
+    `Allowed context tools: ${laneCard.allowedTools.join(', ')}`,
+    `Disallowed context tools: ${laneCard.disallowedTools.join(', ')}`,
+    'Return only JSON with fields: answer, confidence, evidence, filesReferenced, symbolsReferenced, unsupportedClaims, readyToEdit.',
+    'Do not use tools outside the lane card. Do not fabricate files or line spans.'
+  );
+  if (statement) {
+    prompt.push('', 'Problem statement:', statement);
+  }
+  return prompt.join('\n');
+}
+
+/**
+ * Parses raw executor output as a structured answer.
+ *
+ * @param {*} stdout - Raw output; null/undefined are treated as empty.
+ * @returns {{answer: object|null, errors: string[]}} `missing_json` for blank
+ *   input, `invalid_json` when parsing throws, otherwise the result of
+ *   `validateStructuredAnswerObject`.
+ */
+function parseAnswerForBaseline(stdout) {
+  const text = String(stdout ?? '').trim();
+  if (text.length === 0) {
+    return { answer: null, errors: ['missing_json'] };
+  }
+  try {
+    return validateStructuredAnswerObject(JSON.parse(text));
+  } catch {
+    return { answer: null, errors: ['invalid_json'] };
+  }
+}
+
+/**
+ * Scans combined Claude CLI stdout/stderr (case-insensitively) for known
+ * failure phrases.
+ *
+ * @returns {string|null} `'claude_rate_limit'`, `'claude_auth_required'`, or
+ *   `null` when no phrase matches. Rate-limit phrases are checked first.
+ */
+function classifyClaudeCliDiagnostic(stdout, stderr) {
+  const haystack = `${stdout ?? ''}\n${stderr ?? ''}`.toLowerCase();
+  const mentions = (needle) => haystack.includes(needle);
+  if (["you've hit your limit", 'rate limit'].some(mentions)) {
+    return 'claude_rate_limit';
+  }
+  if (['not authenticated', 'please run', 'login'].some(mentions)) {
+    return 'claude_auth_required';
+  }
+  return null;
+}
+
+/**
+ * Interprets Claude CLI JSON output as a structured answer.
+ *
+ * Blank or unparseable stdout yields `missing_json` / `invalid_json`, joined
+ * by a CLI diagnostic (and `toolError: true`) when one is detected. JSON that
+ * is not a `type: 'result'` envelope is validated directly as an answer.
+ * For a result envelope, in order: `is_error` is reported as a tool error,
+ * the `error_max_structured_output_retries` subtype is reported as-is, then
+ * `structured_output` is validated, then a string `result` is parsed.
+ *
+ * @returns {{answer: object|null, errors: string[], toolError: boolean}}
+ */
+function parseClaudeAnswerForBaseline(stdout, stderr) {
+  const body = String(stdout ?? '').trim();
+  const cliDiagnostic = classifyClaudeCliDiagnostic(stdout, stderr);
+  const unparseable = (code) => ({
+    answer: null,
+    errors: cliDiagnostic ? [code, cliDiagnostic] : [code],
+    toolError: cliDiagnostic !== null
+  });
+  if (!body) return unparseable('missing_json');
+
+  let envelope;
+  try {
+    envelope = JSON.parse(body);
+  } catch {
+    return unparseable('invalid_json');
+  }
+
+  const isResultEnvelope = isRecord(envelope) && envelope.type === 'result';
+  if (!isResultEnvelope) {
+    return { ...validateStructuredAnswerObject(envelope), toolError: false };
+  }
+  if (envelope.is_error === true) {
+    const apiStatus = envelope.api_error_status ?? 'unknown';
+    return { answer: null, errors: [`claude_error_${apiStatus}`], toolError: true };
+  }
+  if (envelope.subtype === 'error_max_structured_output_retries') {
+    return { answer: null, errors: ['error_max_structured_output_retries'], toolError: false };
+  }
+  if ('structured_output' in envelope) {
+    return { ...validateStructuredAnswerObject(envelope.structured_output), toolError: false };
+  }
+  if (typeof envelope.result === 'string') {
+    return { ...parseAnswerForBaseline(envelope.result), toolError: false };
+  }
+  return { answer: null, errors: ['missing_structured_output'], toolError: false };
+}
+
+/** True for non-null, non-array values whose `typeof` is `'object'`. */
+function isRecord(value) {
+  if (value === null || Array.isArray(value)) return false;
+  return typeof value === 'object';
+}
+
+/** True when `value` is an array in which no element is a non-string. */
+function isStringArray(value) {
+  if (!Array.isArray(value)) return false;
+  return !value.some((entry) => typeof entry !== 'string');
+}
+
+/**
+ * Lists the own enumerable keys of `value` that are not in `allowedFields`,
+ * each reported as `additional_<prefix>_<field>` in key order.
+ */
+function findAdditionalFields(value, allowedFields, prefix) {
+  const extras = [];
+  for (const field of Object.keys(value)) {
+    if (allowedFields.includes(field)) continue;
+    extras.push(`additional_${prefix}_${field}`);
+  }
+  return extras;
+}
+
+/**
+ * Reports whether `value` is representable as JSON: null, string, boolean,
+ * finite number, or an array/record built recursively from those.
+ *
+ * Non-finite numbers (`NaN`, `Infinity`, `-Infinity`) are rejected because
+ * JSON cannot represent them and `JSON.stringify` would write them as `null`,
+ * so the persisted value would differ from the validated one.
+ *
+ * @param {*} value - Candidate value.
+ * @returns {boolean}
+ */
+function isJsonValue(value) {
+  if (value === null) return true;
+  const kind = typeof value;
+  if (kind === 'string' || kind === 'boolean') return true;
+  if (kind === 'number') return Number.isFinite(value);
+  if (Array.isArray(value)) return value.every(isJsonValue);
+  if (!isRecord(value)) return false;
+  return Object.values(value).every(isJsonValue);
+}
+
+/**
+ * Checks that `value` is a well-formed evidence reference: a record with no
+ * fields beyond `EVIDENCE_REFERENCE_FIELDS`, a `lineRange` record with no
+ * fields beyond `LINE_RANGE_FIELDS`, non-blank string `file` and `reason`,
+ * and integer `start`/`end` with `start > 0` and `end >= start`.
+ */
+function isValidEvidenceReference(value) {
+  if (!isRecord(value)) return false;
+  const extraFields = findAdditionalFields(value, EVIDENCE_REFERENCE_FIELDS, 'evidence_field');
+  if (extraFields.length > 0) return false;
+
+  const range = value.lineRange;
+  if (!isRecord(range)) return false;
+  const extraRangeFields = findAdditionalFields(range, LINE_RANGE_FIELDS, 'line_range_field');
+  if (extraRangeFields.length > 0) return false;
+
+  const isNonBlank = (text) => typeof text === 'string' && text.trim().length > 0;
+  if (!isNonBlank(value.file) || !isNonBlank(value.reason)) return false;
+
+  const { start, end } = range;
+  if (!Number.isInteger(start) || !Number.isInteger(end)) return false;
+  return start > 0 && end >= start;
+}
+
+/**
+ * Validates a parsed structured answer.
+ *
+ * Errors are accumulated in a fixed order: missing required fields,
+ * unexpected root fields, per-field type checks, unexpected fields inside
+ * evidence entries and their line ranges, and finally a single
+ * `malformed_evidence_reference` if any entry is not a valid reference.
+ *
+ * @param {*} value - Candidate answer object.
+ * @returns {{answer: object|null, errors: string[]}} The original object with
+ *   no errors, or `answer: null` with every error found.
+ */
+function validateStructuredAnswerObject(value) {
+  if (!isRecord(value)) return { answer: null, errors: ['answer_root_not_object'] };
+
+  const requiredFields = CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS;
+  const problems = [];
+  for (const field of requiredFields) {
+    if (field in value) continue;
+    problems.push(`missing_${field}`);
+  }
+  problems.push(...findAdditionalFields(value, requiredFields, 'root_field'));
+
+  if (!isJsonValue(value.answer)) problems.push('answer_not_json_value');
+
+  const confidenceLevels = ['low', 'medium', 'high'];
+  const confidenceOk =
+    typeof value.confidence === 'string' && confidenceLevels.includes(value.confidence);
+  if (!confidenceOk) problems.push('invalid_confidence');
+
+  const evidenceIsArray = Array.isArray(value.evidence);
+  if (!evidenceIsArray) problems.push('evidence_not_array');
+
+  const stringArrayChecks = [
+    ['filesReferenced', 'files_referenced_not_string_array'],
+    ['symbolsReferenced', 'symbols_referenced_not_string_array'],
+    ['unsupportedClaims', 'unsupported_claims_not_string_array']
+  ];
+  for (const [field, code] of stringArrayChecks) {
+    if (!isStringArray(value[field])) problems.push(code);
+  }
+  if (typeof value.readyToEdit !== 'boolean') problems.push('ready_to_edit_not_boolean');
+
+  const evidenceEntries = evidenceIsArray ? value.evidence : [];
+  for (const entry of evidenceEntries) {
+    // Non-record entries are only covered by the malformed-reference check below.
+    if (!isRecord(entry)) continue;
+    problems.push(...findAdditionalFields(entry, EVIDENCE_REFERENCE_FIELDS, 'evidence_field'));
+    if (!isRecord(entry.lineRange)) continue;
+    problems.push(
+      ...findAdditionalFields(entry.lineRange, LINE_RANGE_FIELDS, 'line_range_field')
+    );
+  }
+  const hasMalformedEntry = evidenceEntries.some((entry) => !isValidEvidenceReference(entry));
+  if (hasMalformedEntry) problems.push('malformed_evidence_reference');
+
+  return problems.length > 0
+    ? { answer: null, errors: problems }
+    : { answer: value, errors: [] };
+}
+
+/**
+ * Builds the canned structured answer emitted by the fake executor. It cites
+ * only the placeholder file `SMOKE_ONLY.md` and echoes the task's
+ * `instance_id`.
+ */
+function defaultFakeAnswer(task) {
+  const smokeFile = 'SMOKE_ONLY.md';
+  const smokeEvidence = {
+    file: smokeFile,
+    lineRange: { start: 1, end: 1 },
+    reason: 'fake executor non-claim-bearing smoke evidence'
+  };
+  return {
+    answer: { smoke: true, taskId: task.instance_id },
+    confidence: 'medium',
+    evidence: [smokeEvidence],
+    filesReferenced: [smokeFile],
+    symbolsReferenced: [],
+    unsupportedClaims: [],
+    readyToEdit: false
+  };
+}
+
+/**
+ * Builds the Claude CLI argument list: print mode with JSON output, an
+ * optional `--model`, and the structured-answer JSON schema passed inline.
+ */
+function claudeArgsForModel(model) {
+  const modelArgs = model ? ['--model', model] : [];
+  const schemaJson = JSON.stringify(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA);
+  return ['--print', '--output-format', 'json', ...modelArgs, '--json-schema', schemaJson];
+}
+
+/** Shorthand for the Claude entry of `commandPartsForExecutor`. */
+function claudeCommandParts() {
+  const claudeParts = commandPartsForExecutor('claude');
+  return claudeParts;
+}
+
+/**
+ * Resolves the command used to launch an external executor.
+ *
+ * Each supported executor has a default binary name and an environment
+ * variable that may override it with a JSON array of strings: the first
+ * element is the command, the rest are arguments placed before the runner's
+ * own arguments.
+ *
+ * @param {string} executor - One of `claude`, `codex`, `gemini`, `opencode`.
+ * @returns {{command: string, prefixArgs: string[]}}
+ * @throws {Error} For an unsupported executor, or when the override is not a
+ *   non-empty JSON array of strings.
+ */
+function commandPartsForExecutor(executor) {
+  const executors = {
+    claude: { envVar: 'CONTEXTBENCH_CLAUDE_COMMAND', defaultCommand: 'claude' },
+    codex: { envVar: 'CONTEXTBENCH_CODEX_COMMAND', defaultCommand: 'codex' },
+    gemini: { envVar: 'CONTEXTBENCH_GEMINI_COMMAND', defaultCommand: 'gemini' },
+    opencode: { envVar: 'CONTEXTBENCH_OPENCODE_COMMAND', defaultCommand: 'opencode' }
+  };
+  // Own-property check: names inherited from Object.prototype (e.g.
+  // 'constructor', 'toString') must not be accepted as executors.
+  if (!Object.hasOwn(executors, executor)) {
+    throw new Error(`unsupported executor: ${executor}`);
+  }
+  const { envVar, defaultCommand } = executors[executor];
+  const override = process.env[envVar];
+  if (!override) return { command: defaultCommand, prefixArgs: [] };
+  let parts;
+  try {
+    parts = JSON.parse(override);
+  } catch {
+    throw new Error(`${envVar} must be a JSON array`);
+  }
+  if (
+    !Array.isArray(parts) ||
+    parts.length === 0 ||
+    parts.some((part) => typeof part !== 'string')
+  ) {
+    throw new Error(`${envVar} must be a non-empty JSON string array`);
+  }
+  const [command, ...prefixArgs] = parts;
+  return { command, prefixArgs };
+}
+
+/**
+ * Describes how to invoke an external executor for one run.
+ *
+ * Always writes the executor's answer schema to `answer-schema.json` in the
+ * run directory first. The returned object carries the command parts, the
+ * CLI arguments, the stdin payload, the schema path, an answer file path
+ * (codex only, otherwise `null`), and the schema/output modes.
+ *
+ * Claude and codex receive the prompt on stdin; gemini and opencode receive
+ * it as a command-line argument with empty stdin.
+ *
+ * @throws {Error} For an unsupported executor.
+ */
+function externalExecutorInvocation(executor, model, prompt, paths) {
+  const schemaPath = join(paths.runDir, 'answer-schema.json');
+  const answerPath = join(paths.runDir, 'executor-answer.json');
+  writeJson(schemaPath, structuredAnswerSchemaForExecutor(executor));
+
+  const modelArgs = model ? ['--model', model] : [];
+  const describe = (args, input, answerFile, schemaMode, outputMode) => ({
+    ...commandPartsForExecutor(executor),
+    args,
+    input,
+    schemaPath,
+    answerPath: answerFile,
+    schemaMode,
+    outputMode
+  });
+
+  switch (executor) {
+    case 'claude':
+      return describe(claudeArgsForModel(model), prompt, null, 'native_schema', 'json');
+    case 'codex': {
+      const codexArgs = [
+        'exec',
+        '--ephemeral',
+        '--sandbox',
+        'read-only',
+        '--json',
+        '--output-schema',
+        schemaPath,
+        '--output-last-message',
+        answerPath,
+        ...modelArgs,
+        '-'
+      ];
+      return describe(codexArgs, prompt, answerPath, 'native_schema', 'jsonl');
+    }
+    case 'gemini': {
+      const geminiArgs = ['--output-format', 'json', ...modelArgs, '--prompt', prompt];
+      return describe(geminiArgs, '', null, 'prompt_only', 'json');
+    }
+    case 'opencode': {
+      const opencodeArgs = ['run', '--format', 'json', ...modelArgs, prompt];
+      return describe(opencodeArgs, '', null, 'prompt_only', 'jsonl');
+    }
+    default:
+      throw new Error(`unsupported executor: ${executor}`);
+  }
+}
+
+/**
+ * Returns the structured-answer JSON schema for an executor. Codex gets a
+ * copy whose `answer` property is narrowed to a string; every other executor
+ * gets the shared schema object unchanged.
+ */
+function structuredAnswerSchemaForExecutor(executor) {
+  const baseSchema = CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA;
+  if (executor === 'codex') {
+    const properties = { ...baseSchema.properties, answer: { type: 'string' } };
+    return { ...baseSchema, properties };
+  }
+  return baseSchema;
+}
+
+/**
+ * Extracts a structured answer from an external executor's output.
+ *
+ * Claude output is delegated to `parseClaudeAnswerForBaseline`. For codex, an
+ * existing answer file is parsed first and, if invalid, the JSON event stream
+ * is checked for an error event. Otherwise stdout is parsed according to the
+ * executor's output format.
+ *
+ * The substring-based CLI diagnostic (auth / rate-limit phrases) is consulted
+ * only when no valid answer was parsed. Checking it first would discard valid
+ * answers whose text merely mentions words such as "login" or "quota".
+ *
+ * @param {string} executor - Executor name.
+ * @param {string} stdout - Captured standard output.
+ * @param {string} stderr - Captured standard error.
+ * @param {string|null} answerPath - Answer file written by the executor, if any.
+ * @returns {{answer: object|null, errors: string[], toolError: boolean}}
+ */
+function parseExternalAnswerForBaseline(executor, stdout, stderr, answerPath) {
+  if (executor === 'claude') return parseClaudeAnswerForBaseline(stdout, stderr);
+  if (executor === 'codex' && answerPath && existsSync(answerPath)) {
+    const parsed = parseAnswerForBaseline(readFileSync(answerPath, 'utf8'));
+    if (parsed.answer) return { ...parsed, toolError: false };
+    const eventDiagnostic = classifyJsonEventDiagnostic(executor, stdout);
+    if (eventDiagnostic)
+      return { answer: null, errors: [...parsed.errors, eventDiagnostic], toolError: true };
+    return { ...parsed, toolError: false };
+  }
+
+  let parsed;
+  if (executor === 'gemini') {
+    parsed = parseGeminiAnswer(stdout);
+  } else if (executor === 'opencode' || executor === 'codex') {
+    parsed = parseJsonEventAnswer(stdout);
+  } else {
+    parsed = parseAnswerForBaseline(stdout);
+  }
+  if (parsed.answer) return { ...parsed, toolError: false };
+
+  // No usable answer: only now let auth / rate-limit phrases explain the failure.
+  const diagnostic = classifyExternalCliDiagnostic(executor, stdout, stderr);
+  if (diagnostic) return { answer: null, errors: [diagnostic], toolError: true };
+  return { ...parsed, toolError: parsed.toolError === true };
+}
+
+/**
+ * Scans newline-delimited JSON output for an error event.
+ *
+ * @returns {string|null} `<executor>_error` when any line parses to a record
+ *   with `type === 'error'` or a truthy `error` field, otherwise `null`.
+ */
+function classifyJsonEventDiagnostic(executor, stdout) {
+  const text = String(stdout ?? '').trim();
+  for (const line of text.split(/\r?\n/)) {
+    if (!line) continue;
+    try {
+      const event = JSON.parse(line);
+      const isErrorEvent = isRecord(event) && (event.type === 'error' || event.error);
+      if (isErrorEvent) return `${executor}_error`;
+    } catch {
+      // Non-JSON lines are handled by normal structured-answer parsing.
+    }
+  }
+  return null;
+}
+
+/**
+ * Scans combined stdout/stderr (case-insensitively) for auth or rate-limit
+ * phrases.
+ *
+ * @returns {string|null} `<executor>_auth_required`, `<executor>_rate_limit`,
+ *   or `null`. Auth phrases are checked first.
+ */
+function classifyExternalCliDiagnostic(executor, stdout, stderr) {
+  const haystack = `${stdout ?? ''}\n${stderr ?? ''}`.toLowerCase();
+  const mentions = (needle) => haystack.includes(needle);
+  const authPhrases = ['not authenticated', 'login', 'auth required'];
+  const limitPhrases = ['rate limit', 'quota', 'limit exceeded'];
+  if (authPhrases.some(mentions)) return `${executor}_auth_required`;
+  if (limitPhrases.some(mentions)) return `${executor}_rate_limit`;
+  return null;
+}
+
+/**
+ * Interprets Gemini CLI JSON output as a structured answer.
+ *
+ * A record with a truthy `error` is a tool error. A string `response` (or,
+ * failing that, a string `text`) is parsed as the answer; anything else is
+ * validated directly as the answer object.
+ *
+ * @returns {{answer: object|null, errors: string[], toolError: boolean}}
+ */
+function parseGeminiAnswer(stdout) {
+  const body = String(stdout ?? '').trim();
+  if (body.length === 0) {
+    return { answer: null, errors: ['missing_json'], toolError: false };
+  }
+  try {
+    const payload = JSON.parse(body);
+    if (isRecord(payload)) {
+      if (payload.error) {
+        return { answer: null, errors: ['gemini_error'], toolError: true };
+      }
+      for (const key of ['response', 'text']) {
+        if (typeof payload[key] !== 'string') continue;
+        return { ...parseAnswerForBaseline(payload[key]), toolError: false };
+      }
+    }
+    return { ...validateStructuredAnswerObject(payload), toolError: false };
+  } catch {
+    return { answer: null, errors: ['invalid_json'], toolError: false };
+  }
+}
+
+/**
+ * Extracts a structured answer from newline-delimited JSON events.
+ *
+ * Events are scanned from last to first. For each record, string values
+ * under `content`, `message`, `text`, `response`, then `part.text` are tried
+ * as answer JSON, followed by the event object itself. The first valid
+ * answer wins.
+ *
+ * A string field that is not a valid answer (for example a trailing status
+ * event with a plain `message`) no longer ends the scan: earlier events are
+ * still examined. If nothing validates, the failure of the most recent
+ * candidate is returned, which is what a single-candidate stream produced
+ * before. Every return value carries an explicit `toolError` flag.
+ *
+ * @param {*} stdout - Raw JSONL output; null/undefined are treated as empty.
+ * @returns {{answer: object|null, errors: string[], toolError: boolean}}
+ */
+function parseJsonEventAnswer(stdout) {
+  const trimmed = String(stdout ?? '').trim();
+  if (!trimmed) return { answer: null, errors: ['missing_json'], toolError: false };
+  const diagnostic = classifyJsonEventDiagnostic('json_event', trimmed);
+  if (diagnostic) return { answer: null, errors: [diagnostic], toolError: true };
+  const lines = trimmed.split(/\r?\n/).filter(Boolean);
+  let mostRecentFailure = null;
+  for (const line of [...lines].reverse()) {
+    let parsed;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      // Continue scanning earlier JSONL events before declaring the stream invalid.
+      continue;
+    }
+    if (!isRecord(parsed)) continue;
+    const candidates = ['content', 'message', 'text', 'response']
+      .map((key) => parsed[key])
+      .filter((candidate) => typeof candidate === 'string');
+    if (isRecord(parsed.part) && typeof parsed.part.text === 'string') {
+      candidates.push(parsed.part.text);
+    }
+    for (const candidate of candidates) {
+      const result = parseAnswerForBaseline(candidate);
+      if (result.answer) return { ...result, toolError: false };
+      mostRecentFailure ??= result;
+    }
+    const direct = validateStructuredAnswerObject(parsed);
+    if (direct.answer) return { ...direct, toolError: false };
+  }
+  return { ...(mostRecentFailure ?? parseAnswerForBaseline(trimmed)), toolError: false };
+}
+
+/**
+ * Produces the fake executor's stdout: the literal `not json` in
+ * `invalid_schema` mode, otherwise the serialized default fake answer.
+ */
+function fakeStdoutForMode(mode, task) {
+  return mode === 'invalid_schema' ? 'not json' : JSON.stringify(defaultFakeAnswer(task));
+}
+
+/**
+ * Builds the identity key of a run: `<laneId>:<taskId>:<repeatIndex>`,
+ * preceded by `prefix` with no separator.
+ */
+function runKey(laneId, taskId, repeatIndex, prefix = '') {
+  const identity = `${laneId}:${taskId}:${repeatIndex}`;
+  return `${prefix}${identity}`;
+}
+
+/**
+ * Collects the run keys already present in the session's run manifest. Each
+ * key is prefixed with the row's `scoring.baselineArmId` when it has one.
+ */
+function existingRunKeys(sessionRoot) {
+  const keys = new Set();
+  for (const row of readManifestRowsIfPresent(sessionRoot)) {
+    const armPrefix = row.scoring?.baselineArmId ?? '';
+    keys.add(runKey(row.lane_id, row.task_id, row.repeat_index, armPrefix));
+  }
+  return keys;
+}
+
+/**
+ * Executes one baseline attempt (one lane, task, repeat, executor) and
+ * writes all of its artifacts plus a run-manifest row.
+ *
+ * The `fake` executor produces canned output in-process; `claude`, `codex`,
+ * `gemini`, and `opencode` are spawned synchronously with the prompt. When no
+ * valid structured answer is obtained, a placeholder answer flagged with
+ * `missing_or_invalid_structured_answer` is recorded instead.
+ *
+ * The raw trace records the same model label as the manifest row:
+ * `'fake-executor'` only for the fake executor, otherwise the requested
+ * model for every external executor.
+ *
+ * @param {string} sessionRoot - Baseline session directory.
+ * @param {object} fixtures - Validated fixtures.
+ * @param {object} laneCard - Lane card for this attempt.
+ * @param {object} task - Task record.
+ * @param {number} repeatIndex - Repeat number of this attempt.
+ * @param {string} executor - `fake`, `claude`, `codex`, `gemini`, or `opencode`.
+ * @param {string} model - Model name passed to external executors.
+ * @param {number} timeoutMs - Timeout for the spawned executor.
+ * @param {string} fakeAnswerMode - Output mode for the fake executor.
+ * @param {object|null} [taskContext] - Materialized task context, if any.
+ * @param {object|null} [setupIndexOverride] - Measured setup/index record to
+ *   use instead of the default derived from the lane card.
+ * @throws {Error} When `executor` is not one of the supported names.
+ */
+function runOneBaselineAttempt(
+  sessionRoot,
+  fixtures,
+  laneCard,
+  task,
+  repeatIndex,
+  executor,
+  model,
+  timeoutMs,
+  fakeAnswerMode,
+  taskContext = null,
+  setupIndexOverride = null
+) {
+  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeatIndex}-${executor}`);
+  const paths = buildRunPaths(sessionRoot, runId);
+  const startedAt = new Date().toISOString();
+  const prompt = makePrompt(task, laneCard, taskContext);
+  // One model label shared by the raw trace and the manifest row.
+  const recordedModel = executor === 'fake' ? 'fake-executor' : model;
+  let stdout = '';
+  let stderr = '';
+  let answer;
+  let parseErrors = [];
+  let processMetadata = { exitStatus: null, signal: null, spawnError: null };
+  let externalInvocation = null;
+  let status = 'completed';
+  if (executor === 'fake') {
+    stdout = fakeStdoutForMode(fakeAnswerMode, task);
+    const parsed = parseAnswerForBaseline(stdout);
+    answer = parsed.answer;
+    parseErrors = parsed.errors;
+    if (!answer) status = stdout.trim() ? 'invalid_schema' : 'no_answer';
+  } else if (['claude', 'codex', 'gemini', 'opencode'].includes(executor)) {
+    externalInvocation = externalExecutorInvocation(executor, model, prompt, paths);
+    const result = spawnSync(
+      externalInvocation.command,
+      [...externalInvocation.prefixArgs, ...externalInvocation.args],
+      {
+        input: externalInvocation.input,
+        encoding: 'utf8',
+        timeout: timeoutMs,
+        cwd: taskContext?.repoCheckoutPath ?? undefined
+      }
+    );
+    stdout = result.stdout ?? '';
+    stderr = result.stderr ?? '';
+    processMetadata = {
+      exitStatus: typeof result.status === 'number' ? result.status : null,
+      signal: result.signal ?? null,
+      spawnError: result.error?.message ?? null
+    };
+    // Prefer the error code; the message check is kept as a fallback.
+    const timedOut =
+      Boolean(result.error) &&
+      (result.error.code === 'ETIMEDOUT' ||
+        String(result.error.message ?? '').includes('ETIMEDOUT'));
+    if (timedOut) {
+      status = 'timeout';
+      stderr = `${stderr}\n${executor} invocation timed out after ${timeoutMs}ms`.trim();
+    } else if (result.status !== 0 && !stdout.trim()) {
+      status = 'tool_error';
+    }
+    const parsed = parseExternalAnswerForBaseline(
+      executor,
+      stdout,
+      stderr,
+      externalInvocation.answerPath
+    );
+    answer = parsed.answer;
+    parseErrors = parsed.errors;
+    if (parsed.toolError) status = 'tool_error';
+    if (!answer && status === 'completed') status = stdout.trim() ? 'invalid_schema' : 'no_answer';
+  } else {
+    throw new Error('--baseline-run executor must be fake, claude, codex, gemini, or opencode');
+  }
+  if (!answer) {
+    // Placeholder so downstream artifacts always have a schema-shaped answer.
+    answer = {
+      answer: null,
+      confidence: 'low',
+      evidence: [],
+      filesReferenced: [],
+      symbolsReferenced: [],
+      unsupportedClaims: ['missing_or_invalid_structured_answer'],
+      readyToEdit: false
+    };
+  }
+  const trajectory = buildTrajectory(task, answer);
+  const setupIndex = setupIndexOverride ?? {
+    setupCommand: laneCard.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0,
+    indexDurationMs: 0,
+    setupLogPath: paths.setupIndex,
+    indexLogPath: paths.setupIndex,
+    setupStatus: laneCard.setupCommand === 'none' ? 'not_required' : 'completed',
+    indexStatus: laneCard.indexCommand === 'none' ? 'not_required' : 'completed'
+  };
+  const rawTrace = {
+    executor,
+    model: recordedModel,
+    runnerHash: runnerSourceHash(),
+    claimBearing: false,
+    stdout,
+    stderr,
+    timeoutMs,
+    workingDirectory: taskContext?.repoCheckoutPath ?? process.cwd(),
+    taskContext: taskContext
+      ? {
+          materialized: taskContext.materialized,
+          errors: taskContext.errors,
+          repoCheckoutPath: taskContext.repoCheckoutPath,
+          actualHead: taskContext.actualHead,
+          statusShort: taskContext.statusShort,
+          baseCommitVerified: taskContext.baseCommitVerified,
+          remoteUrl: taskContext.remoteUrl,
+          problemStatementHash: taskContext.problemStatementHash,
+          problemStatementHashVerified: taskContext.problemStatementHashVerified,
+          verificationStrict: taskContext.verificationStrict
+        }
+      : null,
+    exitStatus: processMetadata.exitStatus,
+    signal: processMetadata.signal,
+    spawnError: processMetadata.spawnError,
+    claudeDiagnostic: executor === 'claude' ? classifyClaudeCliDiagnostic(stdout, stderr) : null,
+    executorDiagnostic:
+      executor !== 'fake' ? classifyExternalCliDiagnostic(executor, stdout, stderr) : null,
+    executorArgs: externalInvocation?.args ?? [],
+    executorCommand: externalInvocation?.command ?? null,
+    executorSchemaMode: externalInvocation?.schemaMode ?? null,
+    executorOutputMode: externalInvocation?.outputMode ?? null,
+    executorSchemaPath: externalInvocation?.schemaPath ?? null,
+    executorAnswerPath: externalInvocation?.answerPath ?? null,
+    toolCalls: [],
+    laneIsolation: buildLaneIsolationEvidence(laneCard),
+    claudeArgs:
+      executor === 'claude' ? (externalInvocation?.args ?? claudeArgsForModel(model)) : [],
+    claudeCommand:
+      executor === 'claude' ? (externalInvocation?.command ?? claudeCommandParts().command) : null,
+    structuredAnswerParseErrors: parseErrors,
+    scriptedAgentDecisions: false,
+    antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript
+  };
+  // All run artifacts are written before the evaluator is invoked.
+  writeTextArtifact(paths.prompt, prompt);
+  writeJson(paths.laneCard, laneCard);
+  writeJson(paths.setupIndex, setupIndex);
+  writeJson(paths.rawTrace, rawTrace);
+  writeJson(paths.structuredAnswer, answer);
+  writeJson(paths.trajectory, trajectory);
+  const score = runOfficialEvaluatorForAttempt(fixtures, paths, task, executor, status);
+  writeJson(paths.score, score);
+  const completedAt = new Date().toISOString();
+  appendRunManifestRow(
+    sessionRoot,
+    buildManifestRowForArtifacts({
+      runId,
+      fixtures,
+      laneCard,
+      task,
+      repeatIndex,
+      // A completed run whose scoring failed is reported as judge_failed.
+      status: status === 'completed' && score.status === 'judge_failed' ? 'judge_failed' : status,
+      startedAt,
+      completedAt,
+      paths,
+      setupIndex,
+      executor,
+      model: recordedModel,
+      scoring: {
+        officialEvaluatorFirst: score.officialEvaluatorFirst,
+        officialEvaluatorAttempted: score.officialEvaluatorAttempted,
+        officialEvaluatorInvoked: score.officialEvaluatorInvoked,
+        command: score.command,
+        claimBearing: score.claimBearing,
+        ...(score.fallbackReason ? { fallbackReason: score.fallbackReason } : {}),
+        ...(score.stdoutPath ? { stdoutPath: score.stdoutPath } : {}),
+        ...(score.stderrPath ? { stderrPath: score.stderrPath } : {})
+      }
+    })
+  );
+}
+
+/**
+ * Records an attempt whose task context could not be materialized.
+ *
+ * No executor is run. The function writes the prompt, lane card, a
+ * `setup_failed` setup/index record, a raw trace, a placeholder answer, its
+ * trajectory, and a `task_setup_failed` score, then appends a manifest row
+ * with status `task_setup_failed`.
+ *
+ * @param {string} sessionRoot - Baseline session directory.
+ * @param {object} fixtures - Validated fixtures.
+ * @param {object} laneCard - Lane card for this attempt.
+ * @param {object} task - Task record.
+ * @param {number} repeatIndex - Repeat number of this attempt.
+ * @param {string} executor - Executor name the attempt was planned for.
+ * @param {string} model - Model name (recorded as `fake-executor` for `fake`).
+ * @param {number} timeoutMs - Timeout recorded in the raw trace.
+ * @param {object} taskContext - Failed task context; its `errors` are recorded.
+ */
+function writeTaskSetupFailedAttempt(
+  sessionRoot,
+  fixtures,
+  laneCard,
+  task,
+  repeatIndex,
+  executor,
+  model,
+  timeoutMs,
+  taskContext
+) {
+  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeatIndex}-${executor}`);
+  const paths = buildRunPaths(sessionRoot, runId);
+  // Nothing is executed, so start and completion share one timestamp.
+  const startedAt = new Date().toISOString();
+  const completedAt = startedAt;
+  const prompt = makePrompt(task, laneCard, taskContext);
+  const setupIndex = {
+    setupCommand: laneCard.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0,
+    indexDurationMs: 0,
+    setupLogPath: paths.setupIndex,
+    indexLogPath: paths.setupIndex,
+    setupStatus: 'setup_failed',
+    indexStatus: 'not_required',
+    taskMaterializationStatus: 'failed',
+    taskMaterializationErrors: taskContext.errors
+  };
+  // Placeholder answer so the structured-answer and trajectory artifacts still exist.
+  const fallbackAnswer = {
+    answer: null,
+    confidence: 'low',
+    evidence: [],
+    filesReferenced: [],
+    symbolsReferenced: [],
+    unsupportedClaims: ['missing_or_invalid_task_context'],
+    readyToEdit: false
+  };
+  writeTextArtifact(paths.prompt, prompt);
+  writeJson(paths.laneCard, laneCard);
+  writeJson(paths.setupIndex, setupIndex);
+  writeJson(paths.rawTrace, {
+    executor,
+    model: executor === 'fake' ? 'fake-executor' : model,
+    runnerHash: runnerSourceHash(),
+    claimBearing: false,
+    status: 'task_setup_failed',
+    timeoutMs,
+    workingDirectory: process.cwd(),
+    taskContext: {
+      // Always recorded as not materialized, regardless of the context's own flag.
+      materialized: false,
+      errors: taskContext.errors,
+      repoCheckoutPath: taskContext.repoCheckoutPath,
+      actualHead: taskContext.actualHead,
+      statusShort: taskContext.statusShort,
+      baseCommitVerified: taskContext.baseCommitVerified,
+      remoteUrl: taskContext.remoteUrl,
+      problemStatementHash: taskContext.problemStatementHash,
+      problemStatementHashVerified: taskContext.problemStatementHashVerified,
+      verificationStrict: taskContext.verificationStrict
+    },
+    stdout: '',
+    stderr: `task context materialization failed: ${taskContext.errors.join(', ')}`,
+    exitStatus: null,
+    signal: null,
+    spawnError: null,
+    structuredAnswerParseErrors: ['invalid_task_context'],
+    toolCalls: [],
+    laneIsolation: buildLaneIsolationEvidence(laneCard),
+    scriptedAgentDecisions: false,
+    antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript
+  });
+  writeJson(paths.structuredAnswer, fallbackAnswer);
+  writeJson(paths.trajectory, buildTrajectory(task, fallbackAnswer));
+  // The score artifact and the manifest row use the same fallback reason string.
+  writeJson(paths.score, {
+    status: 'task_setup_failed',
+    mode: 'materialization_gate',
+    ...diagnosticFallbackScoring(
+      fixtures,
+      `invalid_task_context:${taskContext.errors.join(',')}`
+    )
+  });
+  appendRunManifestRow(
+    sessionRoot,
+    buildManifestRowForArtifacts({
+      runId,
+      fixtures,
+      laneCard,
+      task,
+      repeatIndex,
+      status: 'task_setup_failed',
+      startedAt,
+      completedAt,
+      paths,
+      setupIndex,
+      executor,
+      model: executor === 'fake' ? 'fake-executor' : model,
+      scoring: diagnosticFallbackScoring(
+        fixtures,
+        `invalid_task_context:${taskContext.errors.join(',')}`
+      )
+    })
+  );
+}
+
+function setupIndexForBaselineAttempt(sessionRoot, laneCard) {
+  const measured = readMeasuredSetupIndex(sessionRoot, laneCard);
+  if (measured) {
+    if (
+      ['completed', 'not_required'].includes(measured.setupStatus) &&
+      ['completed', 'not_required'].includes(measured.indexStatus)
+    ) {
+      return measured;
+    }
+    return null;
+  }
+  if (laneCard.laneId === 'raw-native') {
+    const paths = buildSetupIndexMeasurementPaths(sessionRoot, laneCard.laneId);
+    const measurement = defaultRawNativeSetupIndex(sessionRoot, laneCard);
+    writeJson(paths.artifact, measurement);
+    return rowSetupIndexFromMeasurement(measurement);
+  }
+  return null;
+}
+
+function writeSetupIndexMissingAttempt( // Records a 'setup_failed' attempt (artifacts plus manifest row) without running any agent.
+  sessionRoot,
+  fixtures,
+  laneCard,
+  task,
+  repeatIndex,
+  executor,
+  model,
+  timeoutMs,
+  reason // explanation text; reused as stderr, in the setup-index artifact and in the fallback reason
+) {
+  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeatIndex}-${executor}`);
+  const paths = buildRunPaths(sessionRoot, runId);
+  const startedAt = new Date().toISOString();
+  const completedAt = startedAt; // nothing executes here, so start and completion share one timestamp
+  const prompt = `Setup/index measurement missing for ${task.instance_id} in ${laneCard.laneId}; no agent task prompt executed.`;
+  const setupIndex = { // placeholder record: zero durations, both log paths point at the setup-index artifact
+    setupCommand: laneCard.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0,
+    indexDurationMs: 0,
+    setupLogPath: paths.setupIndex,
+    indexLogPath: paths.setupIndex,
+    setupStatus: 'setup_failed',
+    indexStatus: 'not_required'
+  };
+  const fallbackAnswer = { // empty structured answer so the answer/trajectory artifacts still exist
+    answer: null,
+    confidence: 'low',
+    evidence: [],
+    filesReferenced: [],
+    symbolsReferenced: [],
+    unsupportedClaims: ['missing_setup_index_measurement'],
+    readyToEdit: false
+  };
+  writeTextArtifact(paths.prompt, prompt);
+  writeJson(paths.laneCard, laneCard);
+  writeJson(paths.setupIndex, { ...setupIndex, reason }); // on-disk copy additionally carries the reason
+  writeJson(paths.rawTrace, { // trace of a run that never spawned a process: empty stdout, null exit data
+    executor,
+    model: executor === 'fake' ? 'fake-executor' : model,
+    runnerHash: runnerSourceHash(),
+    claimBearing: false,
+    status: 'setup_failed',
+    timeoutMs,
+    workingDirectory: process.cwd(),
+    stdout: '',
+    stderr: reason,
+    exitStatus: null,
+    signal: null,
+    spawnError: null,
+    structuredAnswerParseErrors: ['missing_setup_index_measurement'],
+    toolCalls: [],
+    laneIsolation: buildLaneIsolationEvidence(laneCard),
+    scriptedAgentDecisions: false,
+    antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript
+  });
+  writeJson(paths.structuredAnswer, fallbackAnswer);
+  writeJson(paths.trajectory, buildTrajectory(task, fallbackAnswer));
+  writeJson(paths.score, { // score artifact: gate status/mode merged with the diagnostic fallback scoring fields
+    status: 'setup_failed',
+    mode: 'setup_index_measurement_gate',
+    ...diagnosticFallbackScoring(fixtures, `missing_setup_index_measurement:${reason}`)
+  });
+  appendRunManifestRow( // manifest row is appended last, after every artifact it references has been written
+    sessionRoot,
+    buildManifestRowForArtifacts({
+      runId,
+      fixtures,
+      laneCard,
+      task,
+      repeatIndex,
+      status: 'setup_failed',
+      startedAt,
+      completedAt,
+      paths,
+      setupIndex,
+      executor,
+      model: executor === 'fake' ? 'fake-executor' : model,
+      scoring: diagnosticFallbackScoring(fixtures, `missing_setup_index_measurement:${reason}`)
+    })
+  );
+}
+
+function runBaseline(args) { // Runs baseline attempts over the selected lanes x tasks x repeats and records each in the session.
+  if (!args.session) throw new Error('--baseline-run requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json')))
+    throw new Error('baseline session snapshot missing');
+  const fixtures = validateFixtures();
+  const cardsByLane = new Map(fixtures.laneToolCards.cards.map((card) => [card.laneId, card]));
+  const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task]));
+  const evidenceByLane = new Map(
+    fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record])
+  );
+  const repeats = args.repeats ?? args.repeat ?? 1; // either spelling is accepted; defaults to a single repeat
+  const taskPayloads = readTaskPayloads(args.taskPayloads);
+  const maxAttempts = // anything other than a positive integer means "no cap"
+    Number.isInteger(args.maxAttempts) && args.maxAttempts > 0 ? args.maxAttempts : Infinity;
+  const timeoutMs = // explicit positive integer wins; otherwise the protocol default (seconds) converted to ms
+    Number.isInteger(args.timeoutMs) && args.timeoutMs > 0
+      ? args.timeoutMs
+      : fixtures.protocol.budgets.defaults.timeoutSeconds * 1000;
+  const existing = existingRunKeys(sessionRoot); // keys of runs to skip; presumably read from the session manifest - confirm in existingRunKeys
+  let attempted = 0; // counts attempts recorded by this invocation, including failed-setup ones
+  const lanes = args.allReadyLanes
+    ? fixtures.lanes.broadClaimLaneSet.filter(
+        (laneId) => evidenceByLane.get(laneId)?.readinessStatus === 'ready_for_phase40'
+      )
+    : [args.lane]; // a missing --lane yields [undefined], which fails below as "unknown lane"
+  const tasks = args.taskId
+    ? [args.taskId]
+    : fixtures.manifest.tasks.map((task) => task.instance_id);
+  for (const laneId of lanes) {
+    const laneCard = cardsByLane.get(laneId);
+    if (!laneCard) throw new Error(`unknown lane: ${laneId}`);
+    if (BLOCKED_LANE_SETUP_STATUSES.has(evidenceByLane.get(laneId)?.readinessStatus)) continue; // blocked lanes are skipped without writing anything
+    for (const taskId of tasks) {
+      const task = tasksById.get(taskId);
+      if (!task) throw new Error(`unknown task-id: ${taskId}`);
+      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
+        if (existing.has(runKey(laneCard.laneId, task.instance_id, repeatIndex))) continue; // already recorded: does not count toward the cap
+        if (attempted >= maxAttempts) break;
+        const executor = args.executor ?? 'fake';
+        const measuredSetupIndex = setupIndexForBaselineAttempt(sessionRoot, laneCard);
+        if (!measuredSetupIndex) { // no usable setup/index measurement: record a setup_failed attempt instead of executing
+          writeSetupIndexMissingAttempt(
+            sessionRoot,
+            fixtures,
+            laneCard,
+            task,
+            repeatIndex,
+            executor,
+            args.model ?? 'unspecified',
+            timeoutMs,
+            `${laneCard.laneId} requires --setup-index-import before task execution`
+          );
+          attempted += 1;
+          continue;
+        }
+        const taskContext = resolveTaskContext(task, taskPayloads, executor);
+        if (executor !== 'fake' && !taskContext.materialized) { // only non-fake executors require a materialized task context
+          writeTaskSetupFailedAttempt(
+            sessionRoot,
+            fixtures,
+            laneCard,
+            task,
+            repeatIndex,
+            executor,
+            args.model ?? 'unspecified',
+            timeoutMs,
+            taskContext
+          );
+          attempted += 1;
+          continue;
+        }
+        runOneBaselineAttempt(
+          sessionRoot,
+          fixtures,
+          laneCard,
+          task,
+          repeatIndex,
+          executor,
+          args.model ?? 'unspecified',
+          timeoutMs,
+          args.fakeAnswerMode ?? 'valid',
+          taskContext,
+          measuredSetupIndex
+        );
+        attempted += 1;
+      }
+      if (attempted >= maxAttempts) break; // propagate the cap out of the task loop
+    }
+    if (attempted >= maxAttempts) break; // propagate the cap out of the lane loop
+  }
+  const session = readSession(sessionRoot);
+  session.artifactIndex = refreshArtifactIndex(sessionRoot); // re-index after the new artifacts were written
+  writeSession(sessionRoot, session);
+  console.log(
+    `baseline run updated ${join(sessionRoot, 'run-manifest.jsonl')} (${attempted} new attempts)`
+  );
+}
+
+function runOneCodebaseContextArmAttempt( // Executes one diagnostic-arm attempt (fake or claude) and writes its artifacts and manifest row.
+  sessionRoot,
+  fixtures,
+  laneCard,
+  task,
+  arm,
+  repeatIndex,
+  executor,
+  model,
+  timeoutMs,
+  fakeAnswerMode,
+  taskContext = null
+) {
+  const runId = sanitize(`${arm.baselineArmId}-${task.instance_id}-${repeatIndex}-${executor}`);
+  const paths = buildRunPaths(sessionRoot, runId);
+  const startedAt = new Date().toISOString();
+  const prompt = [ // base task prompt followed by three arm-specific lines, newline-joined
+    makePrompt(task, laneCard, taskContext),
+    `Diagnostic baseline arm: ${arm.baselineArmId}`,
+    `Allowed existing codebase-context surfaces for this arm: ${arm.allowedToolSurfaces.join(', ')}`,
+    'This diagnostic arm is not a required competitor lane denominator and is not claim-bearing.'
+  ].join('\n');
+  let stdout = '';
+  let stderr = '';
+  let answer; // stays undefined when no structured answer could be parsed
+  let parseErrors = [];
+  let processMetadata = { exitStatus: null, signal: null, spawnError: null };
+  let status = 'completed';
+  if (executor !== 'fake' && taskContext && !taskContext.materialized) { // unusable task context: nothing is spawned
+    status = 'task_setup_failed';
+    stderr = `task context materialization failed: ${taskContext.errors.join(', ')}`;
+    parseErrors = ['invalid_task_context'];
+  } else if (executor === 'fake') { // canned stdout goes through the same baseline answer parser
+    stdout = fakeStdoutForMode(fakeAnswerMode, task);
+    const parsed = parseAnswerForBaseline(stdout);
+    answer = parsed.answer;
+    parseErrors = parsed.errors;
+    if (!answer) status = stdout.trim() ? 'invalid_schema' : 'no_answer'; // non-empty but unparsable vs. empty output
+  } else if (executor === 'claude') {
+    const claudeArgs = claudeArgsForModel(model);
+    const claudeCommand = claudeCommandParts();
+    const result = spawnSync(claudeCommand.command, [...claudeCommand.prefixArgs, ...claudeArgs], {
+      input: prompt, // the prompt is delivered on stdin
+      encoding: 'utf8',
+      timeout: timeoutMs,
+      cwd: taskContext?.repoCheckoutPath ?? undefined // undefined leaves the child in the current working directory
+    });
+    stdout = result.stdout ?? '';
+    stderr = result.stderr ?? '';
+    processMetadata = {
+      exitStatus: typeof result.status === 'number' ? result.status : null,
+      signal: result.signal ?? null,
+      spawnError: result.error?.message ?? null
+    };
+    if (result.error && result.error.message.includes('ETIMEDOUT')) { // timeout is recognised from the spawn error message text
+      status = 'timeout';
+      stderr = `${stderr}\nClaude invocation timed out after ${timeoutMs}ms`.trim();
+    } else if (result.status !== 0 && !stdout.trim()) { // non-zero (or null) exit status with no output at all
+      status = 'tool_error';
+    }
+    const parsed = parseClaudeAnswerForBaseline(stdout, stderr);
+    answer = parsed.answer;
+    parseErrors = parsed.errors;
+    if (parsed.toolError) status = 'tool_error'; // NOTE(review): also replaces a 'timeout' status set above - confirm intended
+    if (!answer && status === 'completed') status = stdout.trim() ? 'invalid_schema' : 'no_answer';
+  } else {
+    throw new Error('--baseline-run-codebase-context-arms executor must be fake or claude');
+  }
+  if (!answer) { // substitute an empty low-confidence answer so downstream artifacts are always written
+    answer = {
+      answer: null,
+      confidence: 'low',
+      evidence: [],
+      filesReferenced: [],
+      symbolsReferenced: [],
+      unsupportedClaims: ['missing_or_invalid_structured_answer'],
+      readyToEdit: false
+    };
+  }
+  const setupIndex = { // hard-coded record: zero durations with 'completed' statuses, not a measurement
+    setupCommand: arm.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0,
+    indexDurationMs: 0,
+    setupLogPath: paths.setupIndex,
+    indexLogPath: paths.setupIndex,
+    setupStatus: 'completed',
+    indexStatus: 'completed'
+  };
+  const trajectory = buildTrajectory(task, answer);
+  const score = {
+    status: status === 'completed' ? 'judge_failed' : status, // a completed run is still scored as 'judge_failed' in this mode
+    mode: 'diagnostic_fallback',
+    ...diagnosticFallbackScoring(
+      fixtures,
+      executor === 'fake'
+        ? 'fake_executor_diagnostic_arm_smoke_only'
+        : 'official_evaluator_not_invoked_by_runner_smoke',
+      { baselineArmId: arm.baselineArmId }
+    )
+  };
+  writeTextArtifact(paths.prompt, prompt);
+  writeJson(paths.laneCard, { ...laneCard, diagnosticBaselineArm: arm });
+  writeJson(paths.setupIndex, { ...setupIndex, diagnosticBaselineArm: arm });
+  writeJson(paths.rawTrace, {
+    executor,
+    model: executor === 'claude' ? model : 'fake-executor',
+    runnerHash: runnerSourceHash(),
+    claimBearing: false,
+    baselineArmId: arm.baselineArmId,
+    stdout,
+    stderr,
+    timeoutMs,
+    workingDirectory: taskContext?.repoCheckoutPath ?? process.cwd(),
+    taskContext: taskContext // only the listed fields of the task context are copied into the trace
+      ? {
+          materialized: taskContext.materialized,
+          errors: taskContext.errors,
+          repoCheckoutPath: taskContext.repoCheckoutPath,
+          actualHead: taskContext.actualHead,
+          statusShort: taskContext.statusShort,
+          baseCommitVerified: taskContext.baseCommitVerified,
+          remoteUrl: taskContext.remoteUrl,
+          problemStatementHash: taskContext.problemStatementHash,
+          problemStatementHashVerified: taskContext.problemStatementHashVerified,
+          verificationStrict: taskContext.verificationStrict
+        }
+      : null,
+    exitStatus: processMetadata.exitStatus,
+    signal: processMetadata.signal,
+    spawnError: processMetadata.spawnError,
+    claudeDiagnostic: executor === 'claude' ? classifyClaudeCliDiagnostic(stdout, stderr) : null,
+    toolCalls: [],
+    laneIsolation: buildLaneIsolationEvidence(laneCard),
+    claudeArgs: executor === 'claude' ? claudeArgsForModel(model) : [],
+    claudeCommand: executor === 'claude' ? claudeCommandParts().command : null,
+    structuredAnswerParseErrors: parseErrors,
+    scriptedAgentDecisions: false,
+    antiScriptingBoundary: fixtures.protocol.minimalRunnerBehavior.mustNotScript
+  });
+  writeJson(paths.structuredAnswer, answer);
+  writeJson(paths.trajectory, trajectory);
+  writeJson(paths.score, score);
+  const completedAt = new Date().toISOString(); // taken after all artifacts are written
+  appendRunManifestRow(
+    sessionRoot,
+    buildManifestRowForArtifacts({
+      runId,
+      fixtures,
+      laneCard,
+      task,
+      repeatIndex,
+      status,
+      startedAt,
+      completedAt,
+      paths,
+      setupIndex,
+      executor,
+      model: executor === 'fake' ? 'fake-executor' : model,
+      scoring: diagnosticFallbackScoring(fixtures, score.fallbackReason, { // assumes diagnosticFallbackScoring returns a fallbackReason field - TODO confirm
+        baselineArmId: arm.baselineArmId
+      })
+    })
+  );
+}
+
+function runBaselineCodebaseContextArms(args) { // Runs every diagnostic arm x task x repeat for the codebase-context lane.
+  if (!args.session)
+    throw new Error('--baseline-run-codebase-context-arms requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  if (!existsSync(join(sessionRoot, 'BASELINE-SESSION.json')))
+    throw new Error('baseline session snapshot missing');
+  validateBaselineArms(FIXTURES.codebaseContextBaselineArms); // arms fixture is validated before it is read below
+  const fixtures = validateFixtures();
+  const arms = readJson(FIXTURES.codebaseContextBaselineArms).arms ?? [];
+  const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === 'codebase-context');
+  if (!laneCard) throw new Error('codebase-context lane card missing');
+  const tasks = args.taskId
+    ? fixtures.manifest.tasks.filter((task) => task.instance_id === args.taskId)
+    : fixtures.manifest.tasks;
+  if (args.taskId && tasks.length === 0) throw new Error(`unknown task-id: ${args.taskId}`);
+  const repeats = args.repeats ?? args.repeat ?? 1; // either spelling is accepted; defaults to a single repeat
+  const maxAttempts = // anything other than a positive integer means "no cap"
+    Number.isInteger(args.maxAttempts) && args.maxAttempts > 0 ? args.maxAttempts : Infinity;
+  const timeoutMs = // explicit positive integer wins; otherwise the protocol default (seconds) converted to ms
+    Number.isInteger(args.timeoutMs) && args.timeoutMs > 0
+      ? args.timeoutMs
+      : fixtures.protocol.budgets.defaults.timeoutSeconds * 1000;
+  const existing = existingRunKeys(sessionRoot);
+  const taskPayloads = readTaskPayloads(args.taskPayloads);
+  let attempted = 0; // attempts started by this invocation
+  for (const arm of arms) {
+    for (const task of tasks) {
+      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
+        if ( // the run key carries the arm id as a fourth component, keeping arm runs distinct from lane runs
+          existing.has(runKey('codebase-context', task.instance_id, repeatIndex, arm.baselineArmId))
+        )
+          continue;
+        if (attempted >= maxAttempts) break;
+        const executor = args.executor ?? 'fake';
+        const taskContext = resolveTaskContext(task, taskPayloads, executor);
+        runOneCodebaseContextArmAttempt( // a non-materialized context is handled inside the attempt, not here
+          sessionRoot,
+          fixtures,
+          laneCard,
+          task,
+          arm,
+          repeatIndex,
+          executor,
+          args.model ?? 'unspecified',
+          timeoutMs,
+          args.fakeAnswerMode ?? 'valid',
+          taskContext
+        );
+        attempted += 1;
+      }
+      if (attempted >= maxAttempts) break; // propagate the cap out of the task loop
+    }
+    if (attempted >= maxAttempts) break; // propagate the cap out of the arm loop
+  }
+  const session = readSession(sessionRoot);
+  session.artifactIndex = refreshArtifactIndex(sessionRoot); // re-index after the new artifacts were written
+  writeSession(sessionRoot, session);
+  console.log(
+    `baseline codebase-context diagnostic arms updated ${join(sessionRoot, 'run-manifest.jsonl')} (${attempted} new attempts)`
+  );
+}
+
+function readManifestRowsIfPresent(sessionRoot) {
+  const manifestPath = join(sessionRoot, 'run-manifest.jsonl');
+  if (!existsSync(manifestPath)) return [];
+  const content = readFileSync(manifestPath, 'utf8').trim();
+  if (!content) return [];
+  return content.split('\n').map((line) => JSON.parse(line));
+}
+
+function validateSessionPaths(sessionRoot, rows, errors) { // Appends a message to `errors` for every path/scoring rule a manifest row breaks.
+  for (const row of rows) {
+    for (const key of [ // every artifact path field a row is expected to carry
+      'raw_trace_path',
+      'structured_answer_path',
+      'trajectory_path',
+      'score_path',
+      'setup_index_path',
+      'prompt_path',
+      'lane_tool_card_path'
+    ]) {
+      const value = row[key];
+      if (!value || !isAbsolute(value)) errors.push(`row ${row.run_id} ${key} must be absolute`); // at most one message per path: first failing rule wins
+      else if (!isPathInside(sessionRoot, value))
+        errors.push(`row ${row.run_id} ${key} is outside session root`);
+      else if (!existsSync(value)) errors.push(`row ${row.run_id} ${key} missing artifact`);
+    }
+    if (row.setupIndex && 'taskWallTimeMs' in row.setupIndex) // task timing must not appear inside the setup/index record
+      errors.push(`row ${row.run_id} mixes task time into setupIndex`);
+    if (row.scoring?.claimBearing !== false) // a missing scoring object is flagged as well, since undefined !== false
+      errors.push(
+        `row ${row.run_id} scoring must be non-claim-bearing while protocol claimAllowed=false`
+      );
+  }
+}
+
+function phase42RowKey(row) {
+  return `${row.lane_id}\u0000${row.task_id}\u0000${row.repeat_index}`;
+}
+
+function phase42ExpectedKeys(fixtures) {
+  const keys = new Set();
+  const repeats = fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3;
+  for (const laneId of fixtures.lanes.broadClaimLaneSet) {
+    for (const task of fixtures.manifest.tasks) {
+      for (let repeatIndex = 1; repeatIndex <= repeats; repeatIndex += 1) {
+        keys.add(`${laneId}\u0000${task.instance_id}\u0000${repeatIndex}`);
+      }
+    }
+  }
+  return keys;
+}
+
+function phase42LanePolicies(fixtures) {
+  return Object.fromEntries(
+    fixtures.laneToolCards.cards.map((card) => [
+      card.laneId,
+      {
+        laneId: card.laneId,
+        expectedContextTool: card.contextTools[0] ?? card.laneId,
+        allowedTools: card.allowedTools,
+        disallowedTools: card.disallowedTools,
+        ...(card.laneId === 'raw-native' ? { allowMultipleObservedTools: true } : {})
+      }
+    ])
+  );
+}
+
+function phase42ReadJsonArtifact(filePath, readErrors, runId, label) {
+  if (!filePath || !existsSync(filePath)) {
+    readErrors.push({ runId, path: filePath ?? '', reason: `${label}_missing` });
+    return null;
+  }
+  try {
+    return readJson(filePath);
+  } catch (error) {
+    readErrors.push({
+      runId,
+      path: filePath,
+      reason: `${label}_invalid_json:${error instanceof Error ? error.message : String(error)}`
+    });
+    return null;
+  }
+}
+
+function phase42HashArtifact(filePath, artifactHashesByPath, readErrors, runId, label) {
+  if (!filePath || !existsSync(filePath)) {
+    readErrors.push({ runId, path: filePath ?? '', reason: `${label}_missing` });
+    return null;
+  }
+  const hash = hashFile(filePath);
+  artifactHashesByPath[filePath] = hash;
+  return hash;
+}
+
+function phase42CollectArtifactHashes(row, score, artifactHashesByPath, readErrors, integrityErrors) {
+  for (const [label, filePath] of [
+    ['raw_trace', row.raw_trace_path, 'rawTrace'],
+    ['structured_answer', row.structured_answer_path, 'structuredAnswer'],
+    ['trajectory', row.trajectory_path, 'trajectory'],
+    ['score', row.score_path, 'score'],
+    ['setup_index', row.setup_index_path, 'setupIndex'],
+    ['prompt', row.prompt_path, 'prompt'],
+    ['lane_tool_card', row.lane_tool_card_path, 'laneToolCard']
+  ]) {
+    const actualHash = phase42HashArtifact(filePath, artifactHashesByPath, readErrors, row.run_id, label);
+    const expectedHash = row.hashes?.[label === 'lane_tool_card' ? 'laneToolCard' : label === 'setup_index' ? 'setupIndex' : label === 'raw_trace' ? 'rawTrace' : label === 'structured_answer' ? 'structuredAnswer' : label];
+    if (actualHash && !expectedHash) {
+      integrityErrors.push({
+        runId: row.run_id,
+        path: filePath,
+        reason: `${label}_manifest_hash_missing`,
+        expectedHash: null,
+        actualHash
+      });
+    } else if (actualHash && expectedHash && actualHash !== expectedHash) {
+      integrityErrors.push({
+        runId: row.run_id,
+        path: filePath,
+        reason: `${label}_hash_mismatch`,
+        expectedHash,
+        actualHash
+      });
+    }
+  }
+  for (const [label, filePath] of [
+    ['official_output', score?.outputPath],
+    ['official_stdout', score?.stdoutPath],
+    ['official_stderr', score?.stderrPath]
+  ]) {
+    if (filePath) phase42HashArtifact(filePath, artifactHashesByPath, readErrors, row.run_id, label);
+  }
+}
+
+function phase42ArtifactsForRow(row, readErrors) { // Reads a row's raw-trace, score and setup-index artifacts and returns narrowed copies.
+  const rawTrace = phase42ReadJsonArtifact(row.raw_trace_path, readErrors, row.run_id, 'raw_trace');
+  const score = phase42ReadJsonArtifact(row.score_path, readErrors, row.run_id, 'score');
+  const setupIndex = phase42ReadJsonArtifact(
+    row.setup_index_path,
+    readErrors,
+    row.run_id,
+    'setup_index'
+  );
+  return { // each section is undefined when its artifact was missing or unreadable
+    rawTrace: rawTrace
+      ? {
+          executor: rawTrace.executor,
+          model: rawTrace.model,
+          runnerHash: rawTrace.runnerHash
+        }
+      : undefined,
+    score: score
+      ? {
+          status: score.status,
+          mode: score.mode,
+          claimBearing: score.claimBearing,
+          officialEvaluatorInvoked: score.officialEvaluatorInvoked,
+          command: score.command,
+          exitCode: score.exitCode,
+          outputPath: score.outputPath,
+          outputHash: score.outputHash,
+          stdoutPath: score.stdoutPath,
+          stderrPath: score.stderrPath,
+          stdoutHash: score.stdoutHash,
+          stderrHash: score.stderrHash
+        }
+      : undefined,
+    setupIndex: setupIndex
+      ? {
+          setupStatus: setupIndex.setupStatus,
+          indexStatus: setupIndex.indexStatus,
+          setupDurationMs: setupIndex.setupDurationMs,
+          indexDurationMs: setupIndex.indexDurationMs,
+          setupLogPath: setupIndex.setupLogPath,
+          indexLogPath: setupIndex.indexLogPath
+        }
+      : undefined,
+    laneIsolation: rawTrace?.laneIsolation // lane isolation evidence is taken from inside the raw trace
+      ? {
+          laneId: rawTrace.laneIsolation.laneId,
+          proven: rawTrace.laneIsolation.proven,
+          sourceKind: rawTrace.laneIsolation.sourceKind,
+          expectedContextTool: rawTrace.laneIsolation.expectedContextTool,
+          allowedTools: rawTrace.laneIsolation.allowedTools ?? [], // list fields default to [] when absent
+          observedTools: rawTrace.laneIsolation.observedTools ?? [],
+          violations: rawTrace.laneIsolation.violations ?? []
+        }
+      : undefined,
+    rawScore: score // full parsed score (or null); the loader in this file deletes it after hashing
+  };
+}
+
+function loadPhase42SessionEvidence(sessionRoot, fixtures) { // Loads session files, classifies manifest rows and assembles the evidence-gate input.
+  const sessionPath = join(sessionRoot, 'BASELINE-SESSION.json');
+  const reservationPath = join(sessionRoot, 'slot-reservations.json');
+  const session = existsSync(sessionPath) ? readJson(sessionPath) : null; // a missing session snapshot is tolerated here
+  const reservations = existsSync(reservationPath)
+    ? (readJson(reservationPath).reservations ?? [])
+    : [];
+  const rows = readManifestRowsIfPresent(sessionRoot);
+  const expectedKeys = phase42ExpectedKeys(fixtures);
+  const requiredRows = [];
+  const supplementalRows = [];
+  const unexpectedRows = [];
+  for (const row of rows) {
+    if (row.scoring?.baselineArmId) { // diagnostic-arm rows are set aside regardless of their lane/task/repeat key
+      supplementalRows.push(row);
+      continue;
+    }
+    if (expectedKeys.has(phase42RowKey(row))) requiredRows.push(row);
+    else unexpectedRows.push(row);
+  }
+
+  const readErrors = [];
+  const integrityErrors = [];
+  const artifactHashesByPath = {};
+  const artifactsByRunId = {};
+  for (const row of requiredRows) { // only required rows have their artifacts read and hashed
+    const artifacts = phase42ArtifactsForRow(row, readErrors);
+    phase42CollectArtifactHashes(
+      row,
+      artifacts.rawScore,
+      artifactHashesByPath,
+      readErrors,
+      integrityErrors
+    );
+    delete artifacts.rawScore; // the full score was only needed for hashing; it is not kept in the stored artifacts
+    artifactsByRunId[row.run_id] = artifacts;
+  }
+
+  const runnerHashes = requiredRows
+    .map((row) => row.hashes?.runnerSourceHash)
+    .filter((hash) => typeof hash === 'string' && hash.length > 0);
+  const uniqueRunnerHashes = [...new Set(runnerHashes)];
+  const expectedRunnerHash = uniqueRunnerHashes.length === 1 ? uniqueRunnerHashes[0] : undefined; // defined only when the rows agree on a single hash
+  const repeats = fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3;
+  return {
+    session,
+    reservations,
+    requiredRows,
+    supplementalRows,
+    unexpectedRows, // returned alongside gateInput, not inside it: gateInput.rows holds required rows only
+    readErrors,
+    integrityErrors,
+    gateInput: {
+      evidenceMode: 'artifact_verified',
+      protocol: {
+        claimAllowed: fixtures.protocol.claimAllowed,
+        benchmarkTarget: {
+          officialEvaluatorFirst: fixtures.protocol.benchmarkTarget.officialEvaluatorFirst
+        }
+      },
+      requiredLaneIds: fixtures.lanes.broadClaimLaneSet,
+      requiredTaskIds: fixtures.manifest.tasks.map((task) => task.instance_id),
+      requiredRepeats: repeats,
+      expectedTotalRows: fixtures.lanes.broadClaimLaneSet.length * fixtures.manifest.tasks.length * repeats,
+      expectedProtocolHash: hashObject(fixtures.protocol),
+      expectedTaskManifestHash: fixtures.manifest.manifest_hash,
+      lanePoliciesById: phase42LanePolicies(fixtures),
+      rows: requiredRows,
+      artifactsByRunId,
+      artifactHashesByPath,
+      expectedRunnerHash,
+      currentRunnerHash: runnerSourceHash()
+    }
+  };
+}
+
+function phase42HasMeasuredSetupIndex(row, evidence) {
+  if (!evidence) return false;
+  const setupDuration = evidence.setupDurationMs;
+  const indexDuration = evidence.indexDurationMs;
+  if (typeof setupDuration !== 'number' || typeof indexDuration !== 'number') return false;
+  if (!Number.isFinite(setupDuration) || !Number.isFinite(indexDuration)) return false;
+  if (!evidence.setupStatus || !evidence.indexStatus) return false;
+  if (!evidence.setupLogPath || !evidence.indexLogPath) return false;
+  if (!['completed', 'not_required'].includes(evidence.setupStatus)) return false;
+  if (!['completed', 'not_required'].includes(evidence.indexStatus)) return false;
+  if (evidence.setupStatus === 'completed' && setupDuration <= 0) return false;
+  if (evidence.indexStatus === 'completed' && indexDuration <= 0) return false;
+  return (
+    row.setupIndex.setupStatus === evidence.setupStatus &&
+    row.setupIndex.indexStatus === evidence.indexStatus &&
+    row.setupIndex.setupDurationMs === evidence.setupDurationMs &&
+    row.setupIndex.indexDurationMs === evidence.indexDurationMs &&
+    row.setupIndex.setupLogPath === evidence.setupLogPath &&
+    row.setupIndex.indexLogPath === evidence.indexLogPath
+  );
+}
+
+function phase42HasSha256Hash(value) {
+  return /^sha256:[a-f0-9]{64}$/.test(value ?? '');
+}
+
+function phase42HasOfficialEvaluatorProof(row, score, artifactHashesByPath) {
+  return (
+    row.scoring.officialEvaluatorFirst === true &&
+    row.scoring.officialEvaluatorAttempted === true &&
+    row.scoring.officialEvaluatorInvoked === true &&
+    row.scoring.claimBearing === true &&
+    score?.officialEvaluatorInvoked === true &&
+    score.claimBearing === true &&
+    score.mode === 'official_evaluator' &&
+    score.status === 'completed' &&
+    score.exitCode === 0 &&
+    typeof score.command === 'string' &&
+    score.command.includes('contextbench.evaluate') &&
+    typeof score.outputPath === 'string' &&
+    score.outputPath.length > 0 &&
+    phase42HasSha256Hash(score.outputHash) &&
+    artifactHashesByPath[score.outputPath] === score.outputHash &&
+    phase42HasSha256Hash(artifactHashesByPath[row.score_path]) &&
+    typeof score.stdoutPath === 'string' &&
+    score.stdoutPath.length > 0 &&
+    phase42HasSha256Hash(score.stdoutHash) &&
+    artifactHashesByPath[score.stdoutPath] === score.stdoutHash &&
+    phase42HasSha256Hash(artifactHashesByPath[score.stdoutPath]) &&
+    typeof score.stderrPath === 'string' &&
+    score.stderrPath.length > 0 &&
+    phase42HasSha256Hash(score.stderrHash) &&
+    artifactHashesByPath[score.stderrPath] === score.stderrHash &&
+    phase42HasSha256Hash(artifactHashesByPath[score.stderrPath])
+  );
+}
+
+function phase42HasDiagnosticFallback(row, score) {
+  return row.scoring.claimBearing === false || Boolean(row.scoring.fallbackReason) || score?.mode === 'diagnostic_fallback';
+}
+
+function phase42HasLaneIsolationProof(row, isolation, policy) {
+  if (!isolation?.proven || !policy) return false;
+  if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind)) return false;
+  if (policy.laneId !== row.lane_id || isolation.laneId !== row.lane_id) return false;
+  if (isolation.expectedContextTool !== policy.expectedContextTool) return false;
+  if (isolation.allowedTools.length === 0 || isolation.observedTools.length === 0) return false;
+  if (isolation.violations && isolation.violations.length > 0) return false;
+  if (policy.disallowedTools.some((tool) => isolation.observedTools.includes(tool))) return false;
+  if (isolation.allowedTools.some((tool) => !policy.allowedTools.includes(tool))) return false;
+  if (policy.allowMultipleObservedTools) {
+    return isolation.observedTools.every((tool) => policy.allowedTools.includes(tool));
+  }
+  if (!isolation.allowedTools.includes(policy.expectedContextTool)) return false;
+  return isolation.observedTools.length === 1 && isolation.observedTools[0] === policy.expectedContextTool;
+}
+
+function phase42HasRunnerProvenance(row, rawTrace, expectedRunnerHash) {
+  if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash)
+    return false;
+  return (
+    rawTrace.executor === row.taskExecution.executor &&
+    rawTrace.model === row.taskExecution.model &&
+    rawTrace.runnerHash === expectedRunnerHash &&
+    row.hashes.runnerSourceHash === expectedRunnerHash
+  );
+}
+
+function phase42Failure(row, code, message) {
+  return {
+    code,
+    runId: row.run_id,
+    laneId: row.lane_id,
+    taskId: row.task_id,
+    repeatIndex: row.repeat_index,
+    message
+  };
+}
+
+function evaluatePhase42EvidenceGate(input) { // Checks `input` against the Phase 42 evidence gate; returns pass flags plus every failure found.
+  const failures = [];
+  const expectedKeys = new Set(); // one key per required lane/task/repeat combination (filled below)
+  if (input.evidenceMode !== 'artifact_verified') {
+    failures.push({
+      code: 'artifact_verification_missing',
+      message: 'Synthetic shape evidence cannot produce claim-bearing benchmark pass.'
+    });
+  }
+  if (!input.protocol.claimAllowed) {
+    failures.push({
+      code: 'protocol_claims_disabled',
+      message: 'The protocol does not currently allow claim-bearing benchmark results.'
+    });
+  }
+  if (input.expectedTotalRows <= 0 || input.requiredLaneIds.length === 0 || input.requiredTaskIds.length === 0) {
+    failures.push({
+      code: 'denominator_contract_missing',
+      message: 'Claim validation requires a frozen denominator contract.'
+    });
+  }
+  if (input.rows.length !== input.expectedTotalRows) {
+    failures.push({
+      code: 'denominator_count_mismatch',
+      message: 'Run row count does not match the frozen expected denominator count.'
+    });
+  }
+  for (const laneId of input.requiredLaneIds) {
+    for (const taskId of input.requiredTaskIds) {
+      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) { // repeat indices are 1-based
+        expectedKeys.add(`${laneId}\u0000${taskId}\u0000${repeatIndex}`);
+      }
+    }
+  }
+  const rowCounts = new Map(); // row key -> number of rows carrying that key
+  for (const row of input.rows) {
+    const key = phase42RowKey(row); // NOTE(review): assumes phase42RowKey uses the same NUL-separated format as expectedKeys — confirm
+    rowCounts.set(key, (rowCounts.get(key) ?? 0) + 1);
+    if (!expectedKeys.has(key)) {
+      failures.push(
+        phase42Failure(row, 'unexpected_run_row', 'Rows outside the required denominator must not be hidden from claim validation.')
+      );
+    }
+    if (row.protocol_hash !== input.expectedProtocolHash) {
+      failures.push(phase42Failure(row, 'protocol_hash_mismatch', 'Row protocol hash does not match the frozen protocol hash.'));
+    }
+    if (row.task_manifest_hash !== input.expectedTaskManifestHash) {
+      failures.push(phase42Failure(row, 'task_manifest_hash_mismatch', 'Row task manifest hash does not match the frozen task manifest hash.'));
+    }
+  }
+  for (const row of input.rows) { // second pass: every row whose key occurs more than once gets its own failure
+    if ((rowCounts.get(phase42RowKey(row)) ?? 0) > 1) {
+      failures.push(phase42Failure(row, 'duplicate_required_run', 'Duplicate lane/task/repeat rows make the evidence denominator ambiguous.'));
+    }
+  }
+  if (!input.expectedRunnerHash || !input.currentRunnerHash) { // session-level runner hash check (not tied to a row)
+    failures.push({
+      code: 'runner_provenance_missing',
+      message: 'Expected and current runner hashes are required for claim-bearing validation.'
+    });
+  } else if (input.expectedRunnerHash !== input.currentRunnerHash) {
+    failures.push({
+      code: 'runner_provenance_mismatch',
+      message: 'Current runner hash does not match the expected generation runner hash.'
+    });
+  }
+  for (const laneId of input.requiredLaneIds) { // per-slot checks over the full required denominator
+    for (const taskId of input.requiredTaskIds) {
+      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) {
+        const row = input.rows.find( // first matching row is used; duplicates were reported above
+          (candidate) =>
+            candidate.lane_id === laneId &&
+            candidate.task_id === taskId &&
+            candidate.repeat_index === repeatIndex
+        );
+        if (!row) {
+          failures.push({
+            code: 'missing_required_run',
+            laneId,
+            taskId,
+            repeatIndex,
+            message: 'A required lane/task/repeat row is missing from the evidence denominator.'
+          });
+          continue;
+        }
+        const artifacts = input.artifactsByRunId[row.run_id]; // may be undefined, hence the ?. accesses below
+        if (row.status !== 'completed') {
+          failures.push(phase42Failure(row, 'non_completed_status', 'Claim-bearing runs must complete.'));
+        }
+        if ( // official evaluator proof is only demanded when the protocol sets officialEvaluatorFirst
+          input.protocol.benchmarkTarget.officialEvaluatorFirst &&
+          !phase42HasOfficialEvaluatorProof(row, artifacts?.score, input.artifactHashesByPath)
+        ) {
+          failures.push(phase42Failure(row, 'official_evaluator_missing', 'Official evaluator proof is required before this row can support claims.'));
+        }
+        if (phase42HasDiagnosticFallback(row, artifacts?.score)) {
+          failures.push(phase42Failure(row, 'diagnostic_fallback_only', 'Diagnostic fallback scoring cannot satisfy the claim-bearing evidence gate.'));
+        }
+        if (!phase42HasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])) {
+          failures.push(
+            phase42Failure(
+              row,
+              artifacts?.laneIsolation?.violations?.length ? 'lane_isolation_violation' : 'lane_isolation_missing', // recorded violations take precedence over "missing"
+              'Lane isolation must be proven by explicit allowed/observed tool evidence.'
+            )
+          );
+        }
+        if (!phase42HasMeasuredSetupIndex(row, artifacts?.setupIndex)) {
+          failures.push(phase42Failure(row, 'setup_index_cost_missing', 'Setup/index statuses, durations, and log references are required.'));
+        }
+        if (!phase42HasRunnerProvenance(row, artifacts?.rawTrace, input.expectedRunnerHash)) { // per-row variant; reuses the session-level mismatch code
+          failures.push(phase42Failure(row, 'runner_provenance_mismatch', 'Raw trace executor/model metadata must match the manifest row.'));
+        }
+      }
+    }
+  }
+  const blockingFailures = failures.filter((failure) => failure.code !== 'artifact_verification_missing'); // the only failure that does not block shapePass
+  return {
+    shapePass: blockingFailures.length === 0,
+    claimPass: failures.length === 0, // a claim requires zero failures of any kind
+    diagnosticOnly: failures.length > 0,
+    failures
+  };
+}
+
+function countBy(values) { // Returns { [String(value)]: occurrences } for the given list.
+  const tally = new Map();
+  // Tally in a Map: a plain {} would read inherited members for values like "constructor" or "toString".
+  for (const value of values) tally.set(String(value), (tally.get(String(value)) ?? 0) + 1);
+  return Object.fromEntries(tally);
+}
+
+function phase42LoaderFailures(loaded, sessionRoot) { // Turns session/artifact loading problems into gate failure records; returns the list.
+  const failures = [];
+  const indexedPaths = new Set((loaded.session?.artifactIndex ?? []).map((artifact) => artifact.path)); // paths listed in the session's artifact index
+  const registeredArmIds = existsSync(FIXTURES.codebaseContextBaselineArms) // empty set when the arms fixture file is absent
+    ? new Set((readJson(FIXTURES.codebaseContextBaselineArms).arms ?? []).map((arm) => arm.baselineArmId))
+    : new Set();
+  if (!loaded.session) {
+    failures.push({ code: 'session_missing', message: 'BASELINE-SESSION.json is required.' });
+  } else {
+    const expectedSessionHash = computeSessionHash(loaded.session);
+    if (loaded.session.sessionHash !== expectedSessionHash) {
+      failures.push({ code: 'session_hash_mismatch', message: 'Session hash does not match BASELINE-SESSION.json content.' });
+    }
+    if (loaded.session.sealed !== true) {
+      failures.push({ code: 'session_not_sealed', message: 'Claim-bearing Phase 42 verification requires a sealed session.' });
+    }
+    for (const artifact of loaded.session.artifactIndex ?? []) { // every indexed artifact must exist and still hash to its recorded value
+      const artifactPath = join(sessionRoot, artifact.path);
+      if (!existsSync(artifactPath)) {
+        failures.push({ code: 'session_artifact_missing', path: artifact.path, message: 'Indexed session artifact is missing.' });
+      } else {
+        const actualHash = hashFile(artifactPath);
+        if (actualHash !== artifact.hash) {
+          failures.push({
+            code: 'session_artifact_hash_mismatch',
+            path: artifact.path,
+            message: 'Indexed session artifact hash does not match current file content.'
+          });
+        }
+      }
+    }
+  }
+  for (const error of loaded.readErrors) { // errors collected by the loader are re-emitted as failures
+    failures.push({
+      code: 'artifact_read_error',
+      runId: error.runId,
+      path: error.path,
+      message: `Required artifact could not be read: ${error.reason}`
+    });
+  }
+  for (const error of loaded.integrityErrors) {
+    failures.push({
+      code: 'artifact_hash_mismatch',
+      runId: error.runId,
+      path: error.path,
+      message: `Manifest artifact hash mismatch: ${error.reason}`
+    });
+  }
+  for (const row of [...loaded.requiredRows, ...loaded.supplementalRows, ...loaded.unexpectedRows]) { // path rules apply to every row category
+    for (const key of [
+      'raw_trace_path',
+      'structured_answer_path',
+      'trajectory_path',
+      'score_path',
+      'setup_index_path',
+      'prompt_path',
+      'lane_tool_card_path'
+    ]) {
+      const value = row[key];
+      const relativePath = value && isAbsolute(value) ? normalizePath(relative(sessionRoot, value)) : null; // only computed for absolute paths
+      if (!value || !isAbsolute(value)) { // at most one failure per path key: first failing rule wins
+        failures.push(phase42Failure(row, 'artifact_path_invalid', `${key} must be absolute.`));
+      } else if (!isPathInside(sessionRoot, value)) {
+        failures.push(phase42Failure(row, 'artifact_path_outside_session', `${key} must stay inside the session root.`));
+      } else if (!relativePath || !indexedPaths.has(relativePath)) {
+        failures.push(phase42Failure(row, 'artifact_not_indexed', `${key} must be present in the sealed session artifact index.`));
+      }
+    }
+  }
+  for (const row of loaded.unexpectedRows) { // unexpected rows always fail, regardless of their content
+    failures.push(
+      phase42Failure(
+        row,
+        'unexpected_run_row',
+        'Rows outside the required denominator must be explicit registered diagnostic arms.'
+      )
+    );
+  }
+  for (const row of loaded.supplementalRows) {
+    const baselineArmId = row.scoring?.baselineArmId;
+    if ( // a supplemental row is valid only if ALL of the following hold
+      row.lane_id !== 'codebase-context' ||
+      row.scoring?.claimBearing !== false ||
+      typeof baselineArmId !== 'string' ||
+      !registeredArmIds.has(baselineArmId) ||
+      !row.run_id.startsWith(`${baselineArmId}-`) // run ids must be prefixed with their arm id
+    ) {
+      failures.push(
+        phase42Failure(
+          row,
+          'invalid_supplemental_row',
+          'Supplemental diagnostic rows must be non-claim-bearing registered codebase-context arms.'
+        )
+      );
+    }
+  }
+  return failures;
+}
+
+function verifyPhase42Session(args) { // Runs the evidence gate and loader checks for a session, emits a report, and throws if any failure exists.
+  if (!args.session) throw new Error('--phase42-verify requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  const fixtures = validateFixtures();
+  const loaded = loadPhase42SessionEvidence(sessionRoot, fixtures);
+  const gate = evaluatePhase42EvidenceGate(loaded.gateInput);
+  const failures = [...gate.failures, ...phase42LoaderFailures(loaded, sessionRoot)]; // gate failures first, then loader failures
+  const claimPass = failures.length === 0; // recomputed over the combined list rather than taken from gate.claimPass
+  const shapePass = failures.filter((failure) => failure.code !== 'artifact_verification_missing').length === 0;
+  const failureCounts = countBy(failures.map((failure) => failure.code));
+  const report = {
+    generatedAt: new Date().toISOString(),
+    sessionRoot: normalizePath(sessionRoot),
+    claimBearing: claimPass,
+    claimPass,
+    shapePass,
+    diagnosticOnly: !claimPass,
+    protocolClaimAllowed: fixtures.protocol.claimAllowed,
+    expectedTotalRows: loaded.gateInput.expectedTotalRows,
+    requiredRows: loaded.requiredRows.length,
+    supplementalRows: loaded.supplementalRows.length,
+    unexpectedRows: loaded.unexpectedRows.length,
+    reservations: loaded.reservations.length,
+    sessionSealed: loaded.session?.sealed ?? false,
+    rowStatusCounts: countBy(loaded.requiredRows.map((row) => row.status)), // status tallies cover required rows only
+    laneStatusCounts: countBy(loaded.requiredRows.map((row) => `${row.lane_id}:${row.status}`)),
+    failureCounts,
+    readErrors: loaded.readErrors,
+    integrityErrors: loaded.integrityErrors,
+    runnerHashes: {
+      expected: loaded.gateInput.expectedRunnerHash ?? null,
+      current: loaded.gateInput.currentRunnerHash ?? null
+    },
+    fixtureHashes: {
+      protocol: loaded.gateInput.expectedProtocolHash,
+      taskManifest: loaded.gateInput.expectedTaskManifestHash
+    },
+    safeClaims: claimPass
+      ? ['Phase 42 evidence gate passed for this sealed artifact set']
+      : [
+          'harness repair in progress',
+          'diagnostic artifact',
+          'non-claim-bearing provenance evidence',
+          'blocked pending verifier/challenger'
+        ],
+    blockedClaims: [ // the claims after the first entry stay blocked even when the gate passes
+      ...(claimPass ? [] : ['Phase 42 passed']),
+      'benchmark win',
+      'competitor loss',
+      'agent-outcome improvement',
+      'product change authorized by evidence',
+      'setup_failed is a loss'
+    ],
+    failures
+  };
+  if (args.out) writeJson(resolve(args.out), report); // the report is written before any failure is thrown
+  if (args.quiet) { // quiet mode prints a one-line summary instead of the full JSON report
+    console.log(
+      `phase42 verification ${claimPass ? 'passed' : 'failed'}: requiredRows=${report.requiredRows}/${report.expectedTotalRows}, supplementalRows=${report.supplementalRows}`
+    );
+  } else {
+    console.log(JSON.stringify(report, null, 2));
+  }
+  if (!claimPass) {
+    throw new Error( // message lists each failure code with its count
+      `phase42 verification failed: ${Object.entries(failureCounts)
+        .map(([code, count]) => `${code}=${count}`)
+        .join(', ')}`
+    );
+  }
+}
+
+function validateBaselineSession(args) { // Validates a baseline session directory; throws one Error listing every problem found.
+  if (!args.session) throw new Error('--baseline-validate requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  const fixtures = validateFixtures();
+  const sessionPath = join(sessionRoot, 'BASELINE-SESSION.json');
+  const reservationPath = join(sessionRoot, 'slot-reservations.json');
+  const errors = []; // problems are accumulated and reported together at the end
+  if (!existsSync(sessionPath)) errors.push('BASELINE-SESSION.json missing');
+  if (!existsSync(reservationPath)) errors.push('slot-reservations.json missing');
+  if (errors.length === 0) { // session content checks run only when both files exist
+    const session = readJson(sessionPath);
+    const expectedHash = computeSessionHash(session);
+    if (session.sessionHash !== expectedHash) errors.push('session hash mismatch');
+    if (session.claimBearing !== false) errors.push('session must be non-claim-bearing');
+    if (!session.snapshot?.snapshotHash) errors.push('snapshot hash missing');
+    if ( // leak check: does the session JSON contain the current value of any env var it lists as redacted?
+      session.snapshot?.redactedEnvVarNames?.some(
+        (name) =>
+          String(process.env[name] ?? '').length > 0 && // unset or empty variables are ignored
+          JSON.stringify(session).includes(String(process.env[name]))
+      )
+    ) {
+      errors.push('session appears to include an environment secret value');
+    }
+    for (const artifact of session.artifactIndex ?? []) { // indexed artifacts must exist and match their recorded hash
+      const artifactPath = join(sessionRoot, artifact.path);
+      if (!existsSync(artifactPath)) errors.push(`indexed artifact missing: ${artifact.path}`);
+      else if (hashFile(artifactPath) !== artifact.hash)
+        errors.push(`indexed artifact hash mismatch: ${artifact.path}`);
+    }
+  }
+  const reservations = existsSync(reservationPath)
+    ? (readJson(reservationPath).reservations ?? [])
+    : [];
+  const expectedSlots = // tasks x broad-claim lanes x runs per task/lane
+    fixtures.manifest.tasks.length *
+    fixtures.lanes.broadClaimLaneSet.length *
+    (fixtures.protocol.runPolicy?.claimBearingRunsPerTaskLane ?? 3); // defaults to 3 when the protocol omits the setting
+  if (reservations.length !== expectedSlots)
+    errors.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`);
+  const rows = readManifestRowsIfPresent(sessionRoot);
+  validateSessionPaths(sessionRoot, rows, errors); // presumably appends path problems to `errors` — helper is outside this chunk
+  const blockedReservations = reservations.filter(
+    (slot) => slot.status === 'terminal_missing_evidence'
+  );
+  const blockedRows = rows.filter( // only setup_failed rows of these two lanes count as blocked rows
+    (row) =>
+      row.status === 'setup_failed' && ['grepai', 'codebase-memory-mcp'].includes(row.lane_id)
+  );
+  if (blockedRows.length !== blockedReservations.length) { // compares counts only, not which slots they are
+    errors.push('terminal missing-evidence rows must be present for every blocked reservation');
+  }
+  if (errors.length > 0)
+    throw new Error(`baseline session validation failed:\n- ${errors.join('\n- ')}`);
+  console.log('baseline session validation passed');
+}
+
+function sealBaselineSession(args) { // Seals a baseline session once every reserved slot has a manifest row; throws otherwise.
+  if (!args.session) throw new Error('--baseline-seal requires --session <session-root>');
+  const sessionRoot = ensureBaselineSessionRoot(args.session);
+  const session = readSession(sessionRoot);
+  const reservations = readJson(join(sessionRoot, 'slot-reservations.json')).reservations ?? [];
+  const rows = readManifestRowsIfPresent(sessionRoot);
+  // NUL-separated keys (as in evaluatePhase42EvidenceGate) stay unambiguous even when an id contains ':'.
+  const slotKey = (laneId, taskId, repeatIndex) => `${laneId}\u0000${taskId}\u0000${repeatIndex}`;
+  const rowKeys = new Set(rows.map((row) => slotKey(row.lane_id, row.task_id, row.repeat_index)));
+  const missing = reservations.filter((slot) => !rowKeys.has(slotKey(slot.laneId, slot.taskId, slot.repeatIndex)));
+  if (missing.length > 0) throw new Error(`cannot seal baseline session; ${missing.length} slots lack terminal evidence`);
+  session.sealed = true; // persisted before the checks below; validateBaselineSession reads the session file back from disk
+  session.artifactIndex = refreshArtifactIndex(sessionRoot);
+  writeSession(sessionRoot, session);
+  validateBaselineSession({ session: sessionRoot });
+  try {
+    verifyPhase42Session({ session: sessionRoot, quiet: true });
+  } catch (error) {
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`baseline seal blocked by Phase 42 evidence gate: ${reason}`, { cause: error }); // keep the original error attached
+  }
+  console.log(`baseline session sealed ${join(sessionRoot, 'BASELINE-SESSION.json')}`);
+}
+
+function refreshBaselineSession(args) {
+  // Re-opens a session: clears its sealed flag, recomputes the artifact index, and writes the session back.
+  if (!args.session) throw new Error('--baseline-refresh requires --session <session-root>');
+  const root = ensureBaselineSessionRoot(args.session);
+  const session = readSession(root);
+  Object.assign(session, { sealed: false, artifactIndex: refreshArtifactIndex(root) });
+  writeSession(root, session);
+  console.log(`baseline session refreshed ${join(root, 'BASELINE-SESSION.json')}`);
+}
+
+function runDryRun(args) { // Writes one fake-executor smoke run (all per-run artifacts) and appends its row to run-manifest.jsonl.
+  if (args.executor !== 'fake') throw new Error('--dry-run currently requires --executor fake');
+  if (!args.lane || !args.taskId || !args.out)
+    throw new Error('--dry-run requires --lane, --task-id, and --out');
+  const fixtures = validateFixtures();
+  const laneCard = fixtures.laneToolCards.cards.find((card) => card.laneId === args.lane);
+  if (!laneCard) throw new Error(`unknown lane: ${args.lane}`);
+  if (!laneCard.executableInPhase38)
+    throw new Error(`lane ${args.lane} is pending Phase 39 and is not executable in Phase 38`);
+  const task = fixtures.manifest.tasks.find((candidate) => candidate.instance_id === args.taskId);
+  if (!task) throw new Error(`unknown task-id: ${args.taskId}`);
+
+  const outDir = resolve(args.out);
+  const repeat = Number.isInteger(args.repeat) && args.repeat > 0 ? args.repeat : 1; // anything but a positive integer falls back to 1
+  const runId = sanitize(`${laneCard.laneId}-${task.instance_id}-${repeat}-fake`);
+  const runDir = join(outDir, 'runs', runId);
+  const paths = { // per-run artifacts live in runDir; the manifest is shared at the top of outDir
+    prompt: join(runDir, 'prompt.txt'),
+    laneCard: join(runDir, 'lane-card.json'),
+    setupIndex: join(runDir, 'setup-index.json'),
+    rawTrace: join(runDir, 'raw-trace.json'),
+    structuredAnswer: join(runDir, 'structured-answer.json'),
+    trajectory: join(runDir, 'trajectory.json'),
+    score: join(runDir, 'score.json'),
+    manifest: join(outDir, 'run-manifest.jsonl')
+  };
+  const startedAt = new Date().toISOString();
+  const prompt = [
+    `Task: ${task.instance_id}`,
+    `Lane: ${laneCard.laneId}`,
+    'Return only structured JSON with answer, confidence, evidence, filesReferenced, symbolsReferenced, unsupportedClaims, readyToEdit.',
+    'Do not use tools outside the lane tool card.'
+  ].join('\n');
+  const answer = { // canned answer: no model or tool is invoked in a dry run
+    answer: { smoke: true, taskId: task.instance_id },
+    confidence: 'medium',
+    evidence: [
+      {
+        file: 'SMOKE_ONLY.md',
+        lineRange: { start: 1, end: 1 },
+        reason: 'fake executor non-claim-bearing smoke evidence'
+      }
+    ],
+    filesReferenced: ['SMOKE_ONLY.md'],
+    symbolsReferenced: [],
+    unsupportedClaims: [],
+    readyToEdit: false
+  };
+  const trajectory = buildTrajectory(task, answer);
+  const rawTrace = {
+    executor: 'fake',
+    runnerHash: runnerSourceHash(),
+    claimBearing: false,
+    stdout: JSON.stringify(answer),
+    stderr: '',
+    toolCalls: [],
+    laneIsolation: buildLaneIsolationEvidence(laneCard),
+    scriptedAgentDecisions: false
+  };
+  const score = {
+    status: 'completed',
+    mode: 'phase38_smoke_no_official_claim',
+    ...diagnosticFallbackScoring(fixtures, 'dry_run_fake_executor_smoke_only')
+  };
+  const setupIndex = {
+    setupCommand: laneCard.setupCommand,
+    indexCommand: laneCard.indexCommand,
+    setupDurationMs: 0, // durations are hard-coded: the commands are not executed here
+    indexDurationMs: 0,
+    setupStatus: laneCard.setupCommand === 'none' ? 'not_required' : 'completed',
+    indexStatus: laneCard.indexCommand === 'none' ? 'not_required' : 'completed'
+  };
+  mkdirSync(runDir, { recursive: true });
+  writeFileSync(paths.prompt, prompt, 'utf8');
+  writeJson(paths.laneCard, laneCard);
+  writeJson(paths.setupIndex, setupIndex);
+  writeJson(paths.rawTrace, rawTrace);
+  writeJson(paths.structuredAnswer, answer);
+  writeJson(paths.trajectory, trajectory);
+  writeJson(paths.score, score);
+  const completedAt = new Date().toISOString(); // taken after all artifacts are written
+  const row = {
+    run_id: runId,
+    protocol_version: fixtures.protocol.protocolVersion,
+    protocol_hash: hashObject(fixtures.protocol),
+    task_manifest_hash: fixtures.manifest.manifest_hash,
+    lane_id: laneCard.laneId,
+    task_id: task.instance_id,
+    repeat_index: repeat,
+    status: 'completed',
+    started_at: startedAt,
+    completed_at: completedAt,
+    raw_trace_path: paths.rawTrace,
+    structured_answer_path: paths.structuredAnswer,
+    trajectory_path: paths.trajectory,
+    score_path: paths.score,
+    setup_index_path: paths.setupIndex,
+    prompt_path: paths.prompt,
+    lane_tool_card_path: paths.laneCard,
+    setupIndex,
+    taskExecution: {
+      model: 'fake-executor',
+      timeoutSeconds: fixtures.protocol.budgets.defaults.timeoutSeconds,
+      maxContextTokens: fixtures.protocol.budgets.defaults.maxContextTokens,
+      maxAnswerTokens: fixtures.protocol.budgets.defaults.maxAnswerTokens,
+      startedAt,
+      completedAt,
+      taskWallTimeMs: new Date(completedAt).getTime() - new Date(startedAt).getTime(),
+      executor: 'fake'
+    },
+    scoring: diagnosticFallbackScoring(fixtures, 'dry_run_fake_executor_smoke_only'),
+    hashes: {
+      prompt: sha256(prompt),
+      laneToolCard: hashObject(laneCard),
+      structuredAnswer: hashObject(answer),
+      trajectory: hashObject(trajectory),
+      score: hashObject(score),
+      runnerSourceHash: runnerSourceHash()
+    }
+  };
+  mkdirSync(dirname(paths.manifest), { recursive: true });
+  appendFileSync(paths.manifest, `${JSON.stringify(row)}\n`, 'utf8'); // JSONL: one row appended per run
+  console.log(`dry-run wrote ${runDir}`);
+}
+
+function runScoreProbe(args) {
+  // Writes synthetic gold/prediction files plus a judge_failed diagnostic-fallback score into --out.
+  if (!args.out) throw new Error('--score-probe requires --out <dir>');
+  const fixtures = validateFixtures();
+  const outDir = resolve(args.out);
+  // Create the output directory up front, as runDryRun does, so the probe also works on a fresh --out path.
+  mkdirSync(outDir, { recursive: true });
+  const goldPath = join(outDir, 'synthetic-gold.json');
+  const predPath = join(outDir, 'synthetic-prediction.json');
+  const scorePath = join(outDir, 'score.json');
+  writeJson(goldPath, { synthetic: true, claimBearing: false });
+  writeJson(predPath, { synthetic: true, claimBearing: false });
+  const score = {
+    status: 'judge_failed',
+    mode: 'diagnostic_fallback',
+    stdout: '',
+    stderr: 'mock official evaluator unavailable in Phase 38 score probe',
+    exitStatus: 1,
+    ...diagnosticFallbackScoring(fixtures, 'mocked_official_evaluator_failure_for_non_claim_probe')
+  };
+  writeJson(scorePath, score);
+  console.log(`score-probe wrote ${scorePath}`);
+}
+
+async function main() { // CLI entry point: parses argv and runs the first mode whose flag is set, in the order checked below.
+  const args = parseArgs(process.argv.slice(2));
+  if (args.help || process.argv.length <= 2) { // running with no CLI arguments also prints help
+    help();
+    return;
+  }
+  if (args.validateFixtures) {
+    validateFixtures();
+    console.log('fixture validation passed');
+    return;
+  }
+  if (args.validateLaneSetup) {
+    validateLaneSetupEvidence();
+    console.log('lane setup validation passed');
+    return;
+  }
+  if (args.baselineSnapshot) {
+    createBaselineSnapshot(args);
+    return;
+  }
+  if (args.setupIndexMeasure) {
+    runSetupIndexMeasure(args);
+    return;
+  }
+  if (args.setupIndexImport) {
+    runSetupIndexImport(args);
+    return;
+  }
+  if (args.baselineRun) {
+    runBaseline(args); // NOTE(review): mode handler results are not awaited — confirm none of the handlers is async
+    return;
+  }
+  if (args.baselineRefresh) {
+    refreshBaselineSession(args);
+    return;
+  }
+  if (args.baselineRunCodebaseContextArms) {
+    runBaselineCodebaseContextArms(args);
+    return;
+  }
+  if (args.baselineSeal) {
+    sealBaselineSession(args);
+    return;
+  }
+  if (args.baselineValidate) {
+    validateBaselineSession(args);
+    return;
+  }
+  if (args.phase42Verify) {
+    verifyPhase42Session(args);
+    return;
+  }
+  if (args.baselineValidateArms) {
+    validateBaselineArms(args.baselineValidateArms); // this handler receives the flag's value, not the whole args object
+    return;
+  }
+  if (args.printClaudeArgs) {
+    console.log(JSON.stringify(claudeArgsForModel(args.model ?? ''), null, 2));
+    return;
+  }
+  if (args.printAnswerSchema) {
+    console.log(JSON.stringify(CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA, null, 2));
+    return;
+  }
+  if (args.dryRun) {
+    runDryRun(args);
+    return;
+  }
+  if (args.scoreProbe) {
+    runScoreProbe(args);
+    return;
+  }
+  throw new Error('No mode selected. Use --help.'); // arguments were given but none selected a mode
+}
+
+main().catch((error) => { // any failure from main: print only its message to stderr and mark the process as failed
+  console.error(error instanceof Error ? error.message : String(error));
+  process.exitCode = 1;
+});
diff --git a/src/eval/contextbench-answer.ts b/src/eval/contextbench-answer.ts
new file mode 100644
index 0000000..6c9b55d
--- /dev/null
+++ b/src/eval/contextbench-answer.ts
@@ -0,0 +1,229 @@
+import type {
+  ContextBenchEvidenceReference,
+  ContextBenchStructuredAnswer,
+  JsonSchemaDefinition,
+  JsonValue
+} from './contextbench-types.js';
+
+export interface StructuredAnswerParseResult { // outcome of parsing and validating a raw structured answer
+  status: 'valid' | 'invalid_schema';
+  answer: ContextBenchStructuredAnswer | null; // null whenever status is 'invalid_schema'
+  errors: string[]; // machine-readable error codes; empty when status is 'valid'
+}
+
+export interface SchemaBoundDiagnostics { // findings that classifyStructuredAnswer converts into reason codes
+  missingRequiredFacts?: string[];
+  contradictoryFacts?: string[]; // not populated by evaluateSchemaBoundDiagnostics in this file
+  missingEvidenceFiles?: string[];
+  unsupportedEvidenceFiles?: string[]; // not populated by evaluateSchemaBoundDiagnostics in this file
+}
+
+export interface AnswerClassification { // result of classifyStructuredAnswer
+  unsupportedClaim: boolean; // true when any support-related reason was recorded
+  falseReady: boolean; // readyToEdit set despite an unsupported claim, low confidence, or missing/malformed evidence
+  reasons: string[]; // de-duplicated reason codes
+}
+
+const confidenceValues = new Set(['low', 'medium', 'high']); // accepted `confidence` strings
+
+const evidenceReferenceFields = new Set(['file', 'lineRange', 'reason']); // the only keys allowed on an evidence entry
+const lineRangeFields = new Set(['start', 'end']); // the only keys allowed on a lineRange object
+
+export const CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS = [ // all required; the validator also rejects root keys not listed here
+  'answer',
+  'confidence',
+  'evidence',
+  'filesReferenced',
+  'symbolsReferenced',
+  'unsupportedClaims',
+  'readyToEdit'
+] as const;
+
+export const CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA = { // JSON Schema description of the structured answer object
+  type: 'object',
+  additionalProperties: false,
+  required: [...CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS],
+  properties: {
+    answer: { type: ['object', 'array', 'string', 'number', 'boolean', 'null'] }, // any JSON value
+    confidence: { type: 'string', enum: ['low', 'medium', 'high'] },
+    evidence: {
+      type: 'array',
+      items: {
+        type: 'object',
+        additionalProperties: false,
+        required: ['file', 'lineRange', 'reason'],
+        properties: {
+          file: { type: 'string', minLength: 1 }, // isValidEvidenceReference is stricter: whitespace-only strings are rejected
+          lineRange: {
+            type: 'object',
+            additionalProperties: false,
+            required: ['start', 'end'],
+            properties: {
+              start: { type: 'integer', minimum: 1 },
+              end: { type: 'integer', minimum: 1 } // schema only enforces end >= 1; isValidEvidenceReference also requires end >= start
+            }
+          },
+          reason: { type: 'string', minLength: 1 }
+        }
+      }
+    },
+    filesReferenced: { type: 'array', items: { type: 'string' } },
+    symbolsReferenced: { type: 'array', items: { type: 'string' } },
+    unsupportedClaims: { type: 'array', items: { type: 'string' } },
+    readyToEdit: { type: 'boolean' }
+  }
+} satisfies JsonSchemaDefinition;
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value); // any non-null, non-array object
+}
+
+function isStringArray(value: unknown): value is string[] {
+  return Array.isArray(value) && !value.some((entry) => typeof entry !== 'string'); // an empty array qualifies
+}
+
+function findAdditionalFields(
+  value: Record<string, unknown>,
+  allowedFields: ReadonlySet<string>,
+  prefix: string
+): string[] {
+  // Emits one `additional_<prefix>_<field>` code per key of `value` that allowedFields does not contain.
+  const unexpected = Object.keys(value).filter((field) => !allowedFields.has(field));
+  return unexpected.map((field) => `additional_${prefix}_${field}`);
+}
+
+function isJsonValue(value: unknown): value is JsonValue {
+  if (value === null || typeof value === 'string' || typeof value === 'boolean') return true;
+  // NaN and +/-Infinity have no JSON representation (JSON.stringify emits null for them), so reject them.
+  if (typeof value === 'number') return Number.isFinite(value);
+  if (Array.isArray(value)) return value.every(isJsonValue);
+  if (!isRecord(value)) return false; // undefined, functions, symbols, bigint
+  return Object.values(value).every(isJsonValue);
+}
+
+export function isValidEvidenceReference(value: unknown): value is ContextBenchEvidenceReference {
+  // Accepts exactly { file, lineRange: { start, end }, reason }: non-blank strings, integer lines,
+  // start >= 1 and end >= start. Any extra key on either object rejects the reference.
+  if (!isRecord(value)) return false;
+  const strayFields = findAdditionalFields(value, evidenceReferenceFields, 'evidence_field');
+  if (strayFields.length > 0) return false;
+
+  const { file, reason, lineRange } = value;
+  if (!isRecord(lineRange)) return false;
+  const strayRangeFields = findAdditionalFields(lineRange, lineRangeFields, 'line_range_field');
+  if (strayRangeFields.length > 0) return false;
+
+  if (typeof file !== 'string' || file.trim().length === 0) return false;
+  if (typeof reason !== 'string' || reason.trim().length === 0) return false;
+
+  const { start, end } = lineRange;
+  if (typeof start !== 'number' || typeof end !== 'number') return false;
+  if (!Number.isInteger(start) || !Number.isInteger(end)) return false;
+
+  // Line numbers are positive and the range may not run backwards.
+  return start > 0 && end >= start;
+}
+
+function validateStructuredAnswer(value: unknown): StructuredAnswerParseResult { // validates a parsed value; collects all error codes rather than stopping at the first
+  const errors: string[] = [];
+  if (!isRecord(value)) { // a non-object root is the only early exit
+    return { status: 'invalid_schema', answer: null, errors: ['answer_root_not_object'] };
+  }
+
+  for (const field of CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS) {
+    if (!(field in value)) errors.push(`missing_${field}`);
+  }
+  errors.push( // root keys beyond the required list are errors too
+    ...findAdditionalFields(value, new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS), 'root_field')
+  );
+
+  if (!isJsonValue(value.answer)) errors.push('answer_not_json_value'); // a missing field also fails its type check below
+  if (typeof value.confidence !== 'string' || !confidenceValues.has(value.confidence))
+    errors.push('invalid_confidence');
+  if (!Array.isArray(value.evidence)) errors.push('evidence_not_array');
+  if (!isStringArray(value.filesReferenced)) errors.push('files_referenced_not_string_array');
+  if (!isStringArray(value.symbolsReferenced)) errors.push('symbols_referenced_not_string_array');
+  if (!isStringArray(value.unsupportedClaims)) errors.push('unsupported_claims_not_string_array');
+  if (typeof value.readyToEdit !== 'boolean') errors.push('ready_to_edit_not_boolean');
+
+  const evidence = Array.isArray(value.evidence) ? value.evidence : []; // treated as empty when not an array
+  for (const entry of evidence) { // name each unexpected key so the error list says what was extra
+    if (!isRecord(entry)) continue;
+    errors.push(...findAdditionalFields(entry, evidenceReferenceFields, 'evidence_field'));
+    if (isRecord(entry.lineRange)) {
+      errors.push(...findAdditionalFields(entry.lineRange, lineRangeFields, 'line_range_field'));
+    }
+  }
+  const malformedEvidence = evidence.some((entry) => !isValidEvidenceReference(entry));
+  if (malformedEvidence) errors.push('malformed_evidence_reference'); // reported once, however many entries are malformed
+
+  if (errors.length > 0) return { status: 'invalid_schema', answer: null, errors };
+
+  return {
+    status: 'valid',
+    answer: { // casts are backed by the checks above; the result holds only the seven known fields
+      answer: value.answer as JsonValue,
+      confidence: value.confidence as ContextBenchStructuredAnswer['confidence'],
+      evidence: evidence as ContextBenchEvidenceReference[],
+      filesReferenced: value.filesReferenced as string[],
+      symbolsReferenced: value.symbolsReferenced as string[],
+      unsupportedClaims: value.unsupportedClaims as string[],
+      readyToEdit: value.readyToEdit as boolean
+    },
+    errors: []
+  };
+}
+
+export function parseStructuredAnswer(raw: string): StructuredAnswerParseResult {
+  const rejected = (code: string): StructuredAnswerParseResult => ({ status: 'invalid_schema', answer: null, errors: [code] });
+  const text = raw.trim();
+  if (text.length === 0) return rejected('missing_json'); // blank output: nothing to parse
+  try { // a throw from either JSON.parse or validation is reported as invalid_json
+    return validateStructuredAnswer(JSON.parse(text) as unknown);
+  } catch {
+    return rejected('invalid_json');
+  }
+}
+
+export function classifyStructuredAnswer(
+  answer: ContextBenchStructuredAnswer,
+  diagnostics: SchemaBoundDiagnostics = {}
+): AnswerClassification {
+  const malformedEvidence = answer.evidence.some((entry) => !isValidEvidenceReference(entry));
+  // Support problems (self-reported or taken from diagnostics) mark the answer as an unsupported claim.
+  const nonEmpty = (list?: string[]): boolean => (list?.length ?? 0) > 0;
+  const supportChecks: Array<[boolean, string]> = [
+    [answer.unsupportedClaims.length > 0, 'model_reported_unsupported_claims'],
+    [nonEmpty(diagnostics.unsupportedEvidenceFiles), 'unsupported_evidence_files'],
+    [nonEmpty(diagnostics.missingRequiredFacts), 'missing_required_facts'],
+    [nonEmpty(diagnostics.contradictoryFacts), 'contradictory_facts'],
+    [nonEmpty(diagnostics.missingEvidenceFiles), 'missing_evidence_files']
+  ];
+  const supportReasons = supportChecks.filter(([hit]) => hit).map(([, reason]) => reason);
+  const unsupportedClaim = supportReasons.length > 0;
+  // Readiness problems are only recorded when the answer claims it is ready to edit.
+  const readinessChecks: Array<[boolean, string]> = [
+    [answer.confidence === 'low', 'ready_with_low_confidence'],
+    [answer.evidence.length === 0, 'ready_without_evidence'],
+    [malformedEvidence, 'ready_with_malformed_evidence']
+  ];
+  const readinessReasons = answer.readyToEdit ? readinessChecks.filter(([hit]) => hit).map(([, reason]) => reason) : [];
+  const falseReady = answer.readyToEdit && (unsupportedClaim || readinessReasons.length > 0);
+  return { unsupportedClaim, falseReady, reasons: [...new Set([...supportReasons, ...readinessReasons])] };
+}
+
+export function evaluateSchemaBoundDiagnostics(
+  answer: ContextBenchStructuredAnswer,
+  expected: { requiredFacts?: string[]; requiredEvidenceFiles?: string[] }
+): SchemaBoundDiagnostics {
+  // Facts are matched as case-insensitive substrings of the JSON-serialized answer payload;
+  // evidence files must be cited with an exactly matching `file` string.
+  const haystack = JSON.stringify(answer.answer).toLowerCase();
+  const citedFiles = new Set<string>();
+  for (const { file } of answer.evidence) citedFiles.add(file);
+  const requiredFacts = expected.requiredFacts ?? [];
+  const requiredFiles = expected.requiredEvidenceFiles ?? [];
+  const missingRequiredFacts = requiredFacts.filter((fact) => !haystack.includes(fact.toLowerCase()));
+  const missingEvidenceFiles = requiredFiles.filter((file) => !citedFiles.has(file));
+  return { missingRequiredFacts, missingEvidenceFiles };
+}
diff --git a/src/eval/contextbench-artifacts.ts b/src/eval/contextbench-artifacts.ts
new file mode 100644
index 0000000..e888b34
--- /dev/null
+++ b/src/eval/contextbench-artifacts.ts
@@ -0,0 +1,184 @@
+import { createHash } from 'node:crypto';
+import { appendFileSync, mkdirSync, readFileSync, statSync, writeFileSync } from 'node:fs';
+import path from 'node:path';
+import type {
+  ContextBenchArtifactIndexEntry,
+  ContextBenchExecutor,
+  ContextBenchLaneSetupEvidenceRecord,
+  ContextBenchLaneToolCard,
+  ContextBenchRunManifestRow,
+  ContextBenchTerminalStatus,
+  ContextBenchTaskIdentity
+} from './contextbench-types.js';
+
+export interface ArtifactPathSet { // Artifact locations for one run; createArtifactPathSet shows how each is derived.
+  runDir: string; // directory holding the per-run artifact files
+  manifestPath: string; // run-manifest.jsonl, located directly in the output directory
+  promptPath: string;
+  laneToolCardPath: string;
+  setupIndexPath: string;
+  rawTracePath: string;
+  structuredAnswerPath: string;
+  trajectoryPath: string;
+  scorePath: string;
+}
+
+/** Canonical JSON text with sorted keys. Mirrors JSON.stringify for unserializable values: dropped in objects, null in arrays. */
+export function stableStringify(value: unknown): string {
+  if (value === null || typeof value !== 'object') return JSON.stringify(value) ?? 'null';
+  if (Array.isArray(value)) return `[${Array.from(value, (item) => stableStringify(item)).join(',')}]`;
+  const entries = Object.entries(value as Record<string, unknown>).filter(([, item]) => !/^(undefined|function|symbol)$/.test(typeof item));
+  entries.sort(([a], [b]) => a.localeCompare(b)); // NOTE(review): default-locale collation; kept so existing hashes stay valid.
+  return `{${entries.map(([key, item]) => `${JSON.stringify(key)}:${stableStringify(item)}`).join(',')}}`;
+}
+
+export function sha256Text(value: string): string { // SHA-256 of the UTF-8 encoded text, rendered as "sha256:<hex>"
+  return ['sha256', createHash('sha256').update(value, 'utf8').digest('hex')].join(':');
+}
+
+export function sha256Buffer(value: Buffer): string { // SHA-256 of the raw bytes, rendered as "sha256:<hex>"
+  return ['sha256', createHash('sha256').update(value).digest('hex')].join(':');
+}
+
+export function sha256File(filePath: string): string { // Hashes the file's bytes; the whole file is read synchronously.
+  return sha256Buffer(readFileSync(filePath));
+}
+
+export function hashJson(value: unknown): string { // Hash of the value's stableStringify text, so key order does not matter.
+  return sha256Text(stableStringify(value));
+}
+
+/** Hashes the record with its evidenceHash field left out, i.e. the hash covers every other field. */
+export function hashSetupEvidenceRecord(record: ContextBenchLaneSetupEvidenceRecord): string {
+  const hashedFields = Object.fromEntries(
+    Object.entries(record).filter(([field]) => field !== 'evidenceHash')
+  );
+  return hashJson(hashedFields);
+}
+
+function sanitize(value: string): string { // runs outside [a-zA-Z0-9._-] become "-"; leading/trailing dashes are trimmed
+  return value.split(/[^a-zA-Z0-9._-]+/).join('-').replace(/^-+|-+$/g, '');
+}
+
+/**
+ * Builds the run id "<lane>-<task>-<repeat>-<executor>", sanitized and capped at 160 characters.
+ * NOTE(review): the cap cuts from the end, so very long lane/task ids can lose the repeat/executor suffix.
+ */
+export function buildRunId(params: { laneId: string; taskId: string; repeatIndex: number; executor: string }): string {
+  const { laneId, taskId, repeatIndex, executor } = params;
+  const joined = [laneId, taskId, repeatIndex, executor].map(String).join('-');
+  return sanitize(joined).slice(0, 160);
+}
+
+export function createArtifactPathSet(outDir: string, runId: string): ArtifactPathSet { // manifest in outDir, the rest under outDir/runs/<runId>
+  const runDir = path.join(outDir, 'runs', runId);
+  const inRun = (fileName: string): string => path.join(runDir, fileName);
+  return {
+    runDir, manifestPath: path.join(outDir, 'run-manifest.jsonl'),
+    promptPath: inRun('prompt.txt'),
+    laneToolCardPath: inRun('lane-card.json'),
+    setupIndexPath: inRun('setup-index.json'),
+    rawTracePath: inRun('raw-trace.json'),
+    structuredAnswerPath: inRun('structured-answer.json'),
+    trajectoryPath: inRun('trajectory.json'),
+    scorePath: inRun('score.json')
+  };
+}
+
+export function writeJsonArtifact(filePath: string, value: unknown): void { // Writes value as 2-space-indented JSON plus a trailing newline.
+  mkdirSync(path.dirname(filePath), { recursive: true }); // create missing parent directories first
+  writeFileSync(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8');
+}
+
+/** Describes one artifact: its path relative to rootDir (forward slashes), content hash and byte size. */
+export function artifactIndexEntry(
+  filePath: string,
+  rootDir: string
+): ContextBenchArtifactIndexEntry {
+  const { size: bytes } = statSync(filePath);
+  // Backslashes are rewritten to forward slashes in the recorded path.
+  const relativePath = path.relative(rootDir, filePath).split('\\').join('/');
+  const hash = sha256File(filePath);
+  return { path: relativePath, hash, bytes };
+}
+
+export function appendManifestRow(manifestPath: string, row: ContextBenchRunManifestRow): void { // Appends the row as one JSON line.
+  mkdirSync(path.dirname(manifestPath), { recursive: true }); // create missing parent directories first
+  appendFileSync(manifestPath, `${JSON.stringify(row)}\n`, 'utf8');
+}
+
+/** Parses a JSONL run manifest into rows; blank lines (including CRLF residue) are skipped instead of failing JSON.parse. */
+export function readManifestRows(manifestPath: string): ContextBenchRunManifestRow[] {
+  const lines = readFileSync(manifestPath, 'utf8').split(/\r?\n/);
+  return lines.filter((line) => line.trim()).map((line) => JSON.parse(line) as ContextBenchRunManifestRow);
+}
+
+export function buildManifestRow(params: { // Assembles one run-manifest row from run identity, timing, artifact paths and hashes.
+  runId: string;
+  protocolVersion: string;
+  protocolHash: string;
+  taskManifestHash: string;
+  laneCard: ContextBenchLaneToolCard;
+  task: ContextBenchTaskIdentity;
+  repeatIndex: number;
+  status: ContextBenchTerminalStatus;
+  startedAt: string;
+  completedAt: string;
+  paths: ArtifactPathSet;
+  hashes: Record<string, string>;
+  executor: ContextBenchExecutor;
+  model: string;
+  timeoutSeconds: number;
+  maxContextTokens: number;
+  maxAnswerTokens: number;
+}): ContextBenchRunManifestRow {
+  return {
+    run_id: params.runId,
+    protocol_version: params.protocolVersion,
+    protocol_hash: params.protocolHash,
+    task_manifest_hash: params.taskManifestHash,
+    lane_id: params.laneCard.laneId,
+    task_id: params.task.instance_id, // the task's instance_id doubles as the manifest task id
+    repeat_index: params.repeatIndex,
+    status: params.status,
+    started_at: params.startedAt,
+    completed_at: params.completedAt,
+    raw_trace_path: params.paths.rawTracePath,
+    structured_answer_path: params.paths.structuredAnswerPath,
+    trajectory_path: params.paths.trajectoryPath,
+    score_path: params.paths.scorePath,
+    setup_index_path: params.paths.setupIndexPath,
+    prompt_path: params.paths.promptPath,
+    lane_tool_card_path: params.paths.laneToolCardPath,
+    setupIndex: {
+      setupCommand: params.laneCard.setupCommand,
+      indexCommand: params.laneCard.indexCommand,
+      setupDurationMs: 0, // hard-coded: no measured setup duration is accepted by this builder
+      indexDurationMs: 0, // hard-coded: no measured index duration is accepted by this builder
+      setupLogPath: params.paths.setupIndexPath, // both log references point at the setup-index artifact
+      indexLogPath: params.paths.setupIndexPath,
+      setupStatus: params.laneCard.setupCommand === 'none' ? 'not_required' : 'completed', // derived from the command text only
+      indexStatus: params.laneCard.indexCommand === 'none' ? 'not_required' : 'completed' // derived from the command text only
+    },
+    taskExecution: {
+      model: params.model,
+      timeoutSeconds: params.timeoutSeconds,
+      maxContextTokens: params.maxContextTokens,
+      maxAnswerTokens: params.maxAnswerTokens,
+      startedAt: params.startedAt,
+      completedAt: params.completedAt,
+      taskWallTimeMs: new Date(params.completedAt).getTime() - new Date(params.startedAt).getTime(), // NaN if either timestamp fails to parse
+      executor: params.executor
+    },
+    scoring: { // hard-coded: rows built here always record non-claim-bearing scoring with a fallback reason
+      officialEvaluatorFirst: false,
+      officialEvaluatorAttempted: false,
+      officialEvaluatorInvoked: false,
+      command:
+        'python -m contextbench.evaluate --gold <gold.parquet> --pred <trajectory.traj.json> --out <results.jsonl>',
+      claimBearing: false,
+      fallbackReason: 'phase38_smoke_non_claim_bearing'
+    },
+    hashes: params.hashes
+  };
+}
diff --git a/src/eval/contextbench-evidence-gate.ts b/src/eval/contextbench-evidence-gate.ts
new file mode 100644
index 0000000..815616e
--- /dev/null
+++ b/src/eval/contextbench-evidence-gate.ts
@@ -0,0 +1,422 @@
+import type { ContextBenchRunManifestRow } from './contextbench-types.js';
+
+export type ContextBenchEvidenceGateFailureCode = // Reason codes a ContextBenchEvidenceGateFailure can carry.
+  | 'summary_not_claim_pass'
+  | 'artifact_verification_missing'
+  | 'protocol_claims_disabled'
+  | 'denominator_contract_missing'
+  | 'denominator_count_mismatch'
+  | 'protocol_hash_mismatch'
+  | 'task_manifest_hash_mismatch'
+  | 'missing_required_run'
+  | 'duplicate_required_run'
+  | 'unexpected_run_row'
+  | 'non_completed_status'
+  | 'official_evaluator_missing'
+  | 'diagnostic_fallback_only'
+  | 'lane_isolation_missing'
+  | 'lane_isolation_violation'
+  | 'setup_index_cost_missing'
+  | 'runner_provenance_missing'
+  | 'runner_provenance_mismatch';
+
+export interface ContextBenchEvidenceGateFailure { // One reason the gate did not pass.
+  code: ContextBenchEvidenceGateFailureCode;
+  runId?: string; // set for failures tied to a concrete manifest row
+  laneId?: string; // set for row failures and for missing required lane/task/repeat slots
+  taskId?: string;
+  repeatIndex?: number;
+  message: string;
+}
+
+export interface ContextBenchEvidenceGateResult { // Outcome returned by evaluateContextBenchEvidenceGate.
+  shapePass: boolean; // no failures other than artifact_verification_missing
+  claimPass: boolean; // no failures at all
+  diagnosticOnly: boolean; // always the negation of claimPass
+  failures: ContextBenchEvidenceGateFailure[];
+}
+
+export type ContextBenchEvidenceMode = 'synthetic_shape' | 'artifact_verified'; // the gate adds artifact_verification_missing unless this is 'artifact_verified'
+
+export interface ContextBenchLaneEvidencePolicy { // Per-lane tool policy that isolation evidence is checked against.
+  laneId: string; // must equal the manifest row's lane_id
+  expectedContextTool: string; // must equal the evidence's expectedContextTool
+  allowedTools: string[]; // every tool the evidence declares as allowed must appear here
+  disallowedTools: string[]; // observing any of these fails the isolation check
+  allowMultipleObservedTools?: boolean; // when truthy, any non-empty set of allowed tools may be observed
+}
+
+export interface ContextBenchLaneIsolationEvidence { // Captured evidence about which tools a run could use and did use.
+  laneId: string;
+  proven: boolean; // must be true for the isolation check to pass
+  sourceKind?: 'not_captured' | 'env_override' | 'transcript' | 'proxy'; // missing, 'not_captured' and 'env_override' are rejected
+  expectedContextTool: string;
+  allowedTools: string[]; // must be non-empty
+  observedTools: string[]; // must be non-empty
+  violations?: string[]; // any entry fails the isolation check
+}
+
+export interface ContextBenchRawTraceEvidence { // Raw-trace metadata used by the runner provenance check.
+  executor?: string; // compared with the row's taskExecution.executor
+  model?: string; // compared with the row's taskExecution.model
+  runnerHash?: string; // compared with the expected runner hash
+}
+
+export interface ContextBenchScoreEvidence { // Score artifact fields inspected by the official-evaluator and fallback checks.
+  status?: string; // official proof requires 'completed'
+  mode?: string; // official proof requires 'official_evaluator'; 'diagnostic_fallback' marks a fallback
+  claimBearing?: boolean;
+  officialEvaluatorInvoked?: boolean;
+  command?: string; // official proof requires it to contain 'contextbench.evaluate'
+  exitCode?: number; // official proof requires 0
+  outputPath?: string;
+  outputHash?: string; // must equal the recorded artifact hash for outputPath
+  stdoutPath?: string;
+  stderrPath?: string;
+}
+
+export interface ContextBenchSetupIndexEvidence { // Setup/index measurements; each field must equal its counterpart on the row's setupIndex.
+  setupStatus?: string; // accepted values: 'completed' or 'not_required'
+  indexStatus?: string; // accepted values: 'completed' or 'not_required'
+  setupDurationMs?: number; // must be finite, and positive when setupStatus is 'completed'
+  indexDurationMs?: number; // must be finite, and positive when indexStatus is 'completed'
+  setupLogPath?: string;
+  indexLogPath?: string;
+}
+
+export interface ContextBenchRunEvidenceArtifacts { // Evidence for one run; the gate looks these up by run_id.
+  rawTrace?: ContextBenchRawTraceEvidence;
+  score?: ContextBenchScoreEvidence;
+  setupIndex?: ContextBenchSetupIndexEvidence;
+  laneIsolation?: ContextBenchLaneIsolationEvidence;
+}
+
+export interface ContextBenchEvidenceGateInput { // Everything evaluateContextBenchEvidenceGate needs to judge a set of manifest rows.
+  evidenceMode: ContextBenchEvidenceMode;
+  protocol: {
+    claimAllowed: boolean; // false adds a protocol_claims_disabled failure
+    benchmarkTarget: {
+      officialEvaluatorFirst: boolean; // true makes official evaluator proof mandatory for every required row
+    };
+  };
+  requiredLaneIds: string[];
+  requiredTaskIds: string[];
+  requiredRepeats: number; // repeat indices 1..requiredRepeats are required for every lane/task pair
+  expectedTotalRows: number; // must be positive and equal to rows.length
+  expectedProtocolHash: string;
+  expectedTaskManifestHash: string;
+  lanePoliciesById: Record<string, ContextBenchLaneEvidencePolicy>;
+  rows: ContextBenchRunManifestRow[];
+  artifactsByRunId: Record<string, ContextBenchRunEvidenceArtifacts>;
+  artifactHashesByPath: Record<string, string>;
+  expectedRunnerHash?: string; // both runner hashes must be present and equal
+  currentRunnerHash?: string;
+}
+
+/**
+ * Builds a failure entry that carries the identifying fields of the offending manifest row.
+ * @param row - row whose run_id, lane_id, task_id and repeat_index are copied onto the failure
+ * @param code - machine-readable failure code
+ * @param message - human-readable explanation
+ */
+function makeFailure(
+  row: Pick<ContextBenchRunManifestRow, 'run_id' | 'lane_id' | 'task_id' | 'repeat_index'>,
+  code: ContextBenchEvidenceGateFailureCode,
+  message: string
+): ContextBenchEvidenceGateFailure {
+  const { run_id: runId, lane_id: laneId, task_id: taskId, repeat_index: repeatIndex } = row;
+  return { code, runId, laneId, taskId, repeatIndex, message };
+}
+
+/**
+ * True when the setup/index evidence has, for both phases, a finite duration, an accepted status and
+ * a log path (with a positive duration whenever the phase completed) and matches the manifest row.
+ */
+function hasMeasuredSetupIndex(
+  row: ContextBenchRunManifestRow,
+  evidence: ContextBenchSetupIndexEvidence | undefined
+): boolean {
+  if (!evidence) return false;
+  const phaseIsMeasured = (status?: string, durationMs?: number, logPath?: string): boolean =>
+    typeof durationMs === 'number' &&
+    Number.isFinite(durationMs) &&
+    Boolean(status && logPath) &&
+    ['completed', 'not_required'].includes(status as string) &&
+    (status !== 'completed' || durationMs > 0);
+  if (!phaseIsMeasured(evidence.setupStatus, evidence.setupDurationMs, evidence.setupLogPath)) return false;
+  if (!phaseIsMeasured(evidence.indexStatus, evidence.indexDurationMs, evidence.indexLogPath)) return false;
+  const recorded = row.setupIndex;
+  const mirroredFields = [
+    'setupStatus', 'indexStatus', 'setupDurationMs', 'indexDurationMs', 'setupLogPath', 'indexLogPath'
+  ] as const;
+  return mirroredFields.every((field) => recorded[field] === evidence[field]);
+}
+
+function hasSha256Hash(value: string | undefined): boolean { // "sha256:" followed by exactly 64 lowercase hex characters
+  return new RegExp('^sha256:[a-f0-9]{64}$').test(value ?? '');
+}
+
+/**
+ * True only when the manifest row and the score artifact both record a successful, claim-bearing
+ * official-evaluator run, the evaluator output hash matches the recorded artifact hash, and the
+ * score, stdout and stderr artifacts each have a well-formed recorded hash.
+ */
+function hasOfficialEvaluatorProof(
+  row: ContextBenchRunManifestRow,
+  score: ContextBenchScoreEvidence | undefined,
+  artifactHashesByPath: Record<string, string>
+): boolean {
+  const { scoring } = row;
+  const rowFlags = [
+    scoring.officialEvaluatorFirst,
+    scoring.officialEvaluatorAttempted,
+    scoring.officialEvaluatorInvoked,
+    scoring.claimBearing
+  ];
+  if (!rowFlags.every((flag) => flag === true)) return false;
+  if (score?.officialEvaluatorInvoked !== true || score.claimBearing !== true) return false;
+  if (score.mode !== 'official_evaluator' || score.status !== 'completed' || score.exitCode !== 0) return false;
+  // The recorded command has to mention the official evaluator module.
+  if (typeof score.command !== 'string' || !score.command.includes('contextbench.evaluate')) return false;
+  const { outputPath, stdoutPath, stderrPath } = score;
+  const isPath = (candidate: unknown): candidate is string => typeof candidate === 'string' && candidate.length > 0;
+  if (!isPath(outputPath) || !isPath(stdoutPath) || !isPath(stderrPath)) return false;
+  if (!hasSha256Hash(score.outputHash) || artifactHashesByPath[outputPath] !== score.outputHash) return false;
+  return [row.score_path, stdoutPath, stderrPath].every((artifactPath) =>
+    hasSha256Hash(artifactHashesByPath[artifactPath])
+  );
+}
+
+function hasDiagnosticFallback(row: ContextBenchRunManifestRow, score: ContextBenchScoreEvidence | undefined): boolean {
+  return !(row.scoring.claimBearing !== false && !row.scoring.fallbackReason && score?.mode !== 'diagnostic_fallback'); // any one fallback marker suffices
+}
+
+/**
+ * True when proven isolation evidence for the row's lane comes from a source other than
+ * not_captured/env_override, reports no violations, stays within the policy's allowed tools and never
+ * observes a disallowed tool. Unless the policy allows several observed tools, exactly the expected
+ * context tool must be observed.
+ */
+function hasLaneIsolationProof(
+  row: ContextBenchRunManifestRow,
+  isolation: ContextBenchLaneIsolationEvidence | undefined,
+  policy: ContextBenchLaneEvidencePolicy | undefined
+): boolean {
+  if (!isolation?.proven || !policy) return false;
+  const { sourceKind, allowedTools: declared, observedTools: observed } = isolation;
+  if (!sourceKind || sourceKind === 'not_captured' || sourceKind === 'env_override') return false;
+  if (policy.laneId !== row.lane_id || isolation.laneId !== row.lane_id) return false;
+  if (isolation.expectedContextTool !== policy.expectedContextTool) return false;
+  if (declared.length === 0 || observed.length === 0) return false;
+  if ((isolation.violations?.length ?? 0) > 0) return false;
+  if (observed.some((tool) => policy.disallowedTools.includes(tool))) return false;
+  if (!declared.every((tool) => policy.allowedTools.includes(tool))) return false;
+  if (policy.allowMultipleObservedTools) return observed.every((tool) => policy.allowedTools.includes(tool));
+  return declared.includes(policy.expectedContextTool) && observed.length === 1 && observed[0] === policy.expectedContextTool;
+}
+
+/** True when the raw trace names the row's executor and model and both the trace and the row carry the expected runner hash. */
+function hasRunnerProvenance(
+  row: ContextBenchRunManifestRow,
+  rawTrace: ContextBenchRawTraceEvidence | undefined,
+  expectedRunnerHash: string | undefined
+): boolean {
+  if (!rawTrace || !expectedRunnerHash) return false;
+  const { executor, model, runnerHash } = rawTrace;
+  if (!executor || !model || !runnerHash) return false;
+  const { taskExecution } = row;
+  if (executor !== taskExecution.executor || model !== taskExecution.model) return false;
+  return runnerHash === expectedRunnerHash && row.hashes.runnerSourceHash === expectedRunnerHash;
+}
+
+function rowKey(row: Pick<ContextBenchRunManifestRow, 'lane_id' | 'task_id' | 'repeat_index'>): string {
+  return [row.lane_id, row.task_id, row.repeat_index].map(String).join('\u0000'); // NUL-separated lane/task/repeat key
+}
+
+export function evaluateContextBenchEvidenceGate( // Checks rows and artifacts against the frozen denominator and returns every failure found.
+  input: ContextBenchEvidenceGateInput
+): ContextBenchEvidenceGateResult {
+  const failures: ContextBenchEvidenceGateFailure[] = [];
+  const expectedKeys = new Set<string>(); // lane/task/repeat keys that make up the required denominator
+
+  if (input.evidenceMode !== 'artifact_verified') { // blocks claimPass only; excluded from shapePass at the end
+    failures.push({
+      code: 'artifact_verification_missing',
+      message: 'Synthetic shape evidence cannot produce claim-bearing benchmark pass.'
+    });
+  }
+
+  if (!input.protocol.claimAllowed) {
+    failures.push({
+      code: 'protocol_claims_disabled',
+      message: 'The protocol does not currently allow claim-bearing benchmark results.'
+    });
+  }
+
+  if (input.expectedTotalRows <= 0 || input.requiredLaneIds.length === 0 || input.requiredTaskIds.length === 0) {
+    failures.push({
+      code: 'denominator_contract_missing',
+      message: 'Claim validation requires a frozen denominator contract.'
+    });
+  }
+
+  if (input.rows.length !== input.expectedTotalRows) {
+    failures.push({
+      code: 'denominator_count_mismatch',
+      message: 'Run row count does not match the frozen expected denominator count.'
+    });
+  }
+
+  for (const laneId of input.requiredLaneIds) {
+    for (const taskId of input.requiredTaskIds) {
+      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) { // repeat indices are 1-based
+        expectedKeys.add(`${laneId}\u0000${taskId}\u0000${repeatIndex}`); // same layout as rowKey()
+      }
+    }
+  }
+
+  const rowCounts = new Map<string, number>(); // occurrences per lane/task/repeat key, for duplicate detection
+  for (const row of input.rows) {
+    const key = rowKey(row);
+    rowCounts.set(key, (rowCounts.get(key) ?? 0) + 1);
+    if (!expectedKeys.has(key)) {
+      failures.push(
+        makeFailure(
+          row,
+          'unexpected_run_row',
+          'Rows outside the required denominator must not be hidden from claim validation.'
+        )
+      );
+    }
+    if (row.protocol_hash !== input.expectedProtocolHash) {
+      failures.push(
+        makeFailure(row, 'protocol_hash_mismatch', 'Row protocol hash does not match the frozen protocol hash.')
+      );
+    }
+    if (row.task_manifest_hash !== input.expectedTaskManifestHash) {
+      failures.push(
+        makeFailure(
+          row,
+          'task_manifest_hash_mismatch',
+          'Row task manifest hash does not match the frozen task manifest hash.'
+        )
+      );
+    }
+  }
+
+  for (const row of input.rows) { // second pass: every row that shares a key with another row is reported
+    if ((rowCounts.get(rowKey(row)) ?? 0) > 1) {
+      failures.push(
+        makeFailure(
+          row,
+          'duplicate_required_run',
+          'Duplicate lane/task/repeat rows make the evidence denominator ambiguous.'
+        )
+      );
+    }
+  }
+
+  if (!input.expectedRunnerHash || !input.currentRunnerHash) {
+    failures.push({
+      code: 'runner_provenance_missing',
+      message: 'Expected and current runner hashes are required for claim-bearing validation.'
+    });
+  } else if (input.expectedRunnerHash !== input.currentRunnerHash) {
+    failures.push({
+      code: 'runner_provenance_mismatch',
+      message: 'Current runner hash does not match the expected generation runner hash.'
+    });
+  }
+
+  for (const laneId of input.requiredLaneIds) { // per-slot validation of every required lane/task/repeat
+    for (const taskId of input.requiredTaskIds) {
+      for (let repeatIndex = 1; repeatIndex <= input.requiredRepeats; repeatIndex += 1) {
+        const matchingRows = input.rows.filter(
+          (candidate) =>
+            candidate.lane_id === laneId &&
+            candidate.task_id === taskId &&
+            candidate.repeat_index === repeatIndex
+        );
+        const row = matchingRows[0]; // only the first match is inspected; duplicates were reported above
+
+        if (!row) {
+          failures.push({
+            code: 'missing_required_run',
+            laneId,
+            taskId,
+            repeatIndex,
+            message: 'A required lane/task/repeat row is missing from the evidence denominator.'
+          });
+          continue;
+        }
+
+        const artifacts = input.artifactsByRunId[row.run_id]; // may be undefined; each check below tolerates that
+        if (row.status !== 'completed') {
+          failures.push(makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.'));
+        }
+
+        if (
+          input.protocol.benchmarkTarget.officialEvaluatorFirst &&
+          !hasOfficialEvaluatorProof(row, artifacts?.score, input.artifactHashesByPath)
+        ) {
+          failures.push(
+            makeFailure(
+              row,
+              'official_evaluator_missing',
+              'Official evaluator proof is required before this row can support claims.'
+            )
+          );
+        }
+
+        if (hasDiagnosticFallback(row, artifacts?.score)) {
+          failures.push(
+            makeFailure(
+              row,
+              'diagnostic_fallback_only',
+              'Diagnostic fallback scoring cannot satisfy the claim-bearing evidence gate.'
+            )
+          );
+        }
+
+        if (!hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])) {
+          failures.push(
+            makeFailure(
+              row,
+              artifacts?.laneIsolation?.violations?.length ? 'lane_isolation_violation' : 'lane_isolation_missing', // 'violation' only when the evidence lists violations
+              'Lane isolation must be proven by explicit allowed/observed tool evidence.'
+            )
+          );
+        }
+
+        if (!hasMeasuredSetupIndex(row, artifacts?.setupIndex)) {
+          failures.push(
+            makeFailure(
+              row,
+              'setup_index_cost_missing',
+              'Setup/index statuses, durations, and log references are required.'
+            )
+          );
+        }
+
+        if (!hasRunnerProvenance(row, artifacts?.rawTrace, input.expectedRunnerHash)) { // also fails when the raw trace or expected hash is absent
+          failures.push(
+            makeFailure(
+              row,
+              'runner_provenance_mismatch',
+              'Raw trace executor/model metadata must match the manifest row.'
+            )
+          );
+        }
+      }
+    }
+  }
+
+  const blockingFailures = failures.filter((failure) => failure.code !== 'artifact_verification_missing'); // shape check ignores the evidence-mode failure
+  const shapePass = blockingFailures.length === 0;
+  const claimPass = failures.length === 0;
+  return {
+    shapePass,
+    claimPass,
+    diagnosticOnly: !claimPass,
+    failures
+  };
+}
diff --git a/src/eval/contextbench-scoring.ts b/src/eval/contextbench-scoring.ts
new file mode 100644
index 0000000..8df61fb
--- /dev/null
+++ b/src/eval/contextbench-scoring.ts
@@ -0,0 +1,107 @@
+import { mkdirSync, writeFileSync } from 'node:fs';
+import path from 'node:path';
+import { classifyStructuredAnswer, evaluateSchemaBoundDiagnostics } from './contextbench-answer.js';
+import type { ContextBenchStructuredAnswer } from './contextbench-types.js';
+
+export interface ProcessRunResult { // What a ContextBenchProcessRunner resolves with.
+  status: number | null; // scoreWithOfficialEvaluatorFirst treats 0 as success and anything else as failure
+  stdout: string;
+  stderr: string;
+}
+
+export type ContextBenchProcessRunner = ( // Injected process launcher; scoring calls it with 'python' and the evaluator arguments.
+  command: string,
+  args: string[],
+  cwd?: string
+) => Promise<ProcessRunResult>;
+
+export interface OfficialEvaluatorParams { // Inputs for scoreWithOfficialEvaluatorFirst.
+  goldPath: string; // passed as --gold
+  predictionPath: string; // passed as --pred
+  outputPath: string; // passed as --out; the score JSON is also written to this path
+  cachePath?: string; // passed as --cache when set
+  cwd?: string; // forwarded to the runner
+  runner: ContextBenchProcessRunner;
+}
+
+export interface ContextBenchScoreResult { // Score record returned and persisted by scoreWithOfficialEvaluatorFirst.
+  status: 'completed' | 'judge_failed';
+  mode: 'official_evaluator' | 'diagnostic_fallback';
+  claimBearing: boolean; // true only for a successful official evaluator run
+  command: string; // the evaluator command line as a single space-joined string
+  stdout: string;
+  stderr: string;
+  exitStatus: number | null;
+  fallbackReason?: string; // set on the diagnostic fallback path
+}
+
+export interface FactRecallDiagnosticResult { // Combined output of the schema-bound diagnostics and the answer classification.
+  missingRequiredFacts: string[];
+  missingEvidenceFiles: string[];
+  unsupportedClaim: boolean;
+  falseReady: boolean;
+  reasons: string[];
+}
+
+function writeJson(filePath: string, value: unknown): void { // Writes value as 2-space-indented JSON plus a trailing newline.
+  mkdirSync(path.dirname(filePath), { recursive: true }); // create missing parent directories first
+  writeFileSync(filePath, `${JSON.stringify(value, null, 2)}\n`, 'utf8');
+}
+
+/**
+ * Runs `python -m contextbench.evaluate` through the injected runner and records the outcome.
+ *
+ * Exit status 0 yields a claim-bearing `official_evaluator` score. Any other status, or a runner that
+ * rejects (for example because the process could not be started), yields a non-claim-bearing
+ * `diagnostic_fallback` score with status `judge_failed`. The score is written to
+ * `params.outputPath` as JSON and returned.
+ *
+ * NOTE(review): `params.outputPath` is also passed to the evaluator as `--out`, so writing the score
+ * there replaces whatever the evaluator wrote to that path — confirm this is intended.
+ */
+export async function scoreWithOfficialEvaluatorFirst(
+  params: OfficialEvaluatorParams
+): Promise<ContextBenchScoreResult> {
+  const args = ['-m', 'contextbench.evaluate', '--gold', params.goldPath, '--pred', params.predictionPath];
+  if (params.cachePath) args.push('--cache', params.cachePath);
+  args.push('--out', params.outputPath);
+  // Recorded for traceability only; the arguments are joined with spaces and not shell-quoted.
+  const command = `python ${args.join(' ')}`;
+
+  let result: ProcessRunResult;
+  try {
+    result = await params.runner('python', args, params.cwd);
+  } catch (error) {
+    // A rejecting runner is reported as a failed evaluator run instead of aborting the caller.
+    result = { status: null, stdout: '', stderr: error instanceof Error ? error.message : String(error) };
+  }
+
+  const base = { command, stdout: result.stdout, stderr: result.stderr, exitStatus: result.status };
+  const score: ContextBenchScoreResult =
+    result.status === 0
+      ? { status: 'completed', mode: 'official_evaluator', claimBearing: true, ...base }
+      : {
+          status: 'judge_failed',
+          mode: 'diagnostic_fallback',
+          claimBearing: false,
+          ...base,
+          fallbackReason: 'official_evaluator_failed'
+        };
+  writeJson(params.outputPath, score);
+  return score;
+}
+
+/**
+ * Runs the schema-bound diagnostics for an answer, classifies the answer with them, and returns
+ * both results flattened into one record.
+ */
+export function runFactRecallDiagnostics(
+  answer: ContextBenchStructuredAnswer,
+  expected: { requiredFacts?: string[]; requiredEvidenceFiles?: string[] }
+): FactRecallDiagnosticResult {
+  const diagnostics = evaluateSchemaBoundDiagnostics(answer, expected);
+  const { unsupportedClaim, falseReady, reasons } = classifyStructuredAnswer(answer, diagnostics);
+  const missingRequiredFacts = diagnostics.missingRequiredFacts ?? [];
+  const missingEvidenceFiles = diagnostics.missingEvidenceFiles ?? [];
+  return { missingRequiredFacts, missingEvidenceFiles, unsupportedClaim, falseReady, reasons };
+}
diff --git a/src/eval/contextbench-trajectory.ts b/src/eval/contextbench-trajectory.ts
new file mode 100644
index 0000000..8065bc8
--- /dev/null
+++ b/src/eval/contextbench-trajectory.ts
@@ -0,0 +1,77 @@
+import type {
+  ContextBenchPredSpan,
+  ContextBenchStructuredAnswer,
+  ContextBenchTaskIdentity,
+  ContextBenchTrajectoryRecord
+} from './contextbench-types.js';
+
+export interface NormalizeTrajectoryParams { // Inputs for normalizeTrajectory.
+  task: Pick<ContextBenchTaskIdentity, 'instance_id' | 'repo_url' | 'base_commit'>;
+  answer: ContextBenchStructuredAnswer; // its evidence and filesReferenced entries become predicted files/spans
+  repoRoot?: string; // when set, this prefix is stripped from paths
+  rawTraceSteps?: Array<{ files?: string[] }>; // extra files merged into the single predicted step
+}
+
+function normalizeSlashes(value: string): string { // every backslash becomes a forward slash
+  return value.split('\\').join('/');
+}
+
+/** Converts backslashes, drops one leading "./", strips a case-insensitive repoRoot prefix, then removes leading slashes. */
+export function normalizeContextBenchPath(filePath: string, repoRoot?: string): string {
+  const candidate = normalizeSlashes(filePath).replace(/^\.\//, '');
+  if (!repoRoot) return candidate.replace(/^\/+/, '');
+  const root = normalizeSlashes(repoRoot).replace(/\/$/, '');
+  const prefix = `${root}/`;
+  const insideRoot = candidate.toLowerCase().startsWith(prefix.toLowerCase());
+  const relative = insideRoot ? candidate.slice(prefix.length) : candidate;
+  return relative.replace(/^\/+/, '');
+}
+
+function spanFromEvidence(
+  { start, end }: ContextBenchStructuredAnswer['evidence'][number]['lineRange']
+): ContextBenchPredSpan {
+  return { start, end, full_file: false }; // a cited line range is never a full-file span
+}
+
+export function fullFileSpan(): ContextBenchPredSpan { // Span that marks a whole file: starts at line 1, has no end line.
+  return { start: 1, end: null, full_file: true };
+}
+
+/**
+ * Converts a structured answer (plus optional raw trace steps) into a ContextBench trajectory record.
+ * Evidence adds line-range spans; files that are only referenced get one full-file span. Entries
+ * whose normalized path is empty are skipped. File lists are de-duplicated and sorted.
+ */
+export function normalizeTrajectory(
+  params: NormalizeTrajectoryParams
+): ContextBenchTrajectoryRecord {
+  const toRepoPath = (file: string): string => normalizeContextBenchPath(file, params.repoRoot);
+  const spans: Record<string, ContextBenchPredSpan[]> = {};
+  const files = new Set<string>();
+
+  for (const evidence of params.answer.evidence) {
+    const file = toRepoPath(evidence.file);
+    if (file.length === 0) continue; // an empty path cannot name a file; same rule as the loop below
+    files.add(file);
+    spans[file] = [...(spans[file] ?? []), spanFromEvidence(evidence.lineRange)];
+  }
+
+  for (const fileRef of params.answer.filesReferenced) {
+    const file = toRepoPath(fileRef);
+    if (file.length === 0) continue;
+    files.add(file);
+    if (!spans[file]) spans[file] = [fullFileSpan()];
+  }
+
+  const predFiles = [...files].sort();
+  const traceFiles = (params.rawTraceSteps ?? []).flatMap((step) => step.files ?? []).map(toRepoPath);
+  const stepFiles = [...new Set([...traceFiles.filter((file) => file.length > 0), ...predFiles])].sort();
+  const traj_data = { pred_steps: [{ files: stepFiles, spans }], pred_files: predFiles, pred_spans: spans };
+  return {
+    instance_id: params.task.instance_id,
+    repo_url: params.task.repo_url,
+    commit: params.task.base_commit,
+    traj_data,
+    model_patch: ''
+  };
+}
diff --git a/src/eval/contextbench-types.ts b/src/eval/contextbench-types.ts
new file mode 100644
index 0000000..515d234
--- /dev/null
+++ b/src/eval/contextbench-types.ts
@@ -0,0 +1,434 @@
+export type JsonPrimitive = string | number | boolean | null;
+export type JsonValue = JsonPrimitive | JsonValue[] | { [key: string]: JsonValue };
+
+export type JsonSchemaPrimitiveType =
+  | 'array'
+  | 'boolean'
+  | 'integer'
+  | 'null'
+  | 'number'
+  | 'object'
+  | 'string';
+
+export interface JsonSchemaDefinition {
+  type?: JsonSchemaPrimitiveType | JsonSchemaPrimitiveType[];
+  properties?: Record<string, JsonSchemaDefinition>;
+  items?: JsonSchemaDefinition;
+  required?: string[];
+  additionalProperties?: boolean | JsonSchemaDefinition;
+  enum?: JsonValue[];
+  minLength?: number;
+  minimum?: number;
+}
+
+export type ContextBenchTerminalStatus =
+  | 'completed'
+  | 'setup_failed'
+  | 'task_setup_failed'
+  | 'index_failed'
+  | 'timeout'
+  | 'invalid_schema'
+  | 'no_answer'
+  | 'wrong_answer'
+  | 'wrong_evidence'
+  | 'unsupported_claim'
+  | 'false_ready'
+  | 'tool_error'
+  | 'judge_failed';
+
+export const CONTEXTBENCH_TERMINAL_STATUSES: readonly ContextBenchTerminalStatus[] = [
+  'completed',
+  'setup_failed',
+  'task_setup_failed',
+  'index_failed',
+  'timeout',
+  'invalid_schema',
+  'no_answer',
+  'wrong_answer',
+  'wrong_evidence',
+  'unsupported_claim',
+  'false_ready',
+  'tool_error',
+  'judge_failed'
+];
+
+export interface ContextBenchTaskIdentity {
+  instance_id: string;
+  original_inst_id: string;
+  source: string;
+  language: string;
+  repo: string;
+  repo_url: string;
+  base_commit: string;
+  problem_statement_ref: string;
+  problem_statement_hash: string;
+  gold_context_ref: string;
+  gold_context_hash: string;
+  patch_hash: string;
+  test_patch_hash: string;
+  f2p_hash: string;
+  p2p_hash: string;
+  gold_context_span_count: number;
+  hash_canonicalization_version: string;
+  hardness_signal_status: string;
+  hardness_signal_source: string;
+  hardness_proxy_used: boolean;
+  inclusion_rationale: string;
+  deterministic_rank: string;
+}
+
+export interface ContextBenchTaskManifest {
+  name: string;
+  protocolVersion: string;
+  dataset: string;
+  datasetConfig: string;
+  split: string;
+  claimBearing: boolean;
+  selectedInPhase: number;
+  selection_algorithm: string;
+  selection_seed_or_deterministic_order: string;
+  selection_timestamp: string;
+  task_pool_hash: string;
+  exclusion_log_path: string;
+  hash_canonicalization_version: string;
+  evaluator_success_status: string;
+  hardness_signal_status: string;
+  hardness_signal_source: string;
+  hardness_proxy_used: boolean;
+  forbidden_selection_sources: string[];
+  no_lane_outputs_observed_attestation: string;
+  tasks: ContextBenchTaskIdentity[];
+  manifest_hash: string;
+}
+
+export interface ContextBenchProtocol {
+  protocolVersion: string;
+  claimAllowed: boolean;
+  benchmarkTarget: {
+    officialEvaluatorFirst: boolean;
+    officialEvaluatorCommand: string;
+    fallbackScorerPolicy: {
+      claimBearing: boolean;
+    };
+  };
+  structuredAnswerSchema: {
+    requiredFields: string[];
+    confidenceValues: string[];
+    evidenceFields: string[];
+    invalidSchemaStatus: 'invalid_schema';
+  };
+  budgets: {
+    setupAndIndexingReportedSeparately: boolean;
+    defaults: {
+      maxContextTokens: number;
+      maxAnswerTokens: number;
+      timeoutSeconds: number;
+    };
+  };
+  failureTaxonomy: ContextBenchTerminalStatus[];
+  runManifestSchema: {
+    appendOnly: boolean;
+    requiredFields: string[];
+    terminalStatuses: ContextBenchTerminalStatus[];
+    failedRunsIncludedInAggregates: boolean;
+  };
+}
+
+export interface ContextBenchLane {
+  laneId: string;
+  displayName: string;
+  contextTool: string;
+  allowedTools: string[];
+  disallowedTools: string[];
+  nativeToolsAllowed: boolean;
+  setupCostReportedSeparately: boolean;
+  indexCostReportedSeparately: boolean;
+  cacheIsolationRequired: boolean;
+}
+
+export interface ContextBenchLaneToolCard {
+  laneId: string;
+  displayName: string;
+  phase38Status: string;
+  phase39Status?: ContextBenchLaneReadinessStatus;
+  executableInPhase38: boolean;
+  contextTools: string[];
+  allowedTools: string[];
+  disallowedTools: string[];
+  setupCommand: string;
+  indexCommand: string;
+  queryCommand: string;
+  versionCommand: string;
+  cachePath: string;
+  artifactPaths: {
+    setup: string;
+    rawTrace: string;
+    structuredAnswer: string;
+    trajectory: string;
+    score: string;
+  };
+  setupCostReportedSeparately: boolean;
+  indexCostReportedSeparately: boolean;
+  claimBearing: boolean;
+}
+
+export type ContextBenchLaneReadinessStatus =
+  | 'ready_for_phase40'
+  | 'setup_failed'
+  | 'index_failed'
+  | 'tool_error'
+  | 'invasive_setup_blocked'
+  | 'pending';
+
+export const CONTEXTBENCH_LANE_READINESS_STATUSES: readonly ContextBenchLaneReadinessStatus[] = [
+  'ready_for_phase40',
+  'setup_failed',
+  'index_failed',
+  'tool_error',
+  'invasive_setup_blocked',
+  'pending'
+];
+
+export type ContextBenchLaneCommandKind = 'setup' | 'index' | 'query' | 'version';
+
+export interface ContextBenchLaneCommandEvidence {
+  kind: ContextBenchLaneCommandKind;
+  command: string;
+  cwd: string;
+  safeToRunAutomatically: boolean;
+  exitCode: number | null;
+  status: 'not_required' | 'not_run_documented' | 'succeeded' | 'failed' | 'blocked';
+  durationMs: number | null;
+  stdoutLogPath: string | null;
+  stderrLogPath: string | null;
+  outputHash: string | null;
+}
+
+export interface ContextBenchLaneSetupEvidenceRecord {
+  laneId: string;
+  readinessStatus: ContextBenchLaneReadinessStatus;
+  docsUrl: string;
+  sourceUrl: string;
+  workingDirectory: string;
+  platform: {
+    os: string;
+    shell: string;
+    runtime: string;
+  };
+  redactedEnvVars: string[];
+  commands: ContextBenchLaneCommandEvidence[];
+  setupDurationMs: number | null;
+  indexDurationMs: number | null;
+  setupStatus: 'not_required' | 'ready' | 'failed' | 'blocked' | 'pending';
+  indexStatus: 'not_required' | 'ready' | 'failed' | 'blocked' | 'pending';
+  logReference: string | null;
+  evidenceHash: string;
+  nextHumanAction: string;
+  claimBearing: false;
+}
+
+export interface ContextBenchLaneSetupEvidenceFixture {
+  name: string;
+  protocolVersion: string;
+  phase: 39;
+  claimBearing: false;
+  generatedOutputsPolicy: string;
+  records: ContextBenchLaneSetupEvidenceRecord[];
+}
+
+export type ContextBenchBaselineSlotStatus = 'reserved' | 'attempted' | 'terminal_missing_evidence';
+
+export interface ContextBenchArtifactIndexEntry {
+  path: string;
+  hash: string;
+  bytes: number;
+}
+
+export interface ContextBenchCommandTranscriptEntry {
+  command: string;
+  cwd: string;
+  exitCode: number | null;
+  stdoutPath: string | null;
+  stderrPath: string | null;
+  outputHash: string | null;
+}
+
+export interface ContextBenchUntrackedSnapshotEntry {
+  path: string;
+  bytes: number | null;
+  mtimeMs: number | null;
+  hash: string | null;
+  disposition: 'hashed' | 'excluded';
+  exclusionReason: string | null;
+}
+
+export interface ContextBenchDirtyWorktreeSnapshot {
+  branch: string;
+  head: string;
+  divergence: {
+    status: 'unavailable' | 'available';
+    reason: string;
+  };
+  gitStatusPath: string;
+  trackedDiffPath: string;
+  stagedDiffPath: string;
+  diffStatPath: string;
+  untracked: ContextBenchUntrackedSnapshotEntry[];
+  lockfiles: ContextBenchArtifactIndexEntry[];
+  redactedEnvVarNames: string[];
+  versions: Record<string, string>;
+  fixtureHashes: Record<string, string>;
+  commandTranscript: ContextBenchCommandTranscriptEntry[];
+  snapshotHash: string;
+}
+
+export interface ContextBenchBaselineSlotReservation {
+  laneId: string;
+  taskId: string;
+  repeatIndex: number;
+  status: ContextBenchBaselineSlotStatus;
+  terminalStatus: ContextBenchTerminalStatus | null;
+  reason: string | null;
+}
+
+export interface ContextBenchBaselineSession {
+  sessionId: string;
+  phase: 40;
+  createdAt: string;
+  updatedAt: string;
+  sessionRoot: string;
+  claimBearing: false;
+  sealed: boolean;
+  snapshot: ContextBenchDirtyWorktreeSnapshot;
+  reservationsPath: string;
+  runManifestPath: string;
+  artifactIndex: ContextBenchArtifactIndexEntry[];
+  sessionHash: string;
+}
+
+export interface ContextBenchCodebaseContextBaselineArm {
+  baselineArmId: string;
+  laneId: 'codebase-context';
+  sourceIdentity: string;
+  allowedToolSurfaces: string[];
+  versionOrSourceRef: string;
+  setupCommand: string;
+  claimBearing: false;
+  failurePolicy: 'record_terminal_diagnostic_failure';
+}
+
+export interface ContextBenchCodebaseContextBaselineArmsFixture {
+  name: string;
+  protocolVersion: string;
+  phase: 40;
+  claimBearing: false;
+  denominatorPolicy: string;
+  arms: ContextBenchCodebaseContextBaselineArm[];
+}
+
+export interface ContextBenchEvidenceReference {
+  file: string;
+  lineRange: {
+    start: number;
+    end: number;
+  };
+  reason: string;
+}
+
+export type ContextBenchConfidence = 'low' | 'medium' | 'high';
+
+export interface ContextBenchStructuredAnswer {
+  answer: JsonValue;
+  confidence: ContextBenchConfidence;
+  evidence: ContextBenchEvidenceReference[];
+  filesReferenced: string[];
+  symbolsReferenced: string[];
+  unsupportedClaims: string[];
+  readyToEdit: boolean;
+}
+
+export interface ContextBenchSetupIndexMetadata {
+  setupCommand: string;
+  indexCommand: string;
+  setupDurationMs: number;
+  indexDurationMs: number;
+  setupLogPath: string;
+  indexLogPath: string;
+  setupStatus: 'not_required' | 'completed' | 'setup_failed';
+  indexStatus: 'not_required' | 'completed' | 'index_failed';
+  taskMaterializationStatus?: 'not_required' | 'completed' | 'failed';
+  taskMaterializationErrors?: string[];
+}
+
+export type ContextBenchExecutor = 'fake' | 'claude' | 'codex' | 'gemini' | 'opencode';
+
+export interface ContextBenchTaskExecutionMetadata {
+  model: string;
+  timeoutSeconds: number;
+  maxContextTokens: number;
+  maxAnswerTokens: number;
+  startedAt: string;
+  completedAt: string;
+  taskWallTimeMs: number;
+  executor: ContextBenchExecutor;
+}
+
+export interface ContextBenchScoringMetadata {
+  officialEvaluatorFirst: boolean;
+  officialEvaluatorAttempted?: boolean;
+  officialEvaluatorInvoked?: boolean;
+  command: string;
+  claimBearing: boolean;
+  fallbackReason?: string;
+  stdoutPath?: string;
+  stderrPath?: string;
+}
+
+export interface ContextBenchRunManifestRow {
+  run_id: string;
+  protocol_version: string;
+  protocol_hash: string;
+  task_manifest_hash: string;
+  lane_id: string;
+  task_id: string;
+  repeat_index: number;
+  status: ContextBenchTerminalStatus;
+  started_at: string;
+  completed_at: string;
+  raw_trace_path: string;
+  structured_answer_path: string;
+  trajectory_path: string;
+  score_path: string;
+  setup_index_path: string;
+  prompt_path: string;
+  lane_tool_card_path: string;
+  setupIndex: ContextBenchSetupIndexMetadata;
+  taskExecution: ContextBenchTaskExecutionMetadata;
+  scoring: ContextBenchScoringMetadata;
+  hashes: Record<string, string>;
+}
+
+export interface ContextBenchPredSpan {
+  start: number;
+  end: number | null; // null in the whole-file span built by fullFileSpan() in contextbench-trajectory.ts
+  full_file: boolean; // true only for that whole-file span; evidence-derived spans set it to false
+}
+
+export interface ContextBenchTrajectoryRecord {
+  instance_id: string;
+  repo_url: string;
+  commit: string; // normalizeTrajectory fills this from the task's base_commit
+  traj_data: {
+    pred_steps: Array<{ // normalizeTrajectory emits exactly one step
+      files: string[];
+      spans: Record<string, ContextBenchPredSpan[]>;
+    }>;
+    pred_files: string[];
+    pred_spans: Record<string, ContextBenchPredSpan[]>; // keyed by normalized file path
+  };
+  model_patch: string; // normalizeTrajectory always writes ''
+}
+
+export function isContextBenchTerminalStatus(value: string): value is ContextBenchTerminalStatus {
+  return CONTEXTBENCH_TERMINAL_STATUSES.some((status) => status === value); // exact match against the status list
+}
diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts
new file mode 100644
index 0000000..41436fd
--- /dev/null
+++ b/tests/contextbench-baseline-runner.test.ts
@@ -0,0 +1,1095 @@
+import { execFileSync, spawnSync } from 'node:child_process';
+import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { describe, expect, it, vi } from 'vitest';
+import manifestFixture from './fixtures/contextbench-task-manifest.json';
+
+type ManifestRow = {
+  run_id: string;
+  lane_id: string;
+  task_id: string;
+  repeat_index: number;
+  status: string;
+  raw_trace_path: string;
+  score_path: string; // read by the tests that parse the score artifact
+  setupIndex: {
+    setupStatus: string;
+    indexStatus: string;
+    setupDurationMs?: number;
+    indexDurationMs?: number;
+    setupLogPath?: string;
+    indexLogPath?: string;
+    taskWallTimeMs?: number;
+  };
+  taskExecution: { executor: string; taskWallTimeMs: number };
+  hashes: { runnerSourceHash?: string };
+  scoring: {
+    claimBearing: boolean;
+    command?: string; // asserted to contain '--out' for official-evaluator attempts
+    fallbackReason?: string;
+    officialEvaluatorFirst?: boolean;
+    officialEvaluatorAttempted?: boolean;
+    officialEvaluatorInvoked?: boolean;
+    stdoutPath?: string;
+    stderrPath?: string;
+  };
+};
+type TaskManifest = { tasks: Array<{ instance_id: string }> };
+const manifest = manifestFixture as TaskManifest;
+vi.setConfig({ testTimeout: 30000 });
+
+/**
+ * Returns <new temp dir>/benchmark-runs/contextbench/<phase>/runner-smoke. Only the temp dir is created here.
+ * Tests in this file clean up by walking four path.dirname levels up from the result, one per appended segment.
+ */
+function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
+  const scratchDir = mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`));
+  const nestedSegments = ['benchmark-runs', 'contextbench', phase, 'runner-smoke'];
+  return path.join(scratchDir, ...nestedSegments);
+}
+
+// Reads run-manifest.jsonl from sessionRoot and parses every line as one ManifestRow.
+function readRows(sessionRoot: string): ManifestRow[] {
+  const manifestPath = path.join(sessionRoot, 'run-manifest.jsonl');
+  const lines = readFileSync(manifestPath, 'utf8').trim().split('\n');
+  return lines.map((line) => JSON.parse(line) as ManifestRow);
+}
+
+// Creates <root>/repo as a git repository containing one committed README.md and returns the repo path.
+function createCleanGitRepo(root: string): string {
+  const repoPath = path.join(root, 'repo');
+  const git = (args: string[]) => execFileSync('git', args, { cwd: repoPath, encoding: 'utf8' });
+  const identity = ['-c', 'user.name=ContextBench Test', '-c', 'user.email=contextbench@example.invalid'];
+  mkdirSync(repoPath, { recursive: true });
+  writeFileSync(path.join(repoPath, 'README.md'), '# ContextBench fixture\n', 'utf8');
+  git(['init']);
+  git(['add', 'README.md']);
+  // Signing is disabled per command so a global commit.gpgsign=true cannot make the fixture commit fail.
+  git([...identity, '-c', 'commit.gpgsign=false', 'commit', '-m', 'fixture']);
+  return repoPath;
+}
+
+/**
+ * Writes TASK-PAYLOADS.json into root as pretty-printed JSON with a single entry under tasksById.
+ * @param root directory that receives the payload file
+ * @param taskId key of the single tasksById entry
+ * @param repoCheckoutPath value stored as that entry's repo_checkout_path
+ * @returns the path of the written payload file
+ */
+function writePayloadFile(root: string, taskId: string, repoCheckoutPath: string): string {
+  const payloadPath = path.join(root, 'TASK-PAYLOADS.json');
+  const payload = {
+    tasksById: {
+      [taskId]: {
+        problem_statement: 'Use the fixture repository to answer with cited evidence.',
+        repo_checkout_path: repoCheckoutPath
+      }
+    }
+  };
+  writeFileSync(payloadPath, JSON.stringify(payload, null, 2), 'utf8');
+  return payloadPath;
+}
+
+// Writes stub-claude.cjs under root and returns its path. Run with node, the script prints one fixed JSON
+// result whose structured_output cites README.md line 1; tests pass it via CONTEXTBENCH_CLAUDE_COMMAND.
+function writeStubClaude(root: string): string {
+  const stubPath = path.join(root, 'stub-claude.cjs');
+  // Single-line CommonJS source, written to disk exactly as spelled here.
+  const stubSource = `const answer = { type: 'result', structured_output: { answer: 'fixture answer', confidence: 'medium', evidence: [{ file: 'README.md', lineRange: { start: 1, end: 1 }, reason: 'fixture evidence' }], filesReferenced: ['README.md'], symbolsReferenced: [], unsupportedClaims: [], readyToEdit: false } }; process.stdout.write(JSON.stringify(answer));`;
+  writeFileSync(stubPath, stubSource, 'utf8');
+  return stubPath;
+}
+
+// Writes stub-evaluator-<exitCode>.cjs under root and returns its path. The script exits 2 unless --pred names an
+// existing file; otherwise it writes `output` plus a newline to the --out path (exit code 0 only) and exits exitCode.
+function writeStubEvaluator(root: string, exitCode: 0 | 1, output = JSON.stringify({ score: 1 })): string {
+  const stubPath = path.join(root, `stub-evaluator-${exitCode}.cjs`);
+  // Embedded as a JSON string literal so the generated script writes the text exactly as given.
+  const serializedOutput = JSON.stringify(output);
+  const stubSource = `const fs = require('node:fs'); const predIndex = process.argv.indexOf('--pred'); if (predIndex < 0 || !fs.existsSync(process.argv[predIndex + 1])) { process.stderr.write('missing prediction artifact'); process.exit(2); } const outIndex = process.argv.indexOf('--out'); if (outIndex >= 0 && process.argv[outIndex + 1] && ${exitCode} === 0) fs.writeFileSync(process.argv[outIndex + 1], ${serializedOutput} + '\\n'); process.stdout.write('official evaluator stub'); process.exit(${exitCode});`;
+  writeFileSync(stubPath, stubSource, 'utf8');
+  return stubPath;
+}
+
+describe('ContextBench Phase 40 baseline runner', () => {
+  it('reserves every required slot and writes terminal missing-evidence rows for blocked lanes', () => {
+    const sessionRoot = tempSessionRoot();
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8'
+      });
+      const reservations = JSON.parse(
+        readFileSync(path.join(sessionRoot, 'slot-reservations.json'), 'utf8')
+      ) as { reservations: Array<{ laneId: string; status: string; terminalStatus: string | null }> };
+      expect(reservations.reservations).toHaveLength(20 * 6 * 3);
+      const blocked = reservations.reservations.filter((slot) => slot.status === 'terminal_missing_evidence');
+      expect(blocked).toHaveLength(20 * 2 * 3);
+      expect([...new Set(blocked.map((slot) => slot.laneId))].sort()).toEqual([
+        'codebase-memory-mcp',
+        'grepai'
+      ]);
+      expect(blocked.every((slot) => slot.terminalStatus === 'setup_failed')).toBe(true);
+
+      const rows = readRows(sessionRoot);
+      expect(rows.filter((row) => row.status === 'setup_failed')).toHaveLength(blocked.length);
+      expect(rows.every((row) => row.scoring.claimBearing === false)).toBe(true);
+      expect(rows.every((row) => row.scoring.officialEvaluatorFirst === false)).toBe(true);
+      expect(rows.every((row) => row.scoring.officialEvaluatorAttempted === false)).toBe(true);
+      expect(rows.every((row) => row.scoring.officialEvaluatorInvoked === false)).toBe(true);
+      expect(rows.every((row) => !('taskWallTimeMs' in row.setupIndex))).toBe(true);
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+        recursive: true,
+        force: true
+      });
+    }
+  });
+
+  it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8'
+      });
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'fake',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1'
+        ],
+        { encoding: 'utf8' }
+      );
+      const validateOutput = execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      expect(validateOutput).toContain('baseline session validation passed');
+      const rows = readRows(sessionRoot);
+      const attempt = rows.find(
+        (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1
+      );
+      expect(attempt).toBeTruthy();
+      expect(attempt).toMatchObject({ status: 'completed', lane_id: 'raw-native' });
+      expect(attempt?.taskExecution.executor).toBe('fake');
+      expect(attempt?.setupIndex.setupStatus).toBe('not_required');
+      expect(attempt?.scoring).toMatchObject({
+        claimBearing: false,
+        officialEvaluatorFirst: false,
+        officialEvaluatorAttempted: false,
+        officialEvaluatorInvoked: false
+      });
+      const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? '', 'utf8')) as {
+        runnerHash?: string;
+        laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] };
+        scriptedAgentDecisions: boolean;
+        antiScriptingBoundary: string[];
+      };
+      expect(rawTrace.runnerHash).toMatch(/^sha256:[a-f0-9]{64}$/);
+      expect(attempt?.hashes.runnerSourceHash).toBe(rawTrace.runnerHash);
+      expect(rawTrace.laneIsolation).toMatchObject({
+        proven: false,
+        proofSource: 'not_captured',
+        observedTools: []
+      });
+      expect(rawTrace.scriptedAgentDecisions).toBe(false);
+      expect(rawTrace.antiScriptingBoundary).toEqual(expect.arrayContaining(['file_selection']));
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+        recursive: true,
+        force: true
+      });
+    }
+  });
+
+  it('records official evaluator invocation metadata for overridden live executor attempts', () => {
+    const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-'));
+    const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createCleanGitRepo(tempRoot);
+    const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
+    const stubClaude = writeStubClaude(tempRoot);
+    const stubEvaluator = writeStubEvaluator(tempRoot, 0);
+    const env = {
+      ...process.env,
+      CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
+      CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
+      CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
+        'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+      })
+    };
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8',
+        env
+      });
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--task-payloads',
+          payloadPath,
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1'
+        ],
+        { encoding: 'utf8', env }
+      );
+      const rows = readRows(sessionRoot);
+      const attempt = rows.find(
+        (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('completed');
+      expect(attempt?.taskExecution.executor).toBe('claude');
+      expect(attempt?.scoring).toMatchObject({
+        claimBearing: false,
+        officialEvaluatorFirst: true,
+        officialEvaluatorAttempted: true,
+        officialEvaluatorInvoked: true
+      });
+      expect(attempt?.scoring.command).toContain('--out');
+      const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as {
+        mode: string;
+        exitCode?: number;
+        outputHash?: string;
+        stdoutPath?: string;
+        stderrPath?: string;
+      };
+      expect(score.mode).toBe('official_evaluator');
+      expect(score.exitCode).toBe(0);
+      expect(score.outputHash).toMatch(/^sha256:[a-f0-9]{64}$/);
+      expect(score.stdoutPath).toBeTruthy();
+      expect(score.stderrPath).toBeTruthy();
+      const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? '', 'utf8')) as {
+        laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] };
+      };
+      expect(rawTrace.laneIsolation).toMatchObject({
+        proven: true,
+        proofSource: 'stubbed_test_proxy',
+        observedTools: ['native-read']
+      });
+    } finally {
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('rejects malformed official evaluator output as judge_failed diagnostic evidence', () => {
+    const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-malformed-'));
+    const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createCleanGitRepo(tempRoot);
+    const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
+    const stubClaude = writeStubClaude(tempRoot);
+    const stubEvaluator = writeStubEvaluator(tempRoot, 0, 'not json');
+    const env = {
+      ...process.env,
+      CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
+      CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
+      CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
+        'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+      })
+    };
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8',
+        env
+      });
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--task-payloads',
+          payloadPath,
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1'
+        ],
+        { encoding: 'utf8', env }
+      );
+      const attempt = readRows(sessionRoot).find(
+        (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('judge_failed');
+      expect(attempt?.scoring).toMatchObject({
+        claimBearing: false,
+        fallbackReason: 'official_evaluator_malformed_jsonl',
+        officialEvaluatorAttempted: true,
+        officialEvaluatorInvoked: true
+      });
+      const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as {
+        mode: string;
+        fallbackReason: string;
+        outputHash?: string;
+      };
+      expect(score.mode).toBe('diagnostic_fallback');
+      expect(score.fallbackReason).toBe('official_evaluator_malformed_jsonl');
+      expect(score.outputHash).toMatch(/^sha256:[a-f0-9]{64}$/);
+    } finally {
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('rejects non-object or wrong-task official evaluator JSONL as diagnostic evidence', () => {
+    const cases = [
+      { output: '1', reason: 'official_evaluator_non_object_jsonl' },
+      {
+        output: JSON.stringify({ instance_id: 'wrong-task-id', score: 1 }),
+        reason: 'official_evaluator_task_mismatch'
+      }
+    ];
+
+    for (const testCase of cases) {
+      const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-envelope-'));
+      const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+      const taskId = manifest.tasks[0].instance_id;
+      const repoPath = createCleanGitRepo(tempRoot);
+      const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
+      const stubClaude = writeStubClaude(tempRoot);
+      const stubEvaluator = writeStubEvaluator(tempRoot, 0, testCase.output);
+      const env = {
+        ...process.env,
+        CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
+        CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
+        CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
+          'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+        })
+      };
+      try {
+        execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+          encoding: 'utf8',
+          env
+        });
+        execFileSync(
+          'node',
+          [
+            'scripts/contextbench-runner.mjs',
+            '--baseline-run',
+            '--session',
+            sessionRoot,
+            '--executor',
+            'claude',
+            '--task-payloads',
+            payloadPath,
+            '--lane',
+            'raw-native',
+            '--task-id',
+            taskId,
+            '--repeat',
+            '1'
+          ],
+          { encoding: 'utf8', env }
+        );
+        const attempt = readRows(sessionRoot).find(
+          (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1
+        );
+        expect(attempt?.status).toBe('judge_failed');
+        expect(attempt?.scoring.fallbackReason).toBe(testCase.reason);
+      } finally {
+        rmSync(tempRoot, { recursive: true, force: true });
+      }
+    }
+  });
+
+  it('records official evaluator failure as judge_failed without making claims', () => {
+    const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-official-evaluator-fail-'));
+    const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createCleanGitRepo(tempRoot);
+    const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
+    const stubClaude = writeStubClaude(tempRoot);
+    const stubEvaluator = writeStubEvaluator(tempRoot, 1);
+    const env = {
+      ...process.env,
+      CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
+      CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator])
+    };
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8',
+        env
+      });
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--task-payloads',
+          payloadPath,
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1'
+        ],
+        { encoding: 'utf8', env }
+      );
+      const rows = readRows(sessionRoot);
+      const attempt = rows.find(
+        (row) => row.lane_id === 'raw-native' && row.task_id === taskId && row.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('judge_failed');
+      expect(attempt?.scoring).toMatchObject({
+        claimBearing: false,
+        fallbackReason: 'official_evaluator_missing_output',
+        officialEvaluatorFirst: true,
+        officialEvaluatorAttempted: true,
+        officialEvaluatorInvoked: true
+      });
+      const score = JSON.parse(readFileSync(attempt?.score_path ?? '', 'utf8')) as {
+        mode: string;
+        claimBearing: boolean;
+        exitCode: number;
+        exitStatus: number;
+      };
+      expect(score.mode).toBe('diagnostic_fallback');
+      expect(score.claimBearing).toBe(false);
+      expect(score.exitCode).toBe(1);
+      expect(score.exitStatus).toBe(1);
+    } finally {
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('chunks all-ready-lane execution with max-attempts so live runs are resumable', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    // Both invocations share these flags: three repeats requested, two attempts allowed per call.
+    const chunkedRunArgs = [
+      '--baseline-run',
+      '--session',
+      sessionRoot,
+      '--executor',
+      'fake',
+      '--all-ready-lanes',
+      '--repeats',
+      '3',
+      '--max-attempts',
+      '2'
+    ];
+    // Re-reads the manifest and keeps the rows completed by the fake executor.
+    const completedFakeRows = () =>
+      readRows(sessionRoot).filter(
+        (candidate) => candidate.status === 'completed' && candidate.taskExecution.executor === 'fake'
+      );
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      runner(...chunkedRunArgs);
+      expect(completedFakeRows()).toHaveLength(2);
+      // The second identical invocation is expected to bring the completed total to four.
+      runner(...chunkedRunArgs);
+      expect(completedFakeRows()).toHaveLength(4);
+      const sessionJson = readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8');
+      const session = JSON.parse(sessionJson) as { phase: number };
+      expect(session.phase).toBe(41);
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('measures raw-native setup/index as a session artifact and reuses it in attempt rows', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      runner('--setup-index-measure', '--session', sessionRoot, '--lane', 'raw-native');
+      // The measurement is read back from setup-index/raw-native/ inside the session root.
+      const measurementPath = path.join(sessionRoot, 'setup-index', 'raw-native', 'setup-index.json');
+      const measurement = JSON.parse(readFileSync(measurementPath, 'utf8')) as {
+        claimBearing: boolean;
+        setupStatus: string;
+        indexStatus: string;
+        setupLogPath: string;
+      };
+      expect(measurement).toMatchObject({
+        claimBearing: false,
+        setupStatus: 'not_required',
+        indexStatus: 'not_required'
+      });
+      runner(
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--lane',
+        'raw-native',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1'
+      );
+      const attempt = readRows(sessionRoot).find(
+        (candidate) =>
+          candidate.lane_id === 'raw-native' &&
+          candidate.task_id === taskId &&
+          candidate.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('completed');
+      // The attempt row must carry the same setup log path that the measurement recorded.
+      expect(attempt?.setupIndex).toMatchObject({
+        setupStatus: 'not_required',
+        indexStatus: 'not_required',
+        setupDurationMs: 0,
+        indexDurationMs: 0,
+        setupLogPath: measurement.setupLogPath
+      });
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('fails a ready non-raw lane closed when setup/index measurement is missing', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      // No setup-index measure or import step precedes this codebase-context run.
+      runner(
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--lane',
+        'codebase-context',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1'
+      );
+      const attempt = readRows(sessionRoot).find(
+        (candidate) =>
+          candidate.lane_id === 'codebase-context' &&
+          candidate.task_id === taskId &&
+          candidate.repeat_index === 1
+      );
+      // The run itself exits zero; the failure is recorded on the manifest row.
+      expect(attempt?.status).toBe('setup_failed');
+      expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement');
+      expect(attempt?.setupIndex.setupStatus).toBe('setup_failed');
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('imports setup/index evidence for ready non-raw lanes before task execution', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      // Log files are written inside the session root; the import descriptor sits beside it.
+      const logsDir = path.join(sessionRoot, 'manual-setup-index-logs', 'codebase-context');
+      mkdirSync(logsDir, { recursive: true });
+      const setupLogPath = path.join(logsDir, 'setup.stdout.log');
+      const indexLogPath = path.join(logsDir, 'index.stdout.log');
+      writeFileSync(setupLogPath, 'setup completed\n', 'utf8');
+      writeFileSync(indexLogPath, 'index completed\n', 'utf8');
+      const importPath = path.join(path.dirname(sessionRoot), 'codebase-context-setup-index-import.json');
+      const importedEvidence = {
+        laneId: 'codebase-context',
+        claimBearing: false,
+        setupStatus: 'completed',
+        indexStatus: 'completed',
+        setupDurationMs: 12,
+        indexDurationMs: 34,
+        setupLogPath,
+        indexLogPath
+      };
+      writeFileSync(importPath, JSON.stringify(importedEvidence, null, 2), 'utf8');
+      runner(
+        '--setup-index-import',
+        '--session',
+        sessionRoot,
+        '--lane',
+        'codebase-context',
+        '--input',
+        importPath
+      );
+      runner(
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--lane',
+        'codebase-context',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1'
+      );
+      const attempt = readRows(sessionRoot).find(
+        (candidate) =>
+          candidate.lane_id === 'codebase-context' &&
+          candidate.task_id === taskId &&
+          candidate.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('completed');
+      // The attempt row must echo the imported statuses, durations and log paths.
+      expect(attempt?.setupIndex).toMatchObject({
+        setupStatus: 'completed',
+        indexStatus: 'completed',
+        setupDurationMs: 12,
+        indexDurationMs: 34,
+        setupLogPath,
+        indexLogPath
+      });
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('rejects forged or failed setup/index imports before non-raw task execution', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    // Argument list that imports one evidence descriptor into the codebase-context lane.
+    const importArgs = (inputPath: string): string[] => [
+      runnerScript,
+      '--setup-index-import',
+      '--session',
+      sessionRoot,
+      '--lane',
+      'codebase-context',
+      '--input',
+      inputPath
+    ];
+    // Serialises an evidence descriptor next to the session root and returns its path.
+    const writeImport = (fileName: string, evidence: Record<string, unknown>): string => {
+      const target = path.join(path.dirname(sessionRoot), fileName);
+      writeFileSync(target, JSON.stringify(evidence), 'utf8');
+      return target;
+    };
+    try {
+      execFileSync('node', [runnerScript, '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8' });
+      const logsDir = path.join(sessionRoot, 'manual-setup-index-logs', 'codebase-context');
+      mkdirSync(logsDir, { recursive: true });
+      const setupLogPath = path.join(logsDir, 'setup.stdout.log');
+      const indexLogPath = path.join(logsDir, 'index.stdout.log');
+      writeFileSync(setupLogPath, 'setup failed\n', 'utf8');
+      writeFileSync(indexLogPath, 'index skipped\n', 'utf8');
+
+      // Case 1: the descriptor names a different lane than the --lane flag.
+      const wrongLaneImport = writeImport('wrong-lane-import.json', {
+        laneId: 'raw-native',
+        claimBearing: false,
+        setupStatus: 'completed',
+        indexStatus: 'completed',
+        setupDurationMs: 1,
+        indexDurationMs: 1,
+        setupLogPath,
+        indexLogPath
+      });
+      const wrongLane = spawnSync('node', importArgs(wrongLaneImport), { encoding: 'utf8' });
+      expect(wrongLane.status).toBe(1);
+      expect(wrongLane.stderr).toContain('laneId mismatch');
+
+      // Case 2: the logs live in a sibling directory whose name only starts with the session root.
+      const siblingDir = `${sessionRoot}-evil`;
+      mkdirSync(siblingDir, { recursive: true });
+      const siblingSetupLog = path.join(siblingDir, 'setup.stdout.log');
+      const siblingIndexLog = path.join(siblingDir, 'index.stdout.log');
+      writeFileSync(siblingSetupLog, 'setup forged\n', 'utf8');
+      writeFileSync(siblingIndexLog, 'index forged\n', 'utf8');
+      const outsideImport = writeImport('outside-import.json', {
+        laneId: 'codebase-context',
+        claimBearing: false,
+        setupStatus: 'completed',
+        indexStatus: 'completed',
+        setupDurationMs: 1,
+        indexDurationMs: 1,
+        setupLogPath: siblingSetupLog,
+        indexLogPath: siblingIndexLog
+      });
+      const outside = spawnSync('node', importArgs(outsideImport), { encoding: 'utf8' });
+      expect(outside.status).toBe(1);
+      expect(outside.stderr).toContain('inside session root');
+
+      // Case 3: a descriptor reporting a failed setup. The import must exit zero
+      // (execFileSync throws otherwise), yet the later attempt is still recorded as setup_failed.
+      const failedImport = writeImport('failed-import.json', {
+        laneId: 'codebase-context',
+        claimBearing: false,
+        setupStatus: 'setup_failed',
+        indexStatus: 'not_required',
+        setupDurationMs: 0,
+        indexDurationMs: 0,
+        setupLogPath,
+        indexLogPath
+      });
+      execFileSync('node', importArgs(failedImport), { encoding: 'utf8' });
+      const singleAttemptArgs = [
+        runnerScript,
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--lane',
+        'codebase-context',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1'
+      ];
+      execFileSync('node', singleAttemptArgs, { encoding: 'utf8' });
+      const attempt = readRows(sessionRoot).find(
+        (candidate) =>
+          candidate.lane_id === 'codebase-context' &&
+          candidate.task_id === taskId &&
+          candidate.repeat_index === 1
+      );
+      expect(attempt?.status).toBe('setup_failed');
+      expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement');
+    } finally {
+      // The directory to delete is four levels above sessionRoot; it also holds the
+      // descriptors and the "-evil" sibling directory created above.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('validates diagnostic codebase-context baseline arms as non-claim-bearing side evidence', () => {
+    // Only the fixture file is passed to the runner; no --session flag is involved.
+    const fixturePath = 'tests/fixtures/contextbench-codebase-context-baseline-arms.json';
+    const cliArgs = ['scripts/contextbench-runner.mjs', '--baseline-validate-arms', fixturePath];
+    const output = execFileSync('node', cliArgs, { encoding: 'utf8' });
+    expect(output).toContain('baseline arm validation passed');
+  });
+
+  it('can record diagnostic codebase-context arm smoke rows separate from required reservations', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      runner(
+        '--baseline-run-codebase-context-arms',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--task-id',
+        taskId,
+        '--repeats',
+        '1'
+      );
+      // Diagnostic rows are recognised by their run_id prefix.
+      const diagnosticRows = readRows(sessionRoot).filter((candidate) =>
+        candidate.run_id.startsWith('codebase-context-current')
+      );
+      expect(diagnosticRows.length).toBeGreaterThanOrEqual(3);
+      expect(diagnosticRows.every((candidate) => candidate.lane_id === 'codebase-context')).toBe(true);
+      expect(diagnosticRows.every((candidate) => candidate.scoring.claimBearing === false)).toBe(true);
+      // The reservation file must still list exactly 20 * 6 * 3 entries after the arm run.
+      const reservationsJson = readFileSync(path.join(sessionRoot, 'slot-reservations.json'), 'utf8');
+      const reservations = JSON.parse(reservationsJson) as { reservations: unknown[] };
+      expect(reservations.reservations).toHaveLength(20 * 6 * 3);
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('runs Phase 42 verification as read-only artifact-derived evidence and fails diagnostic sessions closed', () => {
+    const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-phase42-verify-'));
+    const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+    const reportPath = path.join(tempRoot, 'phase42-report.json');
+    const sessionFile = path.join(sessionRoot, 'BASELINE-SESSION.json');
+    const taskId = manifest.tasks[0].instance_id;
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    try {
+      execFileSync('node', [runnerScript, '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8' });
+      const singleAttemptArgs = [
+        runnerScript,
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--lane',
+        'raw-native',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1'
+      ];
+      execFileSync('node', singleAttemptArgs, { encoding: 'utf8' });
+      // Capture the session file so it can be compared byte-for-byte after verification.
+      const sessionBefore = readFileSync(sessionFile, 'utf8');
+      const verifyArgs = [
+        runnerScript,
+        '--phase42-verify',
+        '--session',
+        sessionRoot,
+        '--out',
+        reportPath,
+        '--quiet'
+      ];
+      // spawnSync rather than execFileSync: the non-zero exit is the expected outcome.
+      const result = spawnSync('node', verifyArgs, { encoding: 'utf8' });
+      expect(result.status).toBe(1);
+      expect(result.stdout).toContain('phase42 verification failed');
+      expect(result.stderr).toContain('phase42 verification failed');
+      const report = JSON.parse(readFileSync(reportPath, 'utf8')) as {
+        claimPass: boolean;
+        diagnosticOnly: boolean;
+        protocolClaimAllowed: boolean;
+        expectedTotalRows: number;
+        requiredRows: number;
+        supplementalRows: number;
+        failureCounts: Record<string, number>;
+        blockedClaims: string[];
+      };
+      expect(report).toMatchObject({
+        claimPass: false,
+        diagnosticOnly: true,
+        protocolClaimAllowed: false,
+        expectedTotalRows: 20 * 6 * 3,
+        requiredRows: 20 * 2 * 3 + 1,
+        supplementalRows: 0
+      });
+      const { failureCounts, blockedClaims } = report;
+      expect(failureCounts.protocol_claims_disabled).toBe(1);
+      expect(failureCounts.denominator_count_mismatch).toBe(1);
+      expect(failureCounts.official_evaluator_missing).toBeGreaterThan(0);
+      expect(failureCounts.missing_required_run).toBeGreaterThan(0);
+      expect(blockedClaims).toContain('Phase 42 passed');
+      // Verification must leave BASELINE-SESSION.json untouched.
+      expect(readFileSync(sessionFile, 'utf8')).toBe(sessionBefore);
+    } finally {
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('classifies diagnostic baseline arms as supplemental during Phase 42 verification', () => {
+    const tempRoot = mkdtempSync(path.join(tmpdir(), 'contextbench-phase42-arms-'));
+    const sessionRoot = path.join(tempRoot, 'benchmark-runs', 'contextbench', 'phase41', 'runner-smoke');
+    const reportPath = path.join(tempRoot, 'phase42-report.json');
+    const taskId = manifest.tasks[0].instance_id;
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    try {
+      execFileSync('node', [runnerScript, '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8' });
+      const armRunArgs = [
+        runnerScript,
+        '--baseline-run-codebase-context-arms',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--task-id',
+        taskId,
+        '--repeats',
+        '1'
+      ];
+      execFileSync('node', armRunArgs, { encoding: 'utf8' });
+      const verifyArgs = [runnerScript, '--phase42-verify', '--session', sessionRoot, '--out', reportPath];
+      // spawnSync rather than execFileSync: the non-zero exit is the expected outcome.
+      const result = spawnSync('node', verifyArgs, { encoding: 'utf8' });
+      expect(result.status).toBe(1);
+      const report = JSON.parse(readFileSync(reportPath, 'utf8')) as {
+        requiredRows: number;
+        supplementalRows: number;
+        failureCounts: Record<string, number>;
+      };
+      const { requiredRows, supplementalRows, failureCounts } = report;
+      expect(requiredRows).toBe(20 * 2 * 3);
+      expect(supplementalRows).toBeGreaterThanOrEqual(3);
+      // An absent unexpected_run_row counter is treated the same as zero.
+      expect(failureCounts.unexpected_run_row ?? 0).toBe(0);
+      expect(failureCounts.denominator_count_mismatch).toBe(1);
+    } finally {
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('blocks baseline seal when terminal row completeness lacks Phase 42 claim evidence', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    try {
+      execFileSync('node', [runnerScript, '--baseline-snapshot', '--out', sessionRoot], { encoding: 'utf8' });
+      const fullRunArgs = [
+        runnerScript,
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--all-ready-lanes',
+        '--repeats',
+        '3'
+      ];
+      execFileSync('node', fullRunArgs, { encoding: 'utf8' });
+      expect(readRows(sessionRoot)).toHaveLength(20 * 6 * 3);
+
+      // spawnSync rather than execFileSync: the seal is expected to exit non-zero.
+      const sealArgs = [runnerScript, '--baseline-seal', '--session', sessionRoot];
+      const result = spawnSync('node', sealArgs, { encoding: 'utf8' });
+
+      expect(result.status).toBe(1);
+      // stdout shows session validation passing and the Phase 42 check failing;
+      // stderr carries the seal refusal itself.
+      expect(result.stdout).toContain('baseline session validation passed');
+      expect(result.stdout).toContain('phase42 verification failed');
+      expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate');
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
new file mode 100644
index 0000000..a1b808d
--- /dev/null
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -0,0 +1,944 @@
+import { execFileSync } from 'node:child_process';
+import { chmodSync, mkdtempSync, readFileSync, realpathSync, rmSync, writeFileSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { describe, expect, it, vi } from 'vitest';
+import {
+  CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA,
+  CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS,
+  parseStructuredAnswer
+} from '../src/eval/contextbench-answer.js';
+import manifestFixture from './fixtures/contextbench-task-manifest.json';
+
+// Fields of a parsed run-manifest.jsonl row that the tests in this file read.
+type ManifestRow = {
+  run_id: string;
+  status: string;
+  raw_trace_path: string;
+  structured_answer_path: string;
+  trajectory_path: string;
+  scoring: { claimBearing: boolean };
+};
+
+// Shape the task manifest fixture is cast to; only these two task fields are declared.
+type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }> };
+
+const manifest = manifestFixture as TaskManifest;
+// Set the vitest per-test timeout; the tests below spawn child processes.
+vi.setConfig({ testTimeout: 30000 });
+
+/**
+ * Builds a session directory path nested under a freshly created temporary
+ * directory: <tmp>/benchmark-runs/contextbench/<phase>/schema-gate-smoke.
+ * Only the temporary directory itself is created here.
+ *
+ * @param phase Phase segment used in both the temp-dir prefix and the path.
+ * @returns The session root path, four levels below the temporary directory.
+ */
+function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
+  const scratchDir = mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`));
+  const nestedSegments = ['benchmark-runs', 'contextbench', phase, 'schema-gate-smoke'];
+  return path.join(scratchDir, ...nestedSegments);
+}
+
+/**
+ * Reads the session's run-manifest.jsonl and parses each non-blank line as a row.
+ *
+ * @param sessionRoot Directory that contains run-manifest.jsonl.
+ * @returns One parsed row per non-blank line; an empty array when the manifest has no rows.
+ * @throws If the file cannot be read or a non-blank line is not valid JSON.
+ */
+function readRows(sessionRoot: string): ManifestRow[] {
+  return (
+    readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8')
+      .split('\n')
+      // Drop blank lines so an empty manifest or stray newline never reaches JSON.parse.
+      .filter((line) => line.trim() !== '')
+      .map((line) => JSON.parse(line) as ManifestRow)
+  );
+}
+
+/**
+ * Writes a stand-in `claude` program into a fresh temporary directory and builds
+ * a child-process environment that points at it.
+ *
+ * The stub script optionally records its working directory and stdin to the
+ * capture paths, then prints `stdout` once stdin has ended.
+ *
+ * @param stdout Text the stub writes to its standard output.
+ * @param capture Optional files that receive the stub's cwd and stdin.
+ * @returns The stub directory and the environment: PATH/Path prefixed with that
+ *   directory, plus the CONTEXTBENCH_CLAUDE_COMMAND and CLAUDE_STUB_* variables.
+ */
+function createClaudeStub(
+  stdout: string,
+  capture?: { cwdPath?: string; stdinPath?: string }
+): { stubDir: string; env: NodeJS.ProcessEnv } {
+  const stubDir = mkdtempSync(path.join(tmpdir(), 'contextbench-claude-stub-'));
+  const stubScript = path.join(stubDir, 'claude-stub.cjs');
+  const stubSource = [
+    "const fs = require('node:fs');",
+    "if (process.env.CLAUDE_STUB_CWD_PATH) fs.writeFileSync(process.env.CLAUDE_STUB_CWD_PATH, process.cwd(), 'utf8');",
+    "let stdin = '';",
+    "process.stdin.setEncoding('utf8');",
+    "process.stdin.on('data', (chunk) => { stdin += chunk; });",
+    "process.stdin.on('end', () => {",
+    "  if (process.env.CLAUDE_STUB_STDIN_PATH) fs.writeFileSync(process.env.CLAUDE_STUB_STDIN_PATH, stdin, 'utf8');",
+    "  process.stdout.write(process.env.CLAUDE_STUB_STDOUT || '');",
+    '});',
+    'process.stdin.resume();'
+  ].join('\n');
+  writeFileSync(stubScript, stubSource, 'utf8');
+
+  // Batch-file launcher named claude.cmd.
+  const cmdLauncher = path.join(stubDir, 'claude.cmd');
+  writeFileSync(cmdLauncher, '@echo off\r\nnode "%~dp0claude-stub.cjs"\r\n', 'utf8');
+
+  // Shell launcher named claude, marked executable.
+  const shellStub = path.join(stubDir, 'claude');
+  writeFileSync(shellStub, '#!/bin/sh\nnode "$(dirname "$0")/claude-stub.cjs"\n', 'utf8');
+  chmodSync(shellStub, 0o755);
+
+  const pathPrefix = `${stubDir}${path.delimiter}`;
+  const env: NodeJS.ProcessEnv = {
+    ...process.env,
+    PATH: `${pathPrefix}${process.env.PATH ?? ''}`,
+    Path: `${pathPrefix}${process.env.Path ?? process.env.PATH ?? ''}`,
+    CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubScript]),
+    CLAUDE_STUB_STDOUT: stdout,
+    CLAUDE_STUB_CWD_PATH: capture?.cwdPath,
+    CLAUDE_STUB_STDIN_PATH: capture?.stdinPath
+  };
+  return { stubDir, env };
+}
+
+/**
+ * Writes a task-payload file containing a single task entry.
+ *
+ * @param filePath Destination file.
+ * @param taskId Stored as the entry's `instance_id`.
+ * @param payload Extra fields spread into the entry after `instance_id`.
+ */
+function writeTaskPayloads(
+  filePath: string,
+  taskId: string,
+  payload: Record<string, unknown>
+): void {
+  const document = { tasks: [{ instance_id: taskId, ...payload }] };
+  // Pretty-printed with two-space indentation and a trailing newline.
+  const serialized = JSON.stringify(document, null, 2);
+  writeFileSync(filePath, `${serialized}\n`, 'utf8');
+}
+
+/**
+ * Creates a throwaway git repository containing a single empty commit.
+ *
+ * @returns Absolute, symlink-resolved path of the new repository; the caller
+ *   is responsible for deleting it.
+ * @throws If `git init` or `git commit` exits non-zero.
+ */
+function createGitCheckout(): string {
+  // Resolve symlinks in the temp path (e.g. /var -> /private/var on macOS) so the
+  // returned value equals what a child process started there reports as process.cwd().
+  const repoPath = realpathSync(mkdtempSync(path.join(tmpdir(), 'contextbench-task-repo-')));
+  execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' });
+  // Inline identity and disabled signing keep the commit independent of the user's git config.
+  execFileSync(
+    'git',
+    [
+      '-c',
+      'user.name=ContextBench Test',
+      '-c',
+      'user.email=contextbench@example.invalid',
+      '-c',
+      'commit.gpgsign=false',
+      'commit',
+      '--allow-empty',
+      '-m',
+      'init'
+    ],
+    { cwd: repoPath, encoding: 'utf8' }
+  );
+  return repoPath;
+}
+
+/**
+ * Returns the fixed structured answer that the adapter stub script embeds.
+ * A new object is built on every call.
+ */
+function structuredStubAnswer(): Record<string, unknown> {
+  const citedFile = 'README.md';
+  const firstLineOnly = { start: 1, end: 1 };
+  return {
+    answer: { adapterSmoke: true },
+    confidence: 'medium',
+    evidence: [{ file: citedFile, lineRange: firstLineOnly, reason: 'stubbed adapter evidence' }],
+    filesReferenced: [citedFile],
+    symbolsReferenced: [],
+    unsupportedClaims: [],
+    readyToEdit: false
+  };
+}
+
+/**
+ * Writes a Node stub script for one adapter executor into a fresh temporary
+ * directory and builds a child-process environment that points at it.
+ *
+ * The stub optionally records its cwd and argv to the capture paths, then emits
+ * the fixed structured answer in an executor-specific way: codex writes it to the
+ * file named after `--output-last-message`, gemini wraps it in `{ response }`, and
+ * opencode wraps it in a text part. Any other executor value exits with code 2.
+ *
+ * @param executor Which adapter the stub imitates.
+ * @param capture Optional files that receive the stub's cwd and JSON-encoded args.
+ * @returns The stub directory and the environment carrying
+ *   CONTEXTBENCH_<EXECUTOR>_COMMAND and the ADAPTER_STUB_* variables.
+ */
+function createAdapterStub(
+  executor: 'codex' | 'gemini' | 'opencode',
+  capture?: { cwdPath?: string; argsPath?: string }
+): { stubDir: string; env: NodeJS.ProcessEnv } {
+  const stubDir = mkdtempSync(path.join(tmpdir(), `contextbench-${executor}-stub-`));
+  const stubScript = path.join(stubDir, `${executor}-stub.cjs`);
+  // Stringified twice: the result is a JavaScript string literal whose value is the answer JSON.
+  const answerLiteral = JSON.stringify(JSON.stringify(structuredStubAnswer()));
+  const stubSource = [
+    "const fs = require('node:fs');",
+    'const executor = process.env.ADAPTER_STUB_EXECUTOR;',
+    'const args = process.argv.slice(2);',
+    "if (process.env.ADAPTER_STUB_CWD_PATH) fs.writeFileSync(process.env.ADAPTER_STUB_CWD_PATH, process.cwd(), 'utf8');",
+    "if (process.env.ADAPTER_STUB_ARGS_PATH) fs.writeFileSync(process.env.ADAPTER_STUB_ARGS_PATH, JSON.stringify(args), 'utf8');",
+    `const answer = ${answerLiteral};`,
+    "if (executor === 'codex') {",
+    "  const outputIndex = args.indexOf('--output-last-message');",
+    "  if (outputIndex >= 0) fs.writeFileSync(args[outputIndex + 1], answer, 'utf8');",
+    "  process.stdout.write(JSON.stringify({ type: 'done' }) + '\\n');",
+    "} else if (executor === 'gemini') {",
+    '  process.stdout.write(JSON.stringify({ response: answer }));',
+    "} else if (executor === 'opencode') {",
+    "  process.stdout.write(JSON.stringify({ type: 'text', part: { type: 'text', text: answer } }) + '\\n');",
+    '} else {',
+    "  process.stderr.write('unknown adapter stub executor');",
+    '  process.exitCode = 2;',
+    '}'
+  ].join('\n');
+  writeFileSync(stubScript, stubSource, 'utf8');
+
+  const commandVariable = `CONTEXTBENCH_${executor.toUpperCase()}_COMMAND`;
+  const env: NodeJS.ProcessEnv = {
+    ...process.env,
+    [commandVariable]: JSON.stringify([process.execPath, stubScript]),
+    ADAPTER_STUB_EXECUTOR: executor,
+    ADAPTER_STUB_CWD_PATH: capture?.cwdPath,
+    ADAPTER_STUB_ARGS_PATH: capture?.argsPath
+  };
+  return { stubDir, env };
+}
+
+describe('ContextBench Phase 40 schema gate', () => {
+  it('exports the structured answer schema used to constrain live Claude output', () => {
+    const schema = CONTEXTBENCH_STRUCTURED_ANSWER_JSON_SCHEMA;
+    // Top level: a closed object whose required list equals the exported field list.
+    expect(schema).toMatchObject({
+      type: 'object',
+      additionalProperties: false,
+      required: [...CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS]
+    });
+    expect(schema.properties?.confidence).toMatchObject({
+      type: 'string',
+      enum: ['low', 'medium', 'high']
+    });
+    const evidenceSchema = schema.properties?.evidence;
+    expect(evidenceSchema).toMatchObject({ type: 'array' });
+    // `items` may be an array or a single schema; use the first entry in the array case.
+    const evidenceItems = Array.isArray(evidenceSchema?.items)
+      ? evidenceSchema.items[0]
+      : evidenceSchema?.items;
+    // Both the evidence entry and its nested lineRange must reject extra properties.
+    expect(evidenceItems).toMatchObject({ additionalProperties: false });
+    expect(evidenceItems?.properties?.lineRange).toMatchObject({ additionalProperties: false });
+  });
+
+  it('passes the shared schema through Claude CLI arguments without running a live call', () => {
+    // The runner's stdout is parsed as the JSON-encoded argument list.
+    const printed = execFileSync(
+      'node',
+      ['scripts/contextbench-runner.mjs', '--print-claude-args', '--model', 'haiku'],
+      { encoding: 'utf8' }
+    );
+    const args = JSON.parse(printed) as string[];
+    const expectedFlags = ['--print', '--output-format', 'json', '--model', 'haiku', '--json-schema'];
+    expect(args).toEqual(expect.arrayContaining(expectedFlags));
+    const schemaIndex = args.indexOf('--json-schema');
+    expect(schemaIndex).toBeGreaterThan(-1);
+    // The schema is the JSON string in the argument right after --json-schema.
+    const schemaArgument = args[schemaIndex + 1] ?? '{}';
+    const schema = JSON.parse(schemaArgument) as {
+      required?: string[];
+      properties?: Record<string, unknown>;
+    };
+    expect(schema.required).toEqual([...CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS]);
+    expect(schema.properties).toHaveProperty('readyToEdit');
+  });
+
+  it('keeps invalid structured output terminal instead of repairing prose into success', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    // Runs the runner CLI synchronously; a non-zero exit throws and fails the test.
+    const runner = (...cliArgs: string[]): string =>
+      execFileSync('node', ['scripts/contextbench-runner.mjs', ...cliArgs], { encoding: 'utf8' });
+    try {
+      runner('--baseline-snapshot', '--out', sessionRoot);
+      // --fake-answer-mode invalid_schema selects the fake executor's invalid-answer mode.
+      runner(
+        '--baseline-run',
+        '--session',
+        sessionRoot,
+        '--executor',
+        'fake',
+        '--fake-answer-mode',
+        'invalid_schema',
+        '--lane',
+        'raw-native',
+        '--task-id',
+        taskId,
+        '--repeat',
+        '1',
+        '--max-attempts',
+        '1'
+      );
+      runner('--baseline-validate', '--session', sessionRoot);
+
+      const row = readRows(sessionRoot).find(
+        (candidate) =>
+          candidate.status === 'invalid_schema' && candidate.scoring.claimBearing === false
+      );
+      expect(row).toBeTruthy();
+      const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as {
+        structuredAnswerParseErrors: string[];
+      };
+      expect(rawTrace.structuredAnswerParseErrors).toContain('invalid_json');
+      // The stored answer must flag itself as unsupported.
+      const storedAnswerJson = readFileSync(row?.structured_answer_path ?? '', 'utf8');
+      const fallbackAnswer = JSON.parse(storedAnswerJson) as { unsupportedClaims: string[] };
+      expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer');
+    } finally {
+      // The directory to delete is four levels above sessionRoot.
+      const tempRoot = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(tempRoot, { recursive: true, force: true });
+    }
+  });
+
+  it('accepts Claude JSON envelope structured_output without a paid live call', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createGitCheckout();
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'task-payloads.json');
+    const cwdCapturePath = path.join(payloadDir, 'claude-cwd.txt');
+    const stdinCapturePath = path.join(payloadDir, 'claude-stdin.txt');
+    const answer = {
+      answer: 'ok',
+      confidence: 'medium',
+      evidence: [{ file: 'src/a.ts', lineRange: { start: 1, end: 1 }, reason: 'stubbed evidence' }],
+      filesReferenced: ['src/a.ts'],
+      symbolsReferenced: [],
+      unsupportedClaims: [],
+      readyToEdit: false
+    };
+    writeTaskPayloads(payloadPath, taskId, {
+      problem_statement: 'Fix the failing ContextBench task without using hidden gold context.',
+      repo_checkout_path: repoPath
+    });
+    // The stub prints a Claude-style result envelope whose structured_output is the answer above.
+    const { stubDir, env } = createClaudeStub(
+      JSON.stringify({
+        type: 'result',
+        subtype: 'success',
+        is_error: false,
+        structured_output: answer
+      }),
+      { cwdPath: cwdCapturePath, stdinPath: stdinCapturePath }
+    );
+    try {
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        {
+          encoding: 'utf8'
+        }
+      );
+      // Only this invocation receives the stub environment.
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1',
+          '--task-payloads',
+          payloadPath,
+          '--max-attempts',
+          '1'
+        ],
+        { encoding: 'utf8', env }
+      );
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        {
+          encoding: 'utf8'
+        }
+      );
+      const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude'));
+      expect(row).toMatchObject({ status: 'completed' });
+      // Declare every trace field asserted below so the cast type-checks under tsc.
+      const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as {
+        structuredAnswerParseErrors: string[];
+        claudeArgs: string[];
+        workingDirectory: string;
+        taskContext: Record<string, unknown>;
+      };
+      expect(rawTrace.structuredAnswerParseErrors).toEqual([]);
+      expect(rawTrace.claudeArgs).toEqual(expect.arrayContaining(['--output-format', 'json']));
+      expect(rawTrace.workingDirectory).toBe(repoPath);
+      expect(rawTrace.taskContext).toMatchObject({
+        materialized: true,
+        repoCheckoutPath: repoPath,
+        verificationStrict: false
+      });
+      // The stub recorded its own cwd and stdin; it must have run inside the checkout.
+      expect(readFileSync(cwdCapturePath, 'utf8')).toBe(repoPath);
+      const stdin = readFileSync(stdinCapturePath, 'utf8');
+      expect(stdin).toContain('Problem statement:');
+      expect(stdin).toContain('Fix the failing ContextBench task');
+      // The prompt must not contain this placeholder string.
+      expect(stdin).not.toContain('dataset_field:problem_statement');
+      const structuredAnswer = JSON.parse(
+        readFileSync(row?.structured_answer_path ?? '', 'utf8')
+      ) as {
+        answer: string;
+      };
+      expect(structuredAnswer.answer).toBe('ok');
+      const trajectory = JSON.parse(readFileSync(row?.trajectory_path ?? '', 'utf8')) as {
+        traj_data: { pred_files: string[] };
+      };
+      expect(trajectory.traj_data.pred_files).toContain('src/a.ts');
+    } finally {
+      // Remove the session's temp root (four levels above sessionRoot) and the three helper dirs.
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+        recursive: true,
+        force: true
+      });
+      rmSync(repoPath, { recursive: true, force: true });
+      rmSync(payloadDir, { recursive: true, force: true });
+      rmSync(stubDir, { recursive: true, force: true });
+    }
+  });
+
+  // Schema-drift guard: a `structured_output` root key that the frozen answer schema does not
+  // declare must be recorded as an `invalid_schema` row with a parse error naming that key.
+  it('rejects Claude structured_output with fields outside the frozen schema', () => {
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createGitCheckout();
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'task-payloads.json');
+    // Stub answer plus one extra root property.
+    const driftedAnswer = { ...structuredStubAnswer(), unexpectedRoot: true };
+    writeTaskPayloads(payloadPath, taskId, {
+      problem_statement: 'Reject schema drift from the executor output.',
+      repo_checkout_path: repoPath
+    });
+    // Result envelope handed to the Claude stub.
+    const stubReply = JSON.stringify({
+      type: 'result',
+      subtype: 'success',
+      is_error: false,
+      structured_output: driftedAnswer
+    });
+    const { stubDir, env } = createClaudeStub(stubReply);
+    // Flag/value pairs are grouped per line; the resulting argv order is unchanged.
+    const baselineRunArgs = [
+      runnerScript,
+      '--baseline-run',
+      ...['--session', sessionRoot],
+      ...['--executor', 'claude'],
+      ...['--model', 'haiku'],
+      ...['--lane', 'raw-native'],
+      ...['--task-id', taskId],
+      ...['--repeat', '1'],
+      ...['--task-payloads', payloadPath],
+      ...['--max-attempts', '1']
+    ];
+    try {
+      // Create the session snapshot first, then run a single slot with the stub environment.
+      execFileSync('node', [runnerScript, '--baseline-snapshot', '--out', sessionRoot], {
+        encoding: 'utf8'
+      });
+      execFileSync('node', baselineRunArgs, { encoding: 'utf8', env });
+
+      const claudeRow = readRows(sessionRoot).find((entry) => entry.run_id.endsWith('-claude'));
+      expect(claudeRow).toMatchObject({ status: 'invalid_schema' });
+
+      const tracePath = claudeRow?.raw_trace_path ?? '';
+      const { structuredAnswerParseErrors } = JSON.parse(readFileSync(tracePath, 'utf8')) as {
+        structuredAnswerParseErrors: string[];
+      };
+      expect(structuredAnswerParseErrors).toEqual(
+        expect.arrayContaining(['additional_root_field_unexpectedRoot'])
+      );
+    } finally {
+      // Remove the temp directories used by this test; the first entry is the directory four
+      // levels above the session root.
+      const tempDirs = [
+        path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))),
+        repoPath,
+        payloadDir,
+        stubDir
+      ];
+      for (const dir of tempDirs) rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  // Without `--task-payloads` the slot must stop at task setup: the row is `task_setup_failed`,
+  // the trace has a null exit status, and the task context lists the three missing inputs.
+  it('blocks a real executor slot before spawn when task payloads are missing', () => {
+    const runnerScript = 'scripts/contextbench-runner.mjs';
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    // Neither '--task-payloads' nor a stub environment is supplied for this run.
+    const baselineRunArgs = [
+      runnerScript,
+      '--baseline-run',
+      ...['--session', sessionRoot],
+      ...['--executor', 'claude'],
+      ...['--model', 'haiku'],
+      ...['--lane', 'raw-native'],
+      ...['--task-id', taskId],
+      ...['--repeat', '1'],
+      ...['--max-attempts', '1']
+    ];
+    // Setup errors the task context must report; additional entries are tolerated.
+    const expectedSetupErrors = [
+      'missing_task_payload',
+      'missing_problem_statement',
+      'missing_repo_checkout_path'
+    ];
+    try {
+      execFileSync(
+        'node',
+        [runnerScript, '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      // execFileSync throws on a non-zero exit, so the blocked run itself must still exit zero.
+      execFileSync('node', baselineRunArgs, { encoding: 'utf8' });
+
+      const claudeRow = readRows(sessionRoot).find((entry) => entry.run_id.endsWith('-claude'));
+      expect(claudeRow).toMatchObject({ status: 'task_setup_failed' });
+
+      const tracePath = claudeRow?.raw_trace_path ?? '';
+      const { exitStatus, taskContext } = JSON.parse(readFileSync(tracePath, 'utf8')) as {
+        exitStatus: number | null;
+        taskContext: { errors: string[]; materialized: boolean };
+      };
+      // The title's "before spawn" is checked through the null exit status in the trace.
+      expect(exitStatus).toBeNull();
+      expect(taskContext.materialized).toBe(false);
+      expect(taskContext.errors).toEqual(expect.arrayContaining(expectedSetupErrors));
+    } finally {
+      // Delete the directory four levels above the session root.
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+        recursive: true,
+        force: true
+      });
+    }
+  });
+
+  it('runs Codex, Gemini, and OpenCode adapters through the materialized task gate without paid calls', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createGitCheckout();
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'adapter-task-payloads.json');
+    const stubs: string[] = []; // stub directories created in the loop below; removed in finally
+    try { // one snapshot, one --baseline-run per adapter stub, one validate, then per-row checks
+      writeTaskPayloads(payloadPath, taskId, {
+        problem_statement: 'Fix the adapter smoke task with materialized input.',
+        repo_checkout_path: repoPath
+      });
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const executors = ['codex', 'gemini', 'opencode'] as const;
+      for (const [index, executor] of executors.entries()) {
+        const cwdPath = path.join(payloadDir, `${executor}-cwd.txt`);
+        const argsPath = path.join(payloadDir, `${executor}-args.json`);
+        const { stubDir, env } = createAdapterStub(executor, { cwdPath, argsPath });
+        stubs.push(stubDir); // recorded before the run so cleanup still happens if the run throws
+        execFileSync(
+          'node',
+          [
+            'scripts/contextbench-runner.mjs',
+            '--baseline-run',
+            '--session',
+            sessionRoot,
+            '--executor',
+            executor,
+            '--model',
+            'stub',
+            '--lane',
+            'raw-native',
+            '--task-id',
+            taskId,
+            '--repeat',
+            String(index + 1), // repeat index 1, 2, 3: a different value for each executor
+            '--task-payloads',
+            payloadPath,
+            '--max-attempts',
+            '1',
+            '--timeout-ms',
+            '60000'
+          ],
+          { encoding: 'utf8', env } // env comes from the stub helper
+        );
+        expect(readFileSync(cwdPath, 'utf8')).toBe(repoPath); // presumably written by the stub with its working directory — confirm in createAdapterStub
+      }
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const rows = readRows(sessionRoot).filter((row) =>
+        executors.some((executor) => row.run_id.endsWith(`-${executor}`)) // keep only rows whose run id ends in one of the three adapter names
+      );
+      expect(rows).toHaveLength(3); // exactly one row per executor
+      for (const row of rows) {
+        expect(row.status).toBe('completed');
+        const rawTrace = JSON.parse(readFileSync(row.raw_trace_path, 'utf8')) as {
+          executor: string;
+          executorSchemaMode: string;
+          executorArgs: string[];
+          taskContext: { materialized: boolean; verificationStrict: boolean };
+          structuredAnswerParseErrors: string[];
+        };
+        expect(rawTrace.taskContext).toMatchObject({
+          materialized: true,
+          verificationStrict: false
+        });
+        expect(rawTrace.structuredAnswerParseErrors).toEqual([]);
+        if (rawTrace.executor === 'codex') { // only Codex is expected to get a schema flag
+          expect(rawTrace.executorSchemaMode).toBe('native_schema');
+          expect(rawTrace.executorArgs).toEqual(expect.arrayContaining(['--output-schema']));
+        } else { // Gemini and OpenCode
+          expect(rawTrace.executorSchemaMode).toBe('prompt_only');
+        }
+      }
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // directory four levels above the session root
+        recursive: true,
+        force: true
+      });
+      rmSync(repoPath, { recursive: true, force: true });
+      rmSync(payloadDir, { recursive: true, force: true });
+      for (const stubDir of stubs) rmSync(stubDir, { recursive: true, force: true });
+    }
+  });
+
+  it('runs diagnostic codebase-context arms through the materialized task gate', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createGitCheckout();
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'arm-task-payloads.json');
+    const cwdCapturePath = path.join(payloadDir, 'arm-claude-cwd.txt'); // read back below and compared with repoPath
+    const stdinCapturePath = path.join(payloadDir, 'arm-claude-stdin.txt'); // read back below to inspect the prompt text
+    const { stubDir, env } = createClaudeStub(
+      JSON.stringify({
+        type: 'result',
+        subtype: 'success',
+        is_error: false,
+        structured_output: structuredStubAnswer()
+      }),
+      { cwdPath: cwdCapturePath, stdinPath: stdinCapturePath }
+    );
+    try {
+      writeTaskPayloads(payloadPath, taskId, {
+        problem_statement: 'Run the diagnostic arm with materialized task text.',
+        repo_checkout_path: repoPath
+      });
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run-codebase-context-arms', // arm mode: this call passes no '--lane' and uses '--repeats' rather than '--repeat'
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--task-id',
+          taskId,
+          '--repeats',
+          '1',
+          '--max-attempts',
+          '1',
+          '--task-payloads',
+          payloadPath
+        ],
+        { encoding: 'utf8', env }
+      );
+      const row = readRows(sessionRoot).find(
+        (candidate) => candidate.scoring && 'baselineArmId' in candidate.scoring // first row whose scoring object has a baselineArmId key
+      );
+      expect(row).toMatchObject({ status: 'completed' });
+      expect(readFileSync(cwdCapturePath, 'utf8')).toBe(repoPath);
+      const stdin = readFileSync(stdinCapturePath, 'utf8');
+      expect(stdin).toContain('Problem statement:');
+      expect(stdin).toContain('Run the diagnostic arm with materialized task text.'); // the payload's problem text must appear in the captured stdin
+      const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as {
+        taskContext: { materialized: boolean; repoCheckoutPath: string };
+      };
+      expect(rawTrace.taskContext).toMatchObject({
+        materialized: true,
+        repoCheckoutPath: repoPath
+      });
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // directory four levels above the session root
+        recursive: true,
+        force: true
+      });
+      rmSync(repoPath, { recursive: true, force: true });
+      rmSync(payloadDir, { recursive: true, force: true });
+      rmSync(stubDir, { recursive: true, force: true });
+    }
+  });
+
+  // Two task-setup failures, each in its own session: a checkout path that does not exist, and
+  // a checkout the runner reports as mismatching the task. Both rows must be `task_setup_failed`.
+  it('blocks a real executor slot before spawn when repo checkout is missing or at the wrong commit', () => {
+    const missingSessionRoot = tempSessionRoot();
+    const wrongCommitSessionRoot = tempSessionRoot();
+    const task = manifest.tasks[0];
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const missingPayloadPath = path.join(payloadDir, 'missing-repo.json');
+    const wrongCommitPayloadPath = path.join(payloadDir, 'wrong-commit.json');
+    const wrongCommitRepo = createGitCheckout();
+    try {
+      writeTaskPayloads(missingPayloadPath, task.instance_id, {
+        problem_statement: 'Problem text exists but the checkout does not.',
+        repo_checkout_path: path.join(payloadDir, 'does-not-exist')
+      });
+      writeTaskPayloads(wrongCommitPayloadPath, task.instance_id, {
+        problem_statement: 'Problem text exists but the checkout commit is wrong.',
+        repo_checkout_path: wrongCommitRepo
+      });
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', missingSessionRoot],
+        { encoding: 'utf8' }
+      );
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          missingSessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          task.instance_id,
+          '--repeat',
+          '1',
+          '--task-payloads',
+          missingPayloadPath,
+          '--max-attempts',
+          '1'
+        ],
+        { encoding: 'utf8' }
+      );
+      const missingRow = readRows(missingSessionRoot).find((candidate) =>
+        candidate.run_id.endsWith('-claude')
+      );
+      // Assert on the row before opening its trace: an absent row then fails here, not as ENOENT.
+      expect(missingRow).toMatchObject({ status: 'task_setup_failed' });
+      const missingTrace = JSON.parse(readFileSync(missingRow?.raw_trace_path ?? '', 'utf8')) as {
+        taskContext: { errors: string[] };
+      };
+      expect(missingTrace.taskContext.errors).toContain('repo_checkout_missing');
+
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', wrongCommitSessionRoot],
+        { encoding: 'utf8' }
+      );
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          wrongCommitSessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          task.instance_id,
+          '--repeat',
+          '1',
+          '--task-payloads',
+          wrongCommitPayloadPath,
+          '--max-attempts',
+          '1'
+        ],
+        { encoding: 'utf8' }
+      );
+      const wrongCommitRow = readRows(wrongCommitSessionRoot).find((candidate) =>
+        candidate.run_id.endsWith('-claude')
+      );
+      expect(wrongCommitRow).toMatchObject({ status: 'task_setup_failed' }); // row first, as above
+      const wrongCommitTracePath = wrongCommitRow?.raw_trace_path ?? '';
+      const wrongCommitTrace = JSON.parse(readFileSync(wrongCommitTracePath, 'utf8')) as {
+        taskContext: { errors: string[]; verificationStrict: boolean };
+      };
+      expect(wrongCommitTrace.taskContext.verificationStrict).toBe(true);
+      expect(wrongCommitTrace.taskContext.errors).toEqual(
+        expect.arrayContaining(['base_commit_mismatch', 'problem_statement_hash_mismatch'])
+      );
+    } finally {
+      for (const root of [missingSessionRoot, wrongCommitSessionRoot]) {
+        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(root)))), {
+          recursive: true,
+          force: true
+        });
+      }
+      rmSync(payloadDir, { recursive: true, force: true });
+      rmSync(wrongCommitRepo, { recursive: true, force: true });
+    }
+  });
+
+  // A file written into the checkout after it was created must fail task setup as dirty.
+  it('blocks a real executor slot before spawn when the repo checkout is dirty', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    const task = manifest.tasks[0];
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'dirty-repo.json');
+    const dirtyRepo = createGitCheckout();
+    try {
+      writeFileSync(path.join(dirtyRepo, 'dirty.txt'), 'dirty checkout', 'utf8');
+      writeTaskPayloads(payloadPath, task.instance_id, {
+        problem_statement: 'Problem text exists but the checkout has local changes.',
+        repo_checkout_path: dirtyRepo
+      });
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          task.instance_id,
+          '--repeat',
+          '1',
+          '--task-payloads',
+          payloadPath,
+          '--max-attempts',
+          '1'
+        ],
+        { encoding: 'utf8' }
+      );
+      const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude'));
+      // Assert on the row before opening its trace: an absent row then fails here, not as ENOENT.
+      expect(row).toMatchObject({ status: 'task_setup_failed' });
+      const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as {
+        taskContext: { errors: string[]; statusShort: string };
+      };
+      expect(rawTrace.taskContext.errors).toContain('repo_checkout_dirty');
+      expect(rawTrace.taskContext.statusShort).toContain('dirty.txt');
+    } finally {
+      const sessionTempDir = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      for (const dir of [sessionTempDir, payloadDir, dirtyRepo]) {
+        rmSync(dir, { recursive: true, force: true });
+      }
+    }
+  });
+
+  it('records Claude CLI rate limits as tool errors, not answer schema failures', () => {
+    const sessionRoot = tempSessionRoot();
+    const taskId = manifest.tasks[0].instance_id;
+    const repoPath = createGitCheckout();
+    const payloadDir = mkdtempSync(path.join(tmpdir(), 'contextbench-task-payloads-'));
+    const payloadPath = path.join(payloadDir, 'task-payloads-rate-limit.json');
+    writeTaskPayloads(payloadPath, taskId, {
+      problem_statement: 'Fix the task; this test exercises rate-limit classification.',
+      repo_checkout_path: repoPath
+    });
+    const { stubDir, env } = createClaudeStub(
+      "You've hit your limit · resets 8pm (Europe/Madrid)\n" // plain text instead of a JSON result envelope
+    );
+    try {
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        {
+          encoding: 'utf8'
+        }
+      );
+      execFileSync(
+        'node',
+        [
+          'scripts/contextbench-runner.mjs',
+          '--baseline-run',
+          '--session',
+          sessionRoot,
+          '--executor',
+          'claude',
+          '--model',
+          'haiku',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          taskId,
+          '--repeat',
+          '1',
+          '--task-payloads',
+          payloadPath,
+          '--max-attempts',
+          '1'
+        ],
+        { encoding: 'utf8', env }
+      );
+      execFileSync( // the session must still validate after a tool_error row was written
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        {
+          encoding: 'utf8'
+        }
+      );
+      const row = readRows(sessionRoot).find((candidate) => candidate.run_id.endsWith('-claude'));
+      expect(row).toMatchObject({ status: 'tool_error' }); // the row status must be tool_error
+      const rawTrace = JSON.parse(readFileSync(row?.raw_trace_path ?? '', 'utf8')) as {
+        claudeDiagnostic: string;
+        structuredAnswerParseErrors: string[];
+      };
+      expect(rawTrace.claudeDiagnostic).toBe('claude_rate_limit');
+      expect(rawTrace.structuredAnswerParseErrors).toEqual(['invalid_json', 'claude_rate_limit']); // toEqual: exactly these two entries, in this order
+      const fallbackAnswer = JSON.parse(
+        readFileSync(row?.structured_answer_path ?? '', 'utf8')
+      ) as {
+        unsupportedClaims: string[];
+      };
+      expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer'); // a structured-answer file is still written for the failed slot
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // directory four levels above the session root
+        recursive: true,
+        force: true
+      });
+      rmSync(repoPath, { recursive: true, force: true });
+      rmSync(payloadDir, { recursive: true, force: true });
+      rmSync(stubDir, { recursive: true, force: true });
+    }
+  });
+
+  // Feeds the parser an object holding every required field except `readyToEdit`:
+  // `confidence` is 'medium' and each remaining field is an empty array.
+  it('uses the same required fields for parser validation', () => {
+    const kept = CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS.filter((f) => f !== 'readyToEdit');
+    const fieldValue = (name: string) => (name === 'confidence' ? 'medium' : []);
+    const invalid = Object.fromEntries(kept.map((name) => [name, fieldValue(name)]));
+    const parsed = parseStructuredAnswer(JSON.stringify(invalid));
+    expect(parsed).toMatchObject({ status: 'invalid_schema' });
+    expect(parsed.errors).toContain('missing_readyToEdit');
+  });
+});
diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts
new file mode 100644
index 0000000..1061826
--- /dev/null
+++ b/tests/contextbench-baseline-snapshot.test.ts
@@ -0,0 +1,133 @@
+import { execFileSync } from 'node:child_process';
+import { mkdtempSync, readFileSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { describe, expect, it, vi } from 'vitest';
+
+type BaselineSession = { // fields of BASELINE-SESSION.json that the tests in this file read
+  claimBearing: boolean;
+  sealed: boolean;
+  sessionHash: string; // asserted below to match `sha256:` followed by 64 hex characters
+  snapshot: {
+    branch: string;
+    head: string; // asserted below to be 40 hex characters
+    divergence: { status: string };
+    gitStatusPath: string; // the four *Path fields are asserted to equal `snapshot/git/...` relative paths
+    trackedDiffPath: string;
+    stagedDiffPath: string;
+    diffStatPath: string;
+    untracked: Array<{ path: string; disposition: string; hash: string | null; exclusionReason: string | null }>;
+    lockfiles: Array<{ path: string; hash: string }>; // asserted to include pnpm-lock.yaml, as a path relative to the session root
+    redactedEnvVarNames: string[];
+    versions: Record<string, string>;
+    fixtureHashes: Record<string, string>; // the `protocol` entry is asserted to be a sha256 digest string
+    commandTranscript: Array<{ command: string; stdoutPath: string | null; stderrPath: string | null }>;
+    snapshotHash: string; // same `sha256:<64 hex>` format as sessionHash
+  };
+  artifactIndex: Array<{ path: string; hash: string }>;
+};
+
+vi.setConfig({ testTimeout: 30000 }); // 30 s per test; every test below spawns the runner script via execFileSync
+
+// Builds `<new temp dir>/benchmark-runs/contextbench/<phase>/snapshot-smoke`.
+// Only the temp directory itself is created here; the nested path is returned without being
+// created. The tests in this file clean up by deleting the directory four levels above the
+// returned path.
+function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
+  const scratchDir = mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-`));
+  const nestedSegments = ['benchmark-runs', 'contextbench', phase, 'snapshot-smoke'];
+  return path.join(scratchDir, ...nestedSegments);
+}
+
+describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
+  // Snapshots the current checkout, validates the session, then checks the recorded metadata.
+  it('captures the current checkout before baseline runs with hashes and validation metadata', () => {
+    const sessionRoot = tempSessionRoot();
+    try {
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const validateOutput = execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      expect(validateOutput).toContain('baseline session validation passed');
+
+      const session = JSON.parse(
+        readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8')
+      ) as BaselineSession & { phase: number };
+      expect(session.phase).toBe(40);
+      expect(session.claimBearing).toBe(false);
+      expect(session.sealed).toBe(false);
+      expect(session.sessionHash).toMatch(/^sha256:[a-f0-9]{64}$/);
+      expect(session.snapshot.branch.length).toBeGreaterThan(0);
+      expect(session.snapshot.head).toMatch(/^[a-f0-9]{40}$/);
+      expect(session.snapshot.divergence.status).toBe('unavailable');
+      expect(session.snapshot.snapshotHash).toMatch(/^sha256:[a-f0-9]{64}$/);
+      expect(session.snapshot.gitStatusPath).toBe('snapshot/git/status-porcelain-v2.txt');
+      expect(session.snapshot.trackedDiffPath).toBe('snapshot/git/tracked.diff');
+      expect(session.snapshot.stagedDiffPath).toBe('snapshot/git/staged.diff');
+      expect(session.snapshot.diffStatPath).toBe('snapshot/git/diff-stat.txt');
+      expect(session.snapshot.lockfiles.map((entry) => entry.path)).toContain(
+        path.relative(sessionRoot, path.resolve('pnpm-lock.yaml')).replace(/\\/g, '/')
+      );
+      expect(session.snapshot.fixtureHashes.protocol).toMatch(/^sha256:[a-f0-9]{64}$/);
+      expect(session.snapshot.commandTranscript.map((entry) => entry.command)).toEqual(
+        expect.arrayContaining(['git status --porcelain=v2 --branch --untracked-files=all', 'git diff --no-ext-diff'])
+      );
+      expect(session.artifactIndex.map((entry) => entry.path)).toEqual(
+        expect.arrayContaining(['slot-reservations.json', 'run-manifest.jsonl'])
+      );
+      // `||`, not `??`: an empty OPENAI_API_KEY would make `not.toContain('')` fail for any string.
+      expect(JSON.stringify(session)).not.toContain(process.env.OPENAI_API_KEY || 'definitely-not-present');
+    } finally {
+      const sessionTempDir = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(sessionTempDir, { recursive: true, force: true });
+    }
+  });
+
+  // Same snapshot-then-validate flow under a phase41 session root; checks the phase metadata.
+  it('captures Phase 41 baseline snapshots with Phase 41 metadata', () => {
+    const sessionRoot = tempSessionRoot('phase41');
+    try {
+      execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const validateOutput = execFileSync(
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      expect(validateOutput).toContain('baseline session validation passed');
+
+      const session = JSON.parse(
+        readFileSync(path.join(sessionRoot, 'BASELINE-SESSION.json'), 'utf8')
+      ) as BaselineSession & { phase: number; sessionRoot: string };
+      expect(session.phase).toBe(41);
+      // Normalise separators first so a backslash-separated path still matches '/phase41/'.
+      expect(session.sessionRoot.replace(/\\/g, '/')).toContain('/phase41/');
+    } finally {
+      const sessionTempDir = path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot))));
+      rmSync(sessionTempDir, { recursive: true, force: true });
+    }
+  });
+
+  // An `--out` path that is a bare temp directory must make the snapshot command fail with an
+  // error whose message matches `benchmark-runs/contextbench/phase40`.
+  it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => {
+    const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-'));
+    const snapshotArgs = ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', outDir];
+    // stdio is piped, so the child's output is captured rather than inherited by the test run.
+    const runSnapshot = () => execFileSync('node', snapshotArgs, { encoding: 'utf8', stdio: 'pipe' });
+    try {
+      expect(runSnapshot).toThrow(/benchmark-runs\/contextbench\/phase40/);
+    } finally {
+      rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/tests/contextbench-lane-setup.test.ts b/tests/contextbench-lane-setup.test.ts
new file mode 100644
index 0000000..34290e0
--- /dev/null
+++ b/tests/contextbench-lane-setup.test.ts
@@ -0,0 +1,156 @@
+import { describe, expect, it } from 'vitest';
+import {
+  CONTEXTBENCH_LANE_READINESS_STATUSES,
+  type ContextBenchLane,
+  type ContextBenchLaneSetupEvidenceFixture,
+  type ContextBenchLaneSetupEvidenceRecord,
+  type ContextBenchLaneToolCard
+} from '../src/eval/contextbench-types.js';
+import { hashSetupEvidenceRecord } from '../src/eval/contextbench-artifacts.js';
+import laneSetupEvidenceFixture from './fixtures/contextbench-lane-setup-evidence.json';
+import laneToolCardsFixture from './fixtures/contextbench-lane-tool-cards.json';
+import lanesFixture from './fixtures/contextbench-lanes.json';
+import packageJson from '../package.json';
+import protocolFixture from './fixtures/contextbench-benchmark-protocol.json';
+
+type LanesFixture = { // shape this test file assumes for fixtures/contextbench-lanes.json
+  broadClaimLaneSet: string[]; // lane ids; each one must have a setup record and a tool card
+  lanes: ContextBenchLane[];
+  setupFailureSemantics: { // asserted field by field in the "terminal missing evidence" test
+    winEligible: boolean;
+    claimContribution: string;
+    includedInPublicationRows: boolean;
+    blocksBroadClaimsForRequiredLane: boolean;
+  };
+};
+
+type LaneToolCardsFixture = { // shape this test file assumes for fixtures/contextbench-lane-tool-cards.json
+  cards: ContextBenchLaneToolCard[];
+};
+
+type PackageFixture = { // the two package.json sections read by the dependency test; both may be absent
+  dependencies?: Record<string, string>;
+  devDependencies?: Record<string, string>;
+};
+
+const lanes = lanesFixture as LanesFixture; // JSON fixtures are cast to the shapes declared above; nothing is validated at runtime
+const laneToolCards = laneToolCardsFixture as LaneToolCardsFixture;
+const setupEvidence = laneSetupEvidenceFixture as ContextBenchLaneSetupEvidenceFixture;
+const packageFixture = packageJson as PackageFixture; // imported from ../package.json
+const blockedStatuses = new Set(['setup_failed', 'index_failed', 'tool_error', 'invasive_setup_blocked']); // readiness values these tests treat as blocked
+
+function byLane<T extends { laneId: string }>(items: T[]): Map<string, T> { // index items by laneId; a later duplicate replaces an earlier one
+  return items.reduce((index, entry) => index.set(entry.laneId, entry), new Map<string, T>());
+}
+
+function hasPendingPlaceholder(card: ContextBenchLaneToolCard): boolean {
+  // True when any of the four lane commands contains "pending phase 39", ignoring case.
+  const commands = [card.setupCommand, card.indexCommand, card.queryCommand, card.versionCommand];
+  return commands.some((text) => text.toLowerCase().includes('pending phase 39'));
+}
+
+function expectTerminalBlockedRecord(record: ContextBenchLaneSetupEvidenceRecord): void {
+  expect(blockedStatuses.has(record.readinessStatus)).toBe(true); // status must be one of the blocked values
+  expect(record.logReference).toMatch(/^outputs\/contextbench\/setup\//);
+  expect(record.nextHumanAction.length).toBeGreaterThan(20); // more than 20 characters of follow-up text
+  expect(record.commands.some(({ status }) => status === 'blocked' || status === 'failed')).toBe(true);
+  expect(record.commands.some((cmd) => cmd.stdoutLogPath || cmd.stderrLogPath || cmd.outputHash)).toBe(true);
+}
+
+describe('ContextBench Phase 39 lane setup evidence', () => {
+  // Every lane in `broadClaimLaneSet` needs a setup record and a tool card that agree on status.
+  it('covers every required lane with a final non-pending readiness record', () => {
+    const recordsByLane = byLane(setupEvidence.records);
+    const cardsByLane = byLane(laneToolCards.cards);
+    // Sorted command kinds every record must contain, exactly one of each.
+    const expectedKinds = ['index', 'query', 'setup', 'version'];
+
+    lanes.broadClaimLaneSet.forEach((laneId) => {
+      const record = recordsByLane.get(laneId);
+      const card = cardsByLane.get(laneId);
+      expect(record, `missing setup evidence for ${laneId}`).toBeTruthy();
+      expect(card, `missing lane card for ${laneId}`).toBeTruthy();
+      // Only narrows the types: a missing record or card already failed the assertions above.
+      if (!record || !card) return;
+      const { readinessStatus } = record;
+      expect(readinessStatus).not.toBe('pending');
+      expect(CONTEXTBENCH_LANE_READINESS_STATUSES).toContain(readinessStatus);
+      expect(card.phase39Status).toBe(readinessStatus);
+      expect(record.claimBearing).toBe(false);
+      expect(record.commands.map(({ kind }) => kind).sort()).toEqual(expectedKinds);
+    });
+  });
+
+  it('rejects unresolved Phase 39 placeholders unless there is terminal blocker evidence', () => {
+    const recordsByLane = byLane(setupEvidence.records);
+    // NOTE(review): the last assertion fails for any placeholder card, blocker evidence or not,
+    // which is stricter than the "unless" in the title suggests. Confirm the intent.
+    laneToolCards.cards.forEach((card) => {
+      const record = recordsByLane.get(card.laneId);
+      expect(record).toBeTruthy();
+      if (!record) return;
+      if (hasPendingPlaceholder(card)) expectTerminalBlockedRecord(record);
+      expect(hasPendingPlaceholder(card)).toBe(false);
+    });
+  });
+
+  // Status fields must be set, no task timing may appear, and durations are null or >= 0.
+  it('keeps setup/index cost and status separate from task execution metadata', () => {
+    setupEvidence.records.forEach((entry) => {
+      expect(entry.setupStatus).toBeTruthy();
+      expect(entry.indexStatus).toBeTruthy();
+      expect(entry).not.toHaveProperty('taskWallTimeMs');
+      expect(entry.commands.every(({ durationMs }) => durationMs === null || durationMs >= 0)).toBe(true);
+      for (const ms of [entry.setupDurationMs, entry.indexDurationMs]) expect(ms === null || ms >= 0).toBe(true);
+    });
+  });
+
+  // Failure semantics come from the lanes fixture, and exactly two lanes are expected to be
+  // blocked; each of them must carry terminal blocker evidence.
+  it('records blocked and failed lanes as terminal missing evidence, not wins', () => {
+    const semantics = lanes.setupFailureSemantics;
+    expect(semantics.winEligible).toBe(false);
+    expect(semantics.claimContribution).toBe('missing_evidence');
+    expect(semantics.includedInPublicationRows).toBe(true);
+    expect(semantics.blocksBroadClaimsForRequiredLane).toBe(true);
+
+    const blockedRecords = setupEvidence.records.filter((entry) => blockedStatuses.has(entry.readinessStatus));
+    expect(blockedRecords.map(({ laneId }) => laneId).sort()).toEqual(['codebase-memory-mcp', 'grepai']);
+    blockedRecords.forEach((entry) => expectTerminalBlockedRecord(entry));
+  });
+
+  // Each non-raw lane card may list only its own context tool and must disallow native tools.
+  it('preserves one-context-tool isolation for non-raw lanes', () => {
+    const cardsByLane = byLane(laneToolCards.cards);
+    lanes.lanes.forEach((lane) => {
+      const card = cardsByLane.get(lane.laneId);
+      expect(card).toBeTruthy();
+      if (!card || lane.laneId === 'raw-native') return;
+      for (const tools of [card.contextTools, card.allowedTools]) expect(tools).toEqual([lane.contextTool]);
+      expect(card.disallowedTools).toEqual(expect.arrayContaining(['native-read', 'native-search', 'native-shell-readonly']));
+    });
+  });
+
+  it('keeps competitor tools out of package runtime dependencies', () => {
+    // Despite the title, devDependencies are scanned as well; names are compared lower-cased
+    // against the forbidden list.
+    const forbiddenPackages = ['jcodemunch-mcp', 'grepai', 'codebase-memory-mcp', 'codegraphcontext', 'kuzu'];
+    const sections = [packageFixture.dependencies, packageFixture.devDependencies];
+    const declaredNames = sections.flatMap((section) => Object.keys(section ?? {}));
+    for (const name of declaredNames) expect(forbiddenPackages).not.toContain(name.toLowerCase());
+  });
+
+  it('keeps Phase 39 setup/probe evidence non-claim-bearing', () => {
+    expect(protocolFixture.claimAllowed).toBe(false); // protocol fixture flag
+    expect(setupEvidence.claimBearing).toBe(false); // flag on the evidence fixture as a whole
+    expect(setupEvidence.generatedOutputsPolicy).toContain('not Phase 40 baseline artifacts');
+    expect(setupEvidence.records.some((entry) => entry.claimBearing !== false)).toBe(false); // and on every record
+  });
+
+  it('can hash setup evidence records without using fixture mutation as proof', () => {
+    setupEvidence.records.forEach((entry) => {
+      expect(entry.evidenceHash).toBeTruthy(); // the stored hash only has to be present
+      expect(hashSetupEvidenceRecord(entry)).toMatch(/^sha256:[a-f0-9]{64}$/); // format check; not compared with evidenceHash
+    });
+  });
+});
diff --git a/tests/contextbench-phase42-evidence-gate.test.ts b/tests/contextbench-phase42-evidence-gate.test.ts
new file mode 100644
index 0000000..96ee7d2
--- /dev/null
+++ b/tests/contextbench-phase42-evidence-gate.test.ts
@@ -0,0 +1,372 @@
+import { describe, expect, it } from 'vitest';
+import {
+  evaluateContextBenchEvidenceGate,
+  type ContextBenchEvidenceGateInput,
+  type ContextBenchRunEvidenceArtifacts
+} from '../src/eval/contextbench-evidence-gate.js';
+import type { ContextBenchRunManifestRow } from '../src/eval/contextbench-types.js';
+
+const runnerHash = 'sha256:1111111111111111111111111111111111111111111111111111111111111111'; // placeholder digests shared by the fixture builders below
+const protocolHash = 'sha256:protocol'; // short sentinel, not a 64-hex digest; used as both the row value and the expected value
+const taskManifestHash = 'sha256:manifest'; // short sentinel, same usage as protocolHash
+const scoreHash = 'sha256:cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc'; // keyed by row.score_path in artifactHashesByPath
+const officialOutputHash = 'sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; // used for score.outputHash and for the official-results.jsonl path hash
+const stdoutHash = 'sha256:dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd'; // hash registered for official.stdout.log
+const stderrHash = 'sha256:eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee'; // hash registered for official.stderr.log
+
+function baseRow(overrides: Partial<ContextBenchRunManifestRow> = {}): ContextBenchRunManifestRow { // Fixture: a completed codebase-context manifest row for task-1, with optional top-level overrides.
+  return {
+    run_id: 'codebase-context-task-1-1-claude',
+    protocol_version: 'contextbench-protocol-v1',
+    protocol_hash: protocolHash,
+    task_manifest_hash: taskManifestHash,
+    lane_id: 'codebase-context',
+    task_id: 'task-1',
+    repeat_index: 1,
+    status: 'completed',
+    started_at: '2026-04-29T00:00:00.000Z', // five seconds before completed_at, matching taskWallTimeMs below
+    completed_at: '2026-04-29T00:00:05.000Z',
+    raw_trace_path: 'runs/codebase-context-task-1-1-claude/raw-trace.json',
+    structured_answer_path: 'runs/codebase-context-task-1-1-claude/structured-answer.json',
+    trajectory_path: 'runs/codebase-context-task-1-1-claude/trajectory.json',
+    score_path: 'runs/codebase-context-task-1-1-claude/score.json',
+    setup_index_path: 'runs/codebase-context-task-1-1-claude/setup-index.json',
+    prompt_path: 'runs/codebase-context-task-1-1-claude/prompt.txt',
+    lane_tool_card_path: 'runs/codebase-context-task-1-1-claude/lane-card.json',
+    setupIndex: { // durations, statuses, and log paths here are repeated in passingArtifacts().setupIndex
+      setupCommand: 'npx codebase-context index',
+      indexCommand: 'npx codebase-context index',
+      setupDurationMs: 120,
+      indexDurationMs: 340,
+      setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log',
+      indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log',
+      setupStatus: 'completed',
+      indexStatus: 'completed'
+    },
+    taskExecution: {
+      model: 'claude-sonnet-4-5',
+      timeoutSeconds: 600,
+      maxContextTokens: 120000,
+      maxAnswerTokens: 4000,
+      startedAt: '2026-04-29T00:00:00.000Z',
+      completedAt: '2026-04-29T00:00:05.000Z',
+      taskWallTimeMs: 5000,
+      executor: 'claude'
+    },
+    scoring: {
+      officialEvaluatorFirst: true,
+      officialEvaluatorAttempted: true,
+      officialEvaluatorInvoked: true,
+      command: 'python -m contextbench.evaluate --gold gold.parquet --pred trajectory.json --out official.jsonl', // same string as passingArtifacts().score.command
+      claimBearing: true
+    },
+    hashes: {
+      runnerSourceHash: runnerHash
+    },
+    ...overrides // shallow spread: an overridden nested object replaces the default wholesale
+  };
+}
+
+function passingArtifacts(overrides: Partial<ContextBenchRunEvidenceArtifacts> = {}): ContextBenchRunEvidenceArtifacts { // Fixture: per-run evidence artifacts that agree with baseRow(), with optional top-level overrides.
+  return {
+    rawTrace: {
+      executor: 'claude',
+      model: 'claude-sonnet-4-5',
+      runnerHash
+    },
+    score: {
+      status: 'completed',
+      mode: 'official_evaluator',
+      claimBearing: true,
+      officialEvaluatorInvoked: true,
+      command: 'python -m contextbench.evaluate --gold gold.parquet --pred trajectory.json --out official.jsonl',
+      exitCode: 0,
+      outputPath: 'runs/codebase-context-task-1-1-claude/official-results.jsonl', // passingInput() registers hashes for this path and the two log paths below
+      outputHash: officialOutputHash,
+      stdoutPath: 'runs/codebase-context-task-1-1-claude/official.stdout.log',
+      stderrPath: 'runs/codebase-context-task-1-1-claude/official.stderr.log'
+    },
+    setupIndex: { // same statuses, durations, and log paths as baseRow().setupIndex
+      setupStatus: 'completed',
+      indexStatus: 'completed',
+      setupDurationMs: 120,
+      indexDurationMs: 340,
+      setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log',
+      indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log'
+    },
+    laneIsolation: {
+      laneId: 'codebase-context',
+      proven: true,
+      sourceKind: 'proxy', // the 'env_override' variant is rejected in a test below
+      expectedContextTool: 'codebase-context',
+      allowedTools: ['codebase-context'],
+      observedTools: ['codebase-context']
+    },
+    ...overrides // shallow spread: passing e.g. `score` replaces the whole default score object
+  };
+}
+
+function passingInput(overrides: Partial<ContextBenchEvidenceGateInput> = {}): ContextBenchEvidenceGateInput { // Fixture: a one-lane, one-task, one-repeat gate input that the passing test expects to yield no failures.
+  const row = baseRow(); // default row; its run_id and score_path key the maps below
+  return {
+    evidenceMode: 'artifact_verified',
+    protocol: {
+      claimAllowed: true,
+      benchmarkTarget: {
+        officialEvaluatorFirst: true
+      }
+    },
+    requiredLaneIds: ['codebase-context'],
+    requiredTaskIds: ['task-1'],
+    requiredRepeats: 1,
+    expectedTotalRows: 1,
+    expectedProtocolHash: protocolHash,
+    expectedTaskManifestHash: taskManifestHash,
+    lanePoliciesById: {
+      'codebase-context': {
+        laneId: 'codebase-context',
+        expectedContextTool: 'codebase-context',
+        allowedTools: ['codebase-context'],
+        disallowedTools: ['native-read', 'native-search', 'native-shell-readonly']
+      }
+    },
+    rows: [row],
+    artifactsByRunId: {
+      [row.run_id]: passingArtifacts()
+    },
+    artifactHashesByPath: { // hashes for the score file plus the three paths named in passingArtifacts().score
+      [row.score_path]: scoreHash,
+      'runs/codebase-context-task-1-1-claude/official-results.jsonl': officialOutputHash,
+      'runs/codebase-context-task-1-1-claude/official.stdout.log': stdoutHash,
+      'runs/codebase-context-task-1-1-claude/official.stderr.log': stderrHash
+    },
+    expectedRunnerHash: runnerHash,
+    currentRunnerHash: runnerHash,
+    ...overrides // shallow spread: overriding `rows` does not re-key artifactsByRunId or artifactHashesByPath
+  };
+}
+
+function failureCodes(input: ContextBenchEvidenceGateInput): string[] { // Runs the evidence gate and returns only the failure codes, in reported order.
+  return evaluateContextBenchEvidenceGate(input).failures.map(({ code }) => code);
+}
+
+describe('ContextBench Phase 42 evidence gate', () => {
+  it('allows synthetic shape validation but never treats it as claim-pass', () => { // synthetic_shape mode: shape passes, claim does not
+    const result = evaluateContextBenchEvidenceGate(
+      passingInput({ evidenceMode: 'synthetic_shape' })
+    );
+    expect(result.shapePass).toBe(true);
+    expect(result.claimPass).toBe(false);
+    expect(result.diagnosticOnly).toBe(true);
+    expect(result.failures.map((failure) => failure.code)).toEqual(['artifact_verification_missing']); // exactly one failure, and only this code
+  });
+
+  it('rejects synthetic evidence when official evaluator invocation is missing', () => { // NOTE(review): title says "synthetic" but evidenceMode stays the passingInput default ('artifact_verified') — confirm wording
+    const row = baseRow({
+      scoring: {
+        officialEvaluatorFirst: true,
+        officialEvaluatorAttempted: true,
+        officialEvaluatorInvoked: false,
+        command: 'python -m contextbench.evaluate',
+        claimBearing: false,
+        fallbackReason: 'official_evaluator_not_invoked'
+      }
+    });
+    const input = passingInput({
+      rows: [row],
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          score: { // diagnostic fallback score: evaluator not invoked, non-zero exit
+            status: 'judge_failed',
+            mode: 'diagnostic_fallback',
+            claimBearing: false,
+            officialEvaluatorInvoked: false,
+            command: 'python -m contextbench.evaluate',
+            exitCode: 1,
+            outputPath: 'official-results.jsonl',
+            outputHash: 'sha256:bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb',
+            stdoutPath: 'official.stdout.log',
+            stderrPath: 'official.stderr.log'
+          }
+        })
+      },
+      artifactHashesByPath: {} // no on-disk hashes are supplied for any artifact path
+    });
+    expect(failureCodes(input)).toEqual(
+      expect.arrayContaining(['official_evaluator_missing', 'diagnostic_fallback_only']) // subset match: other codes may also be present
+    );
+  });
+
+  it('rejects synthetic evidence when lane isolation proof is missing', () => { // laneIsolation is removed from the artifacts entirely
+    const row = baseRow(); // built only to obtain the default run_id key
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({ laneIsolation: undefined })
+      }
+    });
+    expect(failureCodes(input)).toContain('lane_isolation_missing');
+  });
+
+  it('rejects synthetic evidence when lane isolation telemetry is empty', () => { // `proven: true` alone is not enough when no tools were observed
+    const row = baseRow();
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          laneIsolation: {
+            laneId: 'codebase-context',
+            proven: true,
+            sourceKind: 'proxy',
+            expectedContextTool: 'codebase-context',
+            allowedTools: ['codebase-context'],
+            observedTools: [] // the only field that differs from the passing default
+          }
+        })
+      }
+    });
+    expect(failureCodes(input)).toContain('lane_isolation_missing');
+  });
+
+  it('rejects synthetic evidence when ready lane setup/index evidence is missing', () => { // statuses say completed, but durations are zero
+    const row = baseRow();
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          setupIndex: {
+            setupStatus: 'completed',
+            indexStatus: 'completed',
+            setupDurationMs: 0,
+            indexDurationMs: 0,
+            setupLogPath: 'setup.log', // NOTE(review): log paths also differ from the manifest row, so either difference could trigger the failure — confirm against the gate
+            indexLogPath: 'index.log'
+          }
+        })
+      }
+    });
+    expect(failureCodes(input)).toContain('setup_index_cost_missing');
+  });
+
+  it('rejects synthetic evidence when runner provenance does not match', () => { // currentRunnerHash differs from expectedRunnerHash and from the row/trace hashes
+    const input = passingInput({
+      currentRunnerHash: 'sha256:2222222222222222222222222222222222222222222222222222222222222222'
+    });
+    expect(failureCodes(input)).toContain('runner_provenance_mismatch');
+  });
+
+  it('rejects duplicate and unexpected rows so the denominator cannot be narrowed', () => { // three rows are submitted against a one-row denominator
+    const row = baseRow();
+    const duplicate = baseRow({ run_id: 'duplicate-run' }); // same lane/task/repeat as `row`, different run_id
+    const unexpected = baseRow({ run_id: 'unexpected-run', task_id: 'task-outside-denominator' }); // task_id is not in requiredTaskIds
+    const input = passingInput({
+      rows: [row, duplicate, unexpected],
+      expectedTotalRows: 1,
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts(),
+        [duplicate.run_id]: passingArtifacts(),
+        [unexpected.run_id]: passingArtifacts()
+      }
+    });
+    expect(failureCodes(input)).toEqual(
+      expect.arrayContaining(['duplicate_required_run', 'unexpected_run_row']) // subset match: other codes may also be present
+    );
+  });
+
+  it('rejects evidence when row count is narrower than the frozen denominator', () => { // one row supplied, two expected
+    const input = passingInput({ expectedTotalRows: 2 });
+    expect(failureCodes(input)).toContain('denominator_count_mismatch');
+  });
+
+  it('rejects setup/index evidence that contradicts the manifest row', () => { // artifact and row disagree on setup duration only
+    const row = baseRow();
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          setupIndex: {
+            setupStatus: 'completed',
+            indexStatus: 'completed',
+            setupDurationMs: 999, // baseRow().setupIndex.setupDurationMs is 120
+            indexDurationMs: 340,
+            setupLogPath: 'runs/codebase-context-task-1-1-claude/setup.log',
+            indexLogPath: 'runs/codebase-context-task-1-1-claude/index.log'
+          }
+        })
+      }
+    });
+    expect(failureCodes(input)).toContain('setup_index_cost_missing');
+  });
+
+  it('rejects self-attested official evaluator proof without command output artifacts', () => { // flags claim success, but no command, exit code, or output/log paths back them
+    const row = baseRow();
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          score: { // omits command, exitCode, outputPath, outputHash, stdoutPath, stderrPath
+            status: 'completed',
+            mode: 'official_evaluator',
+            claimBearing: true,
+            officialEvaluatorInvoked: true
+          }
+        })
+      }
+    });
+    expect(failureCodes(input)).toContain('official_evaluator_missing');
+  });
+
+  it('passes artifact-verified evidence with official evaluator, lane isolation, setup/index, and matching runner provenance', () => { // the unmodified passingInput() is the positive control
+    const result = evaluateContextBenchEvidenceGate(passingInput());
+    expect(result).toEqual({ // whole-object equality: no extra result fields and no failures
+      shapePass: true,
+      claimPass: true,
+      diagnosticOnly: false,
+      failures: []
+    });
+  });
+
+  it('allows raw-native policy to prove multiple native observations without collapsing them into one fake tool', () => { // raw-native lane with two observed native tools must still claim-pass
+    const row = baseRow({ lane_id: 'raw-native', run_id: 'raw-native-task-1-1-claude' }); // artifact paths stay at the defaults, so the default artifactHashesByPath keys still match
+    const input = passingInput({
+      requiredLaneIds: ['raw-native'],
+      rows: [row],
+      lanePoliciesById: {
+        'raw-native': {
+          laneId: 'raw-native',
+          expectedContextTool: 'native-agent-tools',
+          allowedTools: ['native-read', 'native-search', 'native-shell-readonly'],
+          disallowedTools: ['codebase-context'],
+          allowMultipleObservedTools: true // policy flag set only for this lane in this file
+        }
+      },
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          laneIsolation: {
+            laneId: 'raw-native',
+            proven: true,
+            sourceKind: 'proxy',
+            expectedContextTool: 'native-agent-tools',
+            allowedTools: ['native-read', 'native-search', 'native-shell-readonly'],
+            observedTools: ['native-read', 'native-search'] // a subset of allowedTools; none equals expectedContextTool
+          }
+        })
+      }
+    });
+    expect(evaluateContextBenchEvidenceGate(input).claimPass).toBe(true);
+  });
+
+  it('rejects env-injected lane telemetry for artifact-verified claim pass', () => { // identical to the passing isolation record except for sourceKind
+    const row = baseRow();
+    const input = passingInput({
+      artifactsByRunId: {
+        [row.run_id]: passingArtifacts({
+          laneIsolation: {
+            laneId: 'codebase-context',
+            proven: true,
+            sourceKind: 'env_override', // the passing default is 'proxy'
+            expectedContextTool: 'codebase-context',
+            allowedTools: ['codebase-context'],
+            observedTools: ['codebase-context']
+          }
+        })
+      }
+    });
+    expect(failureCodes(input)).toContain('lane_isolation_missing');
+  });
+});
diff --git a/tests/contextbench-runner-contract.test.ts b/tests/contextbench-runner-contract.test.ts
new file mode 100644
index 0000000..76455ab
--- /dev/null
+++ b/tests/contextbench-runner-contract.test.ts
@@ -0,0 +1,321 @@
+import { execFileSync } from 'node:child_process';
+import { mkdtempSync, readFileSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { describe, expect, it } from 'vitest';
+import {
+  CONTEXTBENCH_TERMINAL_STATUSES,
+  type ContextBenchLane,
+  type ContextBenchLaneToolCard,
+  type ContextBenchProtocol,
+  type ContextBenchRunManifestRow,
+  type ContextBenchTaskManifest
+} from '../src/eval/contextbench-types.js';
+import {
+  appendManifestRow,
+  buildManifestRow,
+  buildRunId,
+  createArtifactPathSet,
+  hashJson,
+  readManifestRows,
+  writeJsonArtifact
+} from '../src/eval/contextbench-artifacts.js';
+import {
+  classifyStructuredAnswer,
+  parseStructuredAnswer
+} from '../src/eval/contextbench-answer.js';
+import correctionsFixture from './fixtures/contextbench-corrections.json';
+import laneToolCardsFixture from './fixtures/contextbench-lane-tool-cards.json';
+import lanesFixture from './fixtures/contextbench-lanes.json';
+import manifestFixture from './fixtures/contextbench-task-manifest.json';
+import protocolFixture from './fixtures/contextbench-benchmark-protocol.json';
+
+type LaneToolCardsFixture = { // fields this file reads from contextbench-lane-tool-cards.json
+  protocolVersion: string;
+  cards: ContextBenchLaneToolCard[];
+};
+
+type LanesFixture = { // fields this file reads from contextbench-lanes.json
+  broadClaimLaneSet: string[];
+  lanes: ContextBenchLane[];
+  laneToolCardRequiredFields: string[];
+};
+
+const protocol = protocolFixture as ContextBenchProtocol; // `as` casts below are compile-time only; the JSON is not validated here
+const manifest = manifestFixture as ContextBenchTaskManifest;
+const lanes = lanesFixture as LanesFixture;
+const laneToolCards = laneToolCardsFixture as LaneToolCardsFixture;
+const corrections = correctionsFixture as { // only the correction-policy flag is used in this file
+  policy: { anyFixtureChangeRequiresCorrection: boolean };
+};
+
+function tempDir(): string { // Creates a fresh 'contextbench-runner-' prefixed directory under the OS temp dir and returns its path; callers remove it.
+  return mkdtempSync(path.join(tmpdir(), 'contextbench-runner-'));
+}
+
+describe('ContextBench Phase 38 runner contract', () => {
+  it('keeps frozen task/protocol inputs read-only and correction-governed', () => { // pins key values of the checked-in fixtures
+    expect(protocol.claimAllowed).toBe(false);
+    expect(protocol.benchmarkTarget.officialEvaluatorFirst).toBe(true);
+    expect(manifest.tasks).toHaveLength(20); // the task count is pinned at exactly 20
+    expect(manifest.manifest_hash).toMatch(/^sha256:[a-f0-9]{64}$/); // format check only; the hash is not recomputed here
+    expect(manifest.no_lane_outputs_observed_attestation).toContain('No raw/native');
+    expect(corrections.policy.anyFixtureChangeRequiresCorrection).toBe(true);
+    expect(laneToolCards.protocolVersion).toBe(protocol.protocolVersion);
+  });
+
+  it('defines explicit lane cards for every required lane while only raw/native and codebase-context are Phase 38 executable', () => { // spot-checks four lanes; other lanes' executable flags are not asserted
+    const cardsByLane = new Map(laneToolCards.cards.map((card) => [card.laneId, card]));
+    for (const laneId of lanes.broadClaimLaneSet) {
+      expect(cardsByLane.has(laneId)).toBe(true);
+    }
+    expect(cardsByLane.get('raw-native')?.executableInPhase38).toBe(true);
+    expect(cardsByLane.get('codebase-context')?.executableInPhase38).toBe(true);
+    expect(cardsByLane.get('jcodemunch-repomapper')?.phase38Status).toBe('pending_phase39_setup');
+    expect(cardsByLane.get('grepai')?.executableInPhase38).toBe(false); // a missing card yields undefined and fails this assertion
+  });
+
+  it('enforces one-context-tool semantics and setup/index cost separation through lane cards', () => { // checks every lane's card for required fields, cost separation, and tool lists
+    for (const lane of lanes.lanes) {
+      const card = laneToolCards.cards.find((candidate) => candidate.laneId === lane.laneId);
+      expect(card).toBeTruthy();
+      if (!card) continue; // type narrowing only; the assertion above already fails on a missing card
+      for (const field of lanes.laneToolCardRequiredFields) {
+        expect(card[field as keyof ContextBenchLaneToolCard]).toBeTruthy(); // truthiness check: a required field holding false, 0, or '' would fail
+      }
+      expect(card.setupCostReportedSeparately).toBe(true);
+      expect(card.indexCostReportedSeparately).toBe(true);
+      expect(card.disallowedTools).not.toContain(lane.contextTool); // a lane must not forbid its own context tool
+      if (lane.laneId === 'raw-native') {
+        expect(card.contextTools).toEqual(['native-agent-tools']); // allowedTools is not pinned for raw-native
+      } else {
+        expect(card.contextTools).toEqual([lane.contextTool]);
+        expect(card.allowedTools).toEqual([lane.contextTool]);
+      }
+    }
+  });
+
+  it('keeps every protocol terminal status represented in the typed contract', () => { // the typed constant and the protocol fixture must list the same statuses
+    expect(CONTEXTBENCH_TERMINAL_STATUSES).toEqual(protocol.runManifestSchema.terminalStatuses); // toEqual on arrays is order-sensitive
+    expect(CONTEXTBENCH_TERMINAL_STATUSES).toEqual(
+      expect.arrayContaining([
+        'setup_failed',
+        'index_failed',
+        'invalid_schema',
+        'false_ready',
+        'judge_failed'
+      ])
+    );
+  });
+
+  it('validates structured answers and maps malformed answers to invalid_schema', () => { // covers non-JSON input, JSON with missing fields, and one fully valid answer
+    expect(parseStructuredAnswer('not json')).toMatchObject({ status: 'invalid_schema' });
+    expect(parseStructuredAnswer(JSON.stringify({ answer: 'missing fields' }))).toMatchObject({
+      status: 'invalid_schema'
+    });
+    const parsed = parseStructuredAnswer(
+      JSON.stringify({
+        answer: 'ready',
+        confidence: 'medium',
+        evidence: [
+          { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'direct evidence' }
+        ],
+        filesReferenced: ['src/a.ts'],
+        symbolsReferenced: [],
+        unsupportedClaims: [],
+        readyToEdit: false
+      })
+    );
+    expect(parsed.status).toBe('valid'); // empty symbolsReferenced / unsupportedClaims arrays are accepted
+  });
+
+  it('rejects structured answer fields outside the frozen schema', () => { // an extra key at root, evidence-item, or lineRange level must each be reported
+    const validAnswer = {
+      answer: 'ready',
+      confidence: 'medium',
+      evidence: [
+        { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'direct evidence' }
+      ],
+      filesReferenced: ['src/a.ts'],
+      symbolsReferenced: [],
+      unsupportedClaims: [],
+      readyToEdit: false
+    };
+
+    expect(parseStructuredAnswer(JSON.stringify({ ...validAnswer, extra: true }))).toMatchObject({ // extra key at the root
+      status: 'invalid_schema',
+      errors: expect.arrayContaining(['additional_root_field_extra'])
+    });
+    expect(
+      parseStructuredAnswer(
+        JSON.stringify({
+          ...validAnswer,
+          evidence: [{ ...validAnswer.evidence[0], extraEvidence: true }] // extra key on an evidence item
+        })
+      )
+    ).toMatchObject({
+      status: 'invalid_schema',
+      errors: expect.arrayContaining(['additional_evidence_field_extraEvidence'])
+    });
+    expect(
+      parseStructuredAnswer(
+        JSON.stringify({
+          ...validAnswer,
+          evidence: [
+            {
+              ...validAnswer.evidence[0],
+              lineRange: { ...validAnswer.evidence[0].lineRange, extraLine: true } // extra key inside lineRange
+            }
+          ]
+        })
+      )
+    ).toMatchObject({
+      status: 'invalid_schema',
+      errors: expect.arrayContaining(['additional_line_range_field_extraLine'])
+    });
+  });
+
+  it('classifies false-ready from deterministic diagnostics, not just model self-report', () => { // answer self-reports readyToEdit with no unsupported claims; the diagnostics say otherwise
+    const parsed = parseStructuredAnswer(
+      JSON.stringify({
+        answer: 'safe to edit',
+        confidence: 'high',
+        evidence: [
+          { file: 'src/a.ts', lineRange: { start: 1, end: 2 }, reason: 'partial evidence' }
+        ],
+        filesReferenced: ['src/a.ts'],
+        symbolsReferenced: [],
+        unsupportedClaims: [],
+        readyToEdit: true
+      })
+    );
+    expect(parsed.answer).not.toBeNull();
+    if (!parsed.answer) return; // type narrowing only; the assertion above already fails on null
+    const classification = classifyStructuredAnswer(parsed.answer, {
+      missingRequiredFacts: ['required fact absent'],
+      missingEvidenceFiles: ['src/required.ts']
+    });
+    expect(classification.unsupportedClaim).toBe(true);
+    expect(classification.falseReady).toBe(true);
+    expect(classification.reasons).toEqual(
+      expect.arrayContaining(['missing_required_facts', 'missing_evidence_files']) // subset match: other reasons may also be present
+    );
+  });
+
+  it('writes append-only manifest rows with artifact paths for attempted runs', () => { // appends two rows to one manifest file and reads both back in order
+    const outDir = tempDir();
+    try {
+      const runId = buildRunId({
+        laneId: 'raw-native',
+        taskId: manifest.tasks[0].instance_id,
+        repeatIndex: 1,
+        executor: 'fake'
+      });
+      const paths = createArtifactPathSet(outDir, runId);
+      const laneCard = laneToolCards.cards[0]; // NOTE(review): presumably the raw-native card, matching the runId above — not asserted here
+      const task = manifest.tasks[0];
+      writeJsonArtifact(paths.rawTracePath, { stdout: '{}', stderr: '' }); // minimal placeholder artifacts; their contents are not read back
+      writeJsonArtifact(paths.structuredAnswerPath, { answer: 'x' });
+      writeJsonArtifact(paths.trajectoryPath, { pred_files: [] });
+      writeJsonArtifact(paths.scorePath, { claimBearing: false });
+      const row = buildManifestRow({
+        runId,
+        protocolVersion: protocol.protocolVersion,
+        protocolHash: hashJson(protocol),
+        taskManifestHash: manifest.manifest_hash,
+        laneCard,
+        task,
+        repeatIndex: 1,
+        status: 'completed',
+        startedAt: '2026-04-27T00:00:00.000Z',
+        completedAt: '2026-04-27T00:00:01.000Z',
+        paths,
+        hashes: { protocol: hashJson(protocol) },
+        executor: 'fake',
+        model: 'fake-executor',
+        timeoutSeconds: protocol.budgets.defaults.timeoutSeconds,
+        maxContextTokens: protocol.budgets.defaults.maxContextTokens,
+        maxAnswerTokens: protocol.budgets.defaults.maxAnswerTokens
+      });
+      appendManifestRow(paths.manifestPath, row);
+      appendManifestRow(paths.manifestPath, { // second append: same row with a new run_id and a failure status
+        ...row,
+        run_id: `${runId}-2`,
+        status: 'invalid_schema'
+      });
+      const rows = readManifestRows(paths.manifestPath);
+      expect(rows).toHaveLength(2); // the first row survived the second append
+      expect(rows[1].status).toBe('invalid_schema');
+      expect(rows[0].setupIndex.setupCommand).toBe(laneCard.setupCommand); // the row copies setupCommand from the lane card
+    } finally {
+      rmSync(outDir, { recursive: true, force: true }); // always remove the temp dir, even when an assertion throws
+    }
+  });
+
+  it('validates fixtures and produces fake-executor smoke artifacts without live Claude', () => { // CLI smoke test: fixture validation, then one fake-executor dry run
+    const outDir = tempDir();
+    try {
+      // Spawn the runner with the Node binary executing this test, not whatever `node` is first on PATH.
+      const validateOutput = execFileSync(
+        process.execPath,
+        ['scripts/contextbench-runner.mjs', '--validate-fixtures'],
+        {
+          encoding: 'utf8'
+        }
+      );
+      expect(validateOutput).toContain('fixture validation passed');
+      execFileSync(
+        process.execPath,
+        [
+          'scripts/contextbench-runner.mjs',
+          '--dry-run',
+          '--executor',
+          'fake',
+          '--lane',
+          'raw-native',
+          '--task-id',
+          manifest.tasks[0].instance_id,
+          '--repeat',
+          '1',
+          '--out',
+          outDir
+        ],
+        { encoding: 'utf8' }
+      );
+      const manifestRows = readFileSync(path.join(outDir, 'run-manifest.jsonl'), 'utf8') // JSONL: one manifest row per line
+        .trim()
+        .split('\n')
+        .map((line) => JSON.parse(line) as ContextBenchRunManifestRow);
+      expect(manifestRows).toHaveLength(1);
+      expect(manifestRows[0]).toMatchObject({
+        lane_id: 'raw-native',
+        status: 'completed',
+        task_id: manifest.tasks[0].instance_id
+      });
+      expect(readFileSync(manifestRows[0].raw_trace_path, 'utf8')).toContain('fake'); // artifact paths from the row must be readable from this process
+      expect(readFileSync(manifestRows[0].score_path, 'utf8')).toContain('claimBearing');
+      expect(manifestRows[0].scoring.claimBearing).toBe(false); // a fake-executor run must never be claim-bearing
+      expect(manifestRows[0].scoring.officialEvaluatorFirst).toBe(false);
+      expect(manifestRows[0].scoring.officialEvaluatorAttempted).toBe(false);
+      expect(manifestRows[0].scoring.officialEvaluatorInvoked).toBe(false);
+    } finally {
+      rmSync(outDir, { recursive: true, force: true }); // always remove the temp dir, even when an assertion throws
+    }
+  });
+
+  it('exposes Phase 39 lane setup validation as readiness evidence only', () => { // checks the --validate-lane-setup output and the --help boundary wording
+    const validateOutput = execFileSync(
+      process.execPath, // same Node binary as the test process, rather than a PATH lookup of `node`
+      ['scripts/contextbench-runner.mjs', '--validate-lane-setup'],
+      { encoding: 'utf8' }
+    );
+    expect(validateOutput).toContain('lane setup validation passed');
+
+    const helpOutput = execFileSync(process.execPath, ['scripts/contextbench-runner.mjs', '--help'], {
+      encoding: 'utf8'
+    });
+    expect(helpOutput).toContain('Phase 39 boundary');
+    expect(helpOutput).toContain('Phase 40 owns dirty-worktree baseline capture');
+    expect(helpOutput).toContain('claimBearing=false');
+  });
+});
diff --git a/tests/contextbench-scoring.test.ts b/tests/contextbench-scoring.test.ts
new file mode 100644
index 0000000..5e4c2a6
--- /dev/null
+++ b/tests/contextbench-scoring.test.ts
@@ -0,0 +1,97 @@
+import { mkdtempSync, rmSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import path from 'node:path';
+import { describe, expect, it } from 'vitest';
+import { parseStructuredAnswer } from '../src/eval/contextbench-answer.js';
+import {
+  runFactRecallDiagnostics,
+  scoreWithOfficialEvaluatorFirst,
+  type ContextBenchProcessRunner
+} from '../src/eval/contextbench-scoring.js';
+
+function tempDir(): string { // Creates a fresh 'contextbench-scoring-' prefixed directory under the OS temp dir and returns its path; callers remove it.
+  return mkdtempSync(path.join(tmpdir(), 'contextbench-scoring-'));
+}
+
+describe('ContextBench official-evaluator-first scoring', () => {
+  it('invokes the official evaluator command through an injected runner', async () => { // a stub runner records the call and reports success; no real process is spawned by the test
+    const outDir = tempDir();
+    const calls: Array<{ command: string; args: string[] }> = [];
+    const runner: ContextBenchProcessRunner = async (command, args) => {
+      calls.push({ command, args });
+      return { status: 0, stdout: 'ok', stderr: '' };
+    };
+    try {
+      const result = await scoreWithOfficialEvaluatorFirst({
+        goldPath: path.join(outDir, 'gold.parquet'), // these files are never created by this test
+        predictionPath: path.join(outDir, 'trajectory.json'),
+        outputPath: path.join(outDir, 'score.json'),
+        cachePath: path.join(outDir, 'cache'),
+        runner
+      });
+      expect(result).toMatchObject({
+        status: 'completed',
+        mode: 'official_evaluator',
+        claimBearing: true
+      });
+      expect(calls[0].command).toBe('python');
+      expect(calls[0].args).toEqual(
+        expect.arrayContaining(['-m', 'contextbench.evaluate', '--gold', '--pred', '--out']) // flag presence only; order and flag values are not checked
+      );
+    } finally {
+      rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+
+  it('writes diagnostic non-claim-bearing fallback metadata when the evaluator fails', async () => { // a stub runner reports exit status 1 with a module-not-found message
+    const outDir = tempDir();
+    const runner: ContextBenchProcessRunner = async () => ({
+      status: 1,
+      stdout: '',
+      stderr: 'No module named contextbench'
+    });
+    try {
+      const result = await scoreWithOfficialEvaluatorFirst({ // cachePath is omitted here, unlike in the success test
+        goldPath: path.join(outDir, 'gold.parquet'),
+        predictionPath: path.join(outDir, 'trajectory.json'),
+        outputPath: path.join(outDir, 'score.json'),
+        runner
+      });
+      expect(result).toMatchObject({
+        status: 'judge_failed',
+        mode: 'diagnostic_fallback',
+        claimBearing: false,
+        fallbackReason: 'official_evaluator_failed'
+      });
+      expect(result.stderr).toContain('No module named'); // the runner's stderr is surfaced on the result
+    } finally {
+      rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+
+  it('feeds schema-bound fact and evidence diagnostics into false-ready classification', () => { // answer covers only "alpha"; the requirements name "beta"
+    const parsed = parseStructuredAnswer(
+      JSON.stringify({
+        answer: 'only mentions alpha',
+        confidence: 'high',
+        evidence: [
+          { file: 'src/alpha.ts', lineRange: { start: 1, end: 3 }, reason: 'alpha evidence' }
+        ],
+        filesReferenced: ['src/alpha.ts'],
+        symbolsReferenced: [],
+        unsupportedClaims: [],
+        readyToEdit: true
+      })
+    );
+    expect(parsed.answer).not.toBeNull();
+    if (!parsed.answer) return; // type narrowing only; the assertion above already fails on null
+    const diagnostics = runFactRecallDiagnostics(parsed.answer, {
+      requiredFacts: ['beta'],
+      requiredEvidenceFiles: ['src/beta.ts']
+    });
+    expect(diagnostics.missingRequiredFacts).toEqual(['beta']);
+    expect(diagnostics.missingEvidenceFiles).toEqual(['src/beta.ts']);
+    expect(diagnostics.unsupportedClaim).toBe(true);
+    expect(diagnostics.falseReady).toBe(true); // readyToEdit: true combined with missing facts/evidence
+  });
+});
diff --git a/tests/contextbench-trajectory.test.ts b/tests/contextbench-trajectory.test.ts
new file mode 100644
index 0000000..df72968
--- /dev/null
+++ b/tests/contextbench-trajectory.test.ts
@@ -0,0 +1,68 @@
+import { describe, expect, it } from 'vitest';
+import { parseStructuredAnswer } from '../src/eval/contextbench-answer.js';
+import {
+  fullFileSpan,
+  normalizeContextBenchPath,
+  normalizeTrajectory
+} from '../src/eval/contextbench-trajectory.js';
+import type { ContextBenchTaskIdentity } from '../src/eval/contextbench-types.js';
+
+const task: Pick<ContextBenchTaskIdentity, 'instance_id' | 'repo_url' | 'base_commit'> = { // minimal task identity passed to normalizeTrajectory below
+  instance_id: 'phase38-task',
+  repo_url: 'https://github.com/example/repo.git',
+  base_commit: '0123456789abcdef0123456789abcdef01234567' // 40 hex chars; surfaced as `commit` on the trajectory
+};
+
+describe('ContextBench trajectory normalization', () => {
+  it('normalizes absolute and Windows paths relative to repo root', () => { // backslash path with a forward-slash repo root, then a './' relative path with no root
+    expect(normalizeContextBenchPath('C:\\work\\repo\\src\\index.ts', 'C:/work/repo')).toBe(
+      'src/index.ts'
+    );
+    expect(normalizeContextBenchPath('./src/file.ts')).toBe('src/file.ts'); // the leading './' is stripped
+  });
+
+  it('marks file-only references as explicit full-file spans', () => { // a full-file span starts at line 1 and has a null end
+    expect(fullFileSpan()).toEqual({ start: 1, end: null, full_file: true });
+  });
+
+  it('deduplicates predicted files while preserving explicit line spans', () => { // the same file appears as an absolute path, a relative path, and in filesReferenced
+    const parsed = parseStructuredAnswer(
+      JSON.stringify({
+        answer: 'uses target file',
+        confidence: 'medium',
+        evidence: [
+          {
+            file: 'C:/work/repo/src/a.ts', // absolute form of src/a.ts under the repoRoot passed below
+            lineRange: { start: 10, end: 12 },
+            reason: 'line evidence'
+          },
+          { file: 'src/a.ts', lineRange: { start: 20, end: 21 }, reason: 'second span' }
+        ],
+        filesReferenced: ['src/a.ts', 'src/b.ts'],
+        symbolsReferenced: [],
+        unsupportedClaims: [],
+        readyToEdit: false
+      })
+    );
+    expect(parsed.answer).not.toBeNull();
+    if (!parsed.answer) return; // type narrowing only; the assertion above already fails on null
+    const trajectory = normalizeTrajectory({
+      task,
+      answer: parsed.answer,
+      repoRoot: 'C:/work/repo'
+    });
+    expect(trajectory).toMatchObject({
+      instance_id: task.instance_id,
+      repo_url: task.repo_url,
+      commit: task.base_commit
+    });
+    expect(trajectory.traj_data.pred_files).toEqual(['src/a.ts', 'src/b.ts']); // src/a.ts is listed once despite three mentions
+    expect(trajectory.traj_data.pred_spans['src/a.ts']).toEqual([ // both explicit spans are kept, in evidence order
+      { start: 10, end: 12, full_file: false },
+      { start: 20, end: 21, full_file: false }
+    ]);
+    expect(trajectory.traj_data.pred_spans['src/b.ts']).toEqual([ // referenced without line evidence, so a full-file span
+      { start: 1, end: null, full_file: true }
+    ]);
+  });
+});
diff --git a/tests/fixtures/contextbench-codebase-context-baseline-arms.json b/tests/fixtures/contextbench-codebase-context-baseline-arms.json
new file mode 100644
index 0000000..48082fb
--- /dev/null
+++ b/tests/fixtures/contextbench-codebase-context-baseline-arms.json
@@ -0,0 +1,49 @@
+{
+  "name": "v2.4-contextbench-codebase-context-baseline-arms",
+  "protocolVersion": "contextbench-protocol-v1",
+  "phase": 40,
+  "claimBearing": false,
+  "denominatorPolicy": "Diagnostic codebase-context arms stay separate from required competitor denominators and cannot change frozen lane identities, tasks, qrels, budgets, thresholds, or public claims.",
+  "arms": [
+    {
+      "baselineArmId": "codebase-context-current-map-search",
+      "laneId": "codebase-context",
+      "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot",
+      "allowedToolSurfaces": ["map", "search_codebase"],
+      "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash",
+      "setupCommand": "pnpm run build && node dist/index.js --version",
+      "claimBearing": false,
+      "failurePolicy": "record_terminal_diagnostic_failure"
+    },
+    {
+      "baselineArmId": "codebase-context-current-search-only",
+      "laneId": "codebase-context",
+      "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot",
+      "allowedToolSurfaces": ["search_codebase"],
+      "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash",
+      "setupCommand": "pnpm run build && node dist/index.js --version",
+      "claimBearing": false,
+      "failurePolicy": "record_terminal_diagnostic_failure"
+    },
+    {
+      "baselineArmId": "codebase-context-current-map-only",
+      "laneId": "codebase-context",
+      "sourceIdentity": "current dirty checkout captured by Phase 40 snapshot",
+      "allowedToolSurfaces": ["map"],
+      "versionOrSourceRef": "dirty-worktree HEAD plus tracked/untracked snapshot hash",
+      "setupCommand": "pnpm run build && node dist/index.js --version",
+      "claimBearing": false,
+      "failurePolicy": "record_terminal_diagnostic_failure"
+    },
+    {
+      "baselineArmId": "codebase-context-v2.2.0-package-map-search",
+      "laneId": "codebase-context",
+      "sourceIdentity": "published package version 2.2.0 if locally runnable without product patches",
+      "allowedToolSurfaces": ["map", "search_codebase"],
+      "versionOrSourceRef": "npm:codebase-context@2.2.0",
+      "setupCommand": "npx codebase-context@2.2.0 --version",
+      "claimBearing": false,
+      "failurePolicy": "record_terminal_diagnostic_failure"
+    }
+  ]
+}
diff --git a/tests/fixtures/contextbench-lane-setup-evidence.json b/tests/fixtures/contextbench-lane-setup-evidence.json
new file mode 100644
index 0000000..4e4af04
--- /dev/null
+++ b/tests/fixtures/contextbench-lane-setup-evidence.json
@@ -0,0 +1,147 @@
+{
+  "name": "v2.4-contextbench-lane-setup-evidence",
+  "protocolVersion": "contextbench-protocol-v1",
+  "phase": 39,
+  "claimBearing": false,
+  "generatedOutputsPolicy": "Setup logs, downloaded tools, caches, and probes stay under ignored outputs/contextbench paths and are not Phase 40 baseline artifacts.",
+  "records": [
+    {
+      "laneId": "raw-native",
+      "readinessStatus": "ready_for_phase40",
+      "docsUrl": "https://docs.anthropic.com/en/docs/claude-code/cli-reference",
+      "sourceUrl": "https://docs.anthropic.com/en/docs/claude-code/cli-reference",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "node>=18 plus Claude Code CLI" },
+      "redactedEnvVars": [],
+      "commands": [
+        { "kind": "setup", "command": "none", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": 0, "status": "not_required", "durationMs": 0, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": "sha256:not-run-not-required" },
+        { "kind": "index", "command": "none", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": 0, "status": "not_required", "durationMs": 0, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": "sha256:not-run-not-required" },
+        { "kind": "query", "command": "claude --print < prompt.json", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "claude --version", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": 0,
+      "indexDurationMs": 0,
+      "setupStatus": "not_required",
+      "indexStatus": "not_required",
+      "logReference": "outputs/contextbench/setup/raw-native/README.md",
+      "evidenceHash": "sha256:documentation-ready-raw-native",
+      "nextHumanAction": "Phase 40 must snapshot the exact Claude CLI version and runtime before baseline runs.",
+      "claimBearing": false
+    },
+    {
+      "laneId": "codebase-context",
+      "readinessStatus": "ready_for_phase40",
+      "docsUrl": "https://github.com/PatrickSys/codebase-context#readme",
+      "sourceUrl": "https://github.com/PatrickSys/codebase-context",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "node>=18 pnpm>=10" },
+      "redactedEnvVars": ["OPENAI_API_KEY"],
+      "commands": [
+        { "kind": "setup", "command": "pnpm run build && node dist/index.js --version", "cwd": "<codebase-context-checkout>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "index", "command": "npx codebase-context refresh_index <repo-under-test>", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "query", "command": "claude --print < prompt.json with codebase-context as the only context tool", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "npx codebase-context --version", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": null,
+      "indexDurationMs": null,
+      "setupStatus": "ready",
+      "indexStatus": "ready",
+      "logReference": "outputs/contextbench/setup/codebase-context/setup-index.json",
+      "evidenceHash": "sha256:documentation-ready-codebase-context",
+      "nextHumanAction": "Phase 40 must measure setup/index duration separately and store raw logs before task execution.",
+      "claimBearing": false
+    },
+    {
+      "laneId": "jcodemunch-repomapper",
+      "readinessStatus": "ready_for_phase40",
+      "docsUrl": "https://github.com/jgravelle/jcodemunch-mcp/blob/main/USER_GUIDE.md",
+      "sourceUrl": "https://github.com/jgravelle/jcodemunch-mcp",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "uv or pipx plus MCP stdio" },
+      "redactedEnvVars": [],
+      "commands": [
+        { "kind": "setup", "command": "uvx jcodemunch-mcp", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "index", "command": "MCP index_folder {\"path\":\"<repo-under-test>\",\"incremental\":false,\"use_ai_summaries\":false,\"follow_symlinks\":false}", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "query", "command": "MCP search_symbols {\"repo\":\"<resolved-repo>\",\"query\":\"<task-query>\",\"max_results\":10,\"semantic\":false}", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "uvx jcodemunch-mcp --help", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": null,
+      "indexDurationMs": null,
+      "setupStatus": "ready",
+      "indexStatus": "ready",
+      "logReference": "outputs/contextbench/setup/jcodemunch-repomapper/setup-index.json",
+      "evidenceHash": "sha256:documentation-ready-jcodemunch-repomapper",
+      "nextHumanAction": "Phase 40 must run uvx in an isolated benchmark cache and capture MCP tool logs before baseline task execution.",
+      "claimBearing": false
+    },
+    {
+      "laneId": "grepai",
+      "readinessStatus": "invasive_setup_blocked",
+      "docsUrl": "https://yoanbernabeu.github.io/grepai/commands/grepai_init/",
+      "sourceUrl": "https://yoanbernabeu.github.io/grepai/watch-guide/",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "grepai CLI plus local embedding provider" },
+      "redactedEnvVars": ["OPENAI_API_KEY", "OPENROUTER_API_KEY", "OLLAMA_HOST"],
+      "commands": [
+        { "kind": "setup", "command": "grepai init --yes --provider ollama --backend gob --model nomic-embed-text", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/grepai/setup.stdout.log", "stderrLogPath": "outputs/contextbench/setup/grepai/setup.stderr.log", "outputHash": null },
+        { "kind": "index", "command": "grepai watch --no-ui", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/grepai/index.stdout.log", "stderrLogPath": "outputs/contextbench/setup/grepai/index.stderr.log", "outputHash": null },
+        { "kind": "query", "command": "grepai search <task-query> --json --compact", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "grepai version", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": null,
+      "indexDurationMs": null,
+      "setupStatus": "blocked",
+      "indexStatus": "blocked",
+      "logReference": "outputs/contextbench/setup/grepai/blocked-evidence.json",
+      "evidenceHash": "sha256:blocked-grepai-local-embedding-prerequisite",
+      "nextHumanAction": "Approve an isolated GrepAI binary install plus local Ollama/model setup, or keep this lane as terminal missing evidence.",
+      "claimBearing": false
+    },
+    {
+      "laneId": "codebase-memory-mcp",
+      "readinessStatus": "invasive_setup_blocked",
+      "docsUrl": "https://github.com/DeusData/codebase-memory-mcp",
+      "sourceUrl": "https://github.com/DeusData/codebase-memory-mcp/releases/latest",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "static MCP binary" },
+      "redactedEnvVars": ["CBM_CACHE_DIR", "CBM_DIAGNOSTICS", "CBM_DOWNLOAD_URL"],
+      "commands": [
+        { "kind": "setup", "command": "download Windows x86_64 release archive under outputs/contextbench/tool-cache/codebase-memory-mcp and run the extracted binary without installer auto-config", "cwd": "<codebase-context-checkout>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/codebase-memory-mcp/setup.stdout.log", "stderrLogPath": "outputs/contextbench/setup/codebase-memory-mcp/setup.stderr.log", "outputHash": null },
+        { "kind": "index", "command": "MCP prompt/tool action: Index this project", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": "outputs/contextbench/setup/codebase-memory-mcp/index.stdout.log", "stderrLogPath": "outputs/contextbench/setup/codebase-memory-mcp/index.stderr.log", "outputHash": null },
+        { "kind": "query", "command": "MCP structural query through codebase-memory-mcp only", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "codebase-memory-mcp --version", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "blocked", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": null,
+      "indexDurationMs": null,
+      "setupStatus": "blocked",
+      "indexStatus": "blocked",
+      "logReference": "outputs/contextbench/setup/codebase-memory-mcp/blocked-evidence.json",
+      "evidenceHash": "sha256:blocked-codebase-memory-mcp-auto-config-installer",
+      "nextHumanAction": "Approve a sandboxed manual binary download path that does not run the auto-configuring installer, or keep this lane as terminal missing evidence.",
+      "claimBearing": false
+    },
+    {
+      "laneId": "codegraphcontext",
+      "readinessStatus": "ready_for_phase40",
+      "docsUrl": "https://pypi.org/project/codegraphcontext/",
+      "sourceUrl": "https://github.com/CodeGraphContext/CodeGraphContext",
+      "workingDirectory": "<repo-under-test>",
+      "platform": { "os": "win32", "shell": "pwsh", "runtime": "python>=3.10 pip plus Kuzu embedded backend" },
+      "redactedEnvVars": [],
+      "commands": [
+        { "kind": "setup", "command": "python -m pip install --target outputs/contextbench/tool-cache/codegraphcontext codegraphcontext kuzu", "cwd": "<codebase-context-checkout>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "index", "command": "cgc index <repo-under-test>", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "query", "command": "cgc analyze callers <symbol-or-task-anchor>", "cwd": "<repo-under-test>", "safeToRunAutomatically": false, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null },
+        { "kind": "version", "command": "cgc help", "cwd": "<repo-under-test>", "safeToRunAutomatically": true, "exitCode": null, "status": "not_run_documented", "durationMs": null, "stdoutLogPath": null, "stderrLogPath": null, "outputHash": null }
+      ],
+      "setupDurationMs": null,
+      "indexDurationMs": null,
+      "setupStatus": "ready",
+      "indexStatus": "ready",
+      "logReference": "outputs/contextbench/setup/codegraphcontext/setup-index.json",
+      "evidenceHash": "sha256:documentation-ready-codegraphcontext",
+      "nextHumanAction": "Phase 40 must install into an isolated benchmark cache, force the embedded Kuzu path where available, and capture setup/index logs before baseline task execution.",
+      "claimBearing": false
+    }
+  ]
+}
diff --git a/tests/fixtures/contextbench-lane-tool-cards.json b/tests/fixtures/contextbench-lane-tool-cards.json
new file mode 100644
index 0000000..8c809eb
--- /dev/null
+++ b/tests/fixtures/contextbench-lane-tool-cards.json
@@ -0,0 +1,203 @@
+{
+  "name": "v2.4-contextbench-lane-tool-cards",
+  "protocolVersion": "contextbench-protocol-v1",
+  "frozenDate": "2026-04-27",
+  "cards": [
+    {
+      "laneId": "raw-native",
+      "displayName": "Raw/native agent tools",
+      "phase38Status": "executable_fake_smoke_only",
+      "phase39Status": "ready_for_phase40",
+      "executableInPhase38": true,
+      "contextTools": ["native-agent-tools"],
+      "allowedTools": ["native-read", "native-search", "native-shell-readonly"],
+      "disallowedTools": [
+        "codebase-context",
+        "jcodemunch-repomapper",
+        "grepai",
+        "codebase-memory-mcp",
+        "codegraphcontext"
+      ],
+      "setupCommand": "none",
+      "indexCommand": "none",
+      "queryCommand": "claude --print < prompt.json",
+      "versionCommand": "claude --version",
+      "cachePath": "outputs/contextbench/cache/raw-native/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    },
+    {
+      "laneId": "codebase-context",
+      "displayName": "codebase-context",
+      "phase38Status": "executable_fake_smoke_only",
+      "phase39Status": "ready_for_phase40",
+      "executableInPhase38": true,
+      "contextTools": ["codebase-context"],
+      "allowedTools": ["codebase-context"],
+      "disallowedTools": [
+        "native-read",
+        "native-search",
+        "native-shell-readonly",
+        "jcodemunch-repomapper",
+        "grepai",
+        "codebase-memory-mcp",
+        "codegraphcontext"
+      ],
+      "setupCommand": "pnpm run build && node dist/index.js --version",
+      "indexCommand": "npx codebase-context refresh_index <repo>",
+      "queryCommand": "claude --print < prompt.json",
+      "versionCommand": "npx codebase-context --version",
+      "cachePath": "outputs/contextbench/cache/codebase-context/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    },
+    {
+      "laneId": "jcodemunch-repomapper",
+      "displayName": "jCodeMunch RepoMapper",
+      "phase38Status": "pending_phase39_setup",
+      "phase39Status": "ready_for_phase40",
+      "executableInPhase38": false,
+      "contextTools": ["jcodemunch-repomapper"],
+      "allowedTools": ["jcodemunch-repomapper"],
+      "disallowedTools": [
+        "native-read",
+        "native-search",
+        "native-shell-readonly",
+        "codebase-context",
+        "grepai",
+        "codebase-memory-mcp",
+        "codegraphcontext"
+      ],
+      "setupCommand": "uvx jcodemunch-mcp",
+      "indexCommand": "MCP index_folder {\"path\":\"<repo-under-test>\",\"incremental\":false,\"use_ai_summaries\":false,\"follow_symlinks\":false}",
+      "queryCommand": "MCP search_symbols {\"repo\":\"<resolved-repo>\",\"query\":\"<task-query>\",\"max_results\":10,\"semantic\":false}",
+      "versionCommand": "uvx jcodemunch-mcp --help",
+      "cachePath": "outputs/contextbench/cache/jcodemunch-repomapper/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    },
+    {
+      "laneId": "grepai",
+      "displayName": "GrepAI",
+      "phase38Status": "pending_phase39_setup",
+      "phase39Status": "invasive_setup_blocked",
+      "executableInPhase38": false,
+      "contextTools": ["grepai"],
+      "allowedTools": ["grepai"],
+      "disallowedTools": [
+        "native-read",
+        "native-search",
+        "native-shell-readonly",
+        "codebase-context",
+        "jcodemunch-repomapper",
+        "codebase-memory-mcp",
+        "codegraphcontext"
+      ],
+      "setupCommand": "grepai init --yes --provider ollama --backend gob --model nomic-embed-text",
+      "indexCommand": "grepai watch --no-ui",
+      "queryCommand": "grepai search <task-query> --json --compact",
+      "versionCommand": "grepai version",
+      "cachePath": "outputs/contextbench/cache/grepai/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    },
+    {
+      "laneId": "codebase-memory-mcp",
+      "displayName": "codebase-memory-mcp",
+      "phase38Status": "pending_phase39_setup",
+      "phase39Status": "invasive_setup_blocked",
+      "executableInPhase38": false,
+      "contextTools": ["codebase-memory-mcp"],
+      "allowedTools": ["codebase-memory-mcp"],
+      "disallowedTools": [
+        "native-read",
+        "native-search",
+        "native-shell-readonly",
+        "codebase-context",
+        "jcodemunch-repomapper",
+        "grepai",
+        "codegraphcontext"
+      ],
+      "setupCommand": "download Windows x86_64 release archive under outputs/contextbench/tool-cache/codebase-memory-mcp and run the extracted binary without installer auto-config",
+      "indexCommand": "MCP prompt/tool action: Index this project",
+      "queryCommand": "MCP structural query through codebase-memory-mcp only",
+      "versionCommand": "codebase-memory-mcp --version",
+      "cachePath": "outputs/contextbench/cache/codebase-memory-mcp/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    },
+    {
+      "laneId": "codegraphcontext",
+      "displayName": "CodeGraphContext",
+      "phase38Status": "pending_phase39_setup",
+      "phase39Status": "ready_for_phase40",
+      "executableInPhase38": false,
+      "contextTools": ["codegraphcontext"],
+      "allowedTools": ["codegraphcontext"],
+      "disallowedTools": [
+        "native-read",
+        "native-search",
+        "native-shell-readonly",
+        "codebase-context",
+        "jcodemunch-repomapper",
+        "grepai",
+        "codebase-memory-mcp"
+      ],
+      "setupCommand": "python -m pip install --target outputs/contextbench/tool-cache/codegraphcontext codegraphcontext kuzu",
+      "indexCommand": "cgc index <repo-under-test>",
+      "queryCommand": "cgc analyze callers <symbol-or-task-anchor>",
+      "versionCommand": "cgc help",
+      "cachePath": "outputs/contextbench/cache/codegraphcontext/<run_id>",
+      "artifactPaths": {
+        "setup": "setup-index.json",
+        "rawTrace": "raw-trace.json",
+        "structuredAnswer": "structured-answer.json",
+        "trajectory": "trajectory.json",
+        "score": "score.json"
+      },
+      "setupCostReportedSeparately": true,
+      "indexCostReportedSeparately": true,
+      "claimBearing": false
+    }
+  ]
+}

From b2fa208a4df0579bfdc41d8ffe2a74b2fae6e93e Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 18:06:53 +0200
Subject: [PATCH 02/11] fix(format): format ContextBench harness sources

---
 src/eval/contextbench-answer.ts        | 12 ++++---
 src/eval/contextbench-evidence-gate.ts | 49 ++++++++++++++++++++------
 2 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/src/eval/contextbench-answer.ts b/src/eval/contextbench-answer.ts
index 6c9b55d..45d264f 100644
--- a/src/eval/contextbench-answer.ts
+++ b/src/eval/contextbench-answer.ts
@@ -103,11 +103,11 @@ function isJsonValue(value: unknown): value is JsonValue {
 
 export function isValidEvidenceReference(value: unknown): value is ContextBenchEvidenceReference {
   if (!isRecord(value)) return false;
-  if (findAdditionalFields(value, evidenceReferenceFields, 'evidence_field').length > 0) return false;
+  if (findAdditionalFields(value, evidenceReferenceFields, 'evidence_field').length > 0)
+    return false;
   const lineRange = value.lineRange;
   if (!isRecord(lineRange)) return false;
-  if (findAdditionalFields(lineRange, lineRangeFields, 'line_range_field').length > 0)
-    return false;
+  if (findAdditionalFields(lineRange, lineRangeFields, 'line_range_field').length > 0) return false;
   const start = lineRange.start;
   const end = lineRange.end;
   return (
@@ -134,7 +134,11 @@ function validateStructuredAnswer(value: unknown): StructuredAnswerParseResult {
     if (!(field in value)) errors.push(`missing_${field}`);
   }
   errors.push(
-    ...findAdditionalFields(value, new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS), 'root_field')
+    ...findAdditionalFields(
+      value,
+      new Set(CONTEXTBENCH_STRUCTURED_ANSWER_REQUIRED_FIELDS),
+      'root_field'
+    )
   );
 
   if (!isJsonValue(value.answer)) errors.push('answer_not_json_value');
diff --git a/src/eval/contextbench-evidence-gate.ts b/src/eval/contextbench-evidence-gate.ts
index 815616e..79b4f3c 100644
--- a/src/eval/contextbench-evidence-gate.ts
+++ b/src/eval/contextbench-evidence-gate.ts
@@ -187,8 +187,15 @@ function hasOfficialEvaluatorProof(
   );
 }
 
-function hasDiagnosticFallback(row: ContextBenchRunManifestRow, score: ContextBenchScoreEvidence | undefined): boolean {
-  return row.scoring.claimBearing === false || Boolean(row.scoring.fallbackReason) || score?.mode === 'diagnostic_fallback';
+function hasDiagnosticFallback(
+  row: ContextBenchRunManifestRow,
+  score: ContextBenchScoreEvidence | undefined
+): boolean {
+  return (
+    row.scoring.claimBearing === false ||
+    Boolean(row.scoring.fallbackReason) ||
+    score?.mode === 'diagnostic_fallback'
+  );
 }
 
 function hasLaneIsolationProof(
@@ -198,7 +205,8 @@ function hasLaneIsolationProof(
 ): boolean {
   if (!isolation?.proven) return false;
   if (!policy) return false;
-  if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind)) return false;
+  if (!isolation.sourceKind || ['not_captured', 'env_override'].includes(isolation.sourceKind))
+    return false;
   if (policy.laneId !== row.lane_id) return false;
   if (isolation.laneId !== row.lane_id) return false;
   if (isolation.expectedContextTool !== policy.expectedContextTool) return false;
@@ -219,7 +227,8 @@ function hasRunnerProvenance(
   rawTrace: ContextBenchRawTraceEvidence | undefined,
   expectedRunnerHash: string | undefined
 ): boolean {
-  if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash) return false;
+  if (!rawTrace?.executor || !rawTrace.model || !rawTrace.runnerHash || !expectedRunnerHash)
+    return false;
   return (
     rawTrace.executor === row.taskExecution.executor &&
     rawTrace.model === row.taskExecution.model &&
@@ -228,7 +237,9 @@ function hasRunnerProvenance(
   );
 }
 
-function rowKey(row: Pick<ContextBenchRunManifestRow, 'lane_id' | 'task_id' | 'repeat_index'>): string {
+function rowKey(
+  row: Pick<ContextBenchRunManifestRow, 'lane_id' | 'task_id' | 'repeat_index'>
+): string {
   return `${row.lane_id}\u0000${row.task_id}\u0000${row.repeat_index}`;
 }
 
@@ -252,7 +263,11 @@ export function evaluateContextBenchEvidenceGate(
     });
   }
 
-  if (input.expectedTotalRows <= 0 || input.requiredLaneIds.length === 0 || input.requiredTaskIds.length === 0) {
+  if (
+    input.expectedTotalRows <= 0 ||
+    input.requiredLaneIds.length === 0 ||
+    input.requiredTaskIds.length === 0
+  ) {
     failures.push({
       code: 'denominator_contract_missing',
       message: 'Claim validation requires a frozen denominator contract.'
@@ -289,7 +304,11 @@ export function evaluateContextBenchEvidenceGate(
     }
     if (row.protocol_hash !== input.expectedProtocolHash) {
       failures.push(
-        makeFailure(row, 'protocol_hash_mismatch', 'Row protocol hash does not match the frozen protocol hash.')
+        makeFailure(
+          row,
+          'protocol_hash_mismatch',
+          'Row protocol hash does not match the frozen protocol hash.'
+        )
       );
     }
     if (row.task_manifest_hash !== input.expectedTaskManifestHash) {
@@ -351,7 +370,9 @@ export function evaluateContextBenchEvidenceGate(
 
         const artifacts = input.artifactsByRunId[row.run_id];
         if (row.status !== 'completed') {
-          failures.push(makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.'));
+          failures.push(
+            makeFailure(row, 'non_completed_status', 'Claim-bearing runs must complete.')
+          );
         }
 
         if (
@@ -377,11 +398,15 @@ export function evaluateContextBenchEvidenceGate(
           );
         }
 
-        if (!hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])) {
+        if (
+          !hasLaneIsolationProof(row, artifacts?.laneIsolation, input.lanePoliciesById[row.lane_id])
+        ) {
           failures.push(
             makeFailure(
               row,
-              artifacts?.laneIsolation?.violations?.length ? 'lane_isolation_violation' : 'lane_isolation_missing',
+              artifacts?.laneIsolation?.violations?.length
+                ? 'lane_isolation_violation'
+                : 'lane_isolation_missing',
               'Lane isolation must be proven by explicit allowed/observed tool evidence.'
             )
           );
@@ -410,7 +435,9 @@ export function evaluateContextBenchEvidenceGate(
     }
   }
 
-  const blockingFailures = failures.filter((failure) => failure.code !== 'artifact_verification_missing');
+  const blockingFailures = failures.filter(
+    (failure) => failure.code !== 'artifact_verification_missing'
+  );
   const shapePass = blockingFailures.length === 0;
   const claimPass = failures.length === 0;
   return {

From 6aed9d1a93f540f0d4a17142ab4527769b97cecb Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 18:25:08 +0200
Subject: [PATCH 03/11] fix(test): isolate ContextBench baseline Git env

---
 tests/contextbench-baseline-runner.test.ts    | 61 +++++++++++++------
 .../contextbench-baseline-schema-gate.test.ts | 26 +++++---
 tests/contextbench-baseline-snapshot.test.ts  |  4 ++
 3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts
index 41436fd..ab20304 100644
--- a/tests/contextbench-baseline-runner.test.ts
+++ b/tests/contextbench-baseline-runner.test.ts
@@ -39,6 +39,23 @@ type TaskManifest = { tasks: Array<{ instance_id: string }> };
 const manifest = manifestFixture as TaskManifest;
 vi.setConfig({ testTimeout: 30000 });
 
+for (const key of Object.keys(process.env)) { // Scrub GIT_*-prefixed variables from this process's own environment.
+  if (key.startsWith('GIT_')) delete process.env[key]; // prefix match is case-sensitive
+}
+
+function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { // Returns process.env minus GIT_* keys, with `overrides` merged last (unfiltered).
+  const env: NodeJS.ProcessEnv = {}; // fresh object; process.env is only read here, never mutated
+  for (const [key, value] of Object.entries(process.env)) {
+    if (!key.startsWith('GIT_')) env[key] = value; // copy everything except GIT_*-prefixed keys
+  }
+  return { ...env, ...overrides };
+}
+
+function ignoreWindowsTempCleanupRace(error: unknown): void { // Swallows EBUSY/ENOTEMPTY/EPERM errors and rethrows anything else; no platform check is made.
+  const code = (error as NodeJS.ErrnoException).code; // cast only; a null/undefined `error` would throw a TypeError on this line
+  if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? '')) throw error;
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`)),
@@ -60,12 +77,12 @@ function createCleanGitRepo(root: string): string {
   const repoPath = path.join(root, 'repo');
   mkdirSync(repoPath, { recursive: true });
   writeFileSync(path.join(repoPath, 'README.md'), '# ContextBench fixture\n', 'utf8');
-  execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' });
-  execFileSync('git', ['add', 'README.md'], { cwd: repoPath, encoding: 'utf8' });
+  execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8', env: childEnv() });
+  execFileSync('git', ['add', 'README.md'], { cwd: repoPath, encoding: 'utf8', env: childEnv() });
   execFileSync(
     'git',
     ['-c', 'user.name=ContextBench Test', '-c', 'user.email=contextbench@example.invalid', 'commit', '-m', 'fixture'],
-    { cwd: repoPath, encoding: 'utf8' }
+    { cwd: repoPath, encoding: 'utf8', env: childEnv() }
   );
   return repoPath;
 }
@@ -141,7 +158,9 @@ describe('ContextBench Phase 40 baseline runner', () => {
     } finally {
       rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
         recursive: true,
-        force: true
+        force: true,
+        maxRetries: 10,
+        retryDelay: 200
       });
     }
   });
@@ -222,14 +241,13 @@ describe('ContextBench Phase 40 baseline runner', () => {
     const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
     const stubClaude = writeStubClaude(tempRoot);
     const stubEvaluator = writeStubEvaluator(tempRoot, 0);
-    const env = {
-      ...process.env,
+    const env = childEnv({
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
       CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
       CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
         'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
       })
-    };
+    });
     try {
       execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
         encoding: 'utf8',
@@ -301,14 +319,13 @@ describe('ContextBench Phase 40 baseline runner', () => {
     const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
     const stubClaude = writeStubClaude(tempRoot);
     const stubEvaluator = writeStubEvaluator(tempRoot, 0, 'not json');
-    const env = {
-      ...process.env,
+    const env = childEnv({
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
       CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
       CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
         'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
       })
-    };
+    });
     try {
       execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
         encoding: 'utf8',
@@ -374,14 +391,13 @@ describe('ContextBench Phase 40 baseline runner', () => {
       const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
       const stubClaude = writeStubClaude(tempRoot);
       const stubEvaluator = writeStubEvaluator(tempRoot, 0, testCase.output);
-      const env = {
-        ...process.env,
+      const env = childEnv({
         CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
         CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
         CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
           'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
         })
-      };
+      });
       try {
         execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
           encoding: 'utf8',
@@ -426,11 +442,10 @@ describe('ContextBench Phase 40 baseline runner', () => {
     const payloadPath = writePayloadFile(tempRoot, taskId, repoPath);
     const stubClaude = writeStubClaude(tempRoot);
     const stubEvaluator = writeStubEvaluator(tempRoot, 1);
-    const env = {
-      ...process.env,
+    const env = childEnv({
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
       CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator])
-    };
+    });
     try {
       execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
         encoding: 'utf8',
@@ -1086,10 +1101,16 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(result.stdout).toContain('phase42 verification failed');
       expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      try {
+        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+          recursive: true,
+          force: true,
+          maxRetries: 10,
+          retryDelay: 200
+        });
+      } catch (error) {
+        ignoreWindowsTempCleanupRace(error);
+      }
     }
   });
 });
diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
index a1b808d..34b3e92 100644
--- a/tests/contextbench-baseline-schema-gate.test.ts
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -24,6 +24,18 @@ type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }>
 const manifest = manifestFixture as TaskManifest;
 vi.setConfig({ testTimeout: 30000 });
 
+for (const key of Object.keys(process.env)) { // Scrub GIT_*-prefixed variables from this process's own environment.
+  if (key.startsWith('GIT_')) delete process.env[key]; // prefix match is case-sensitive
+}
+
+function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { // Returns process.env minus GIT_* keys, with `overrides` merged last (unfiltered).
+  const env: NodeJS.ProcessEnv = {}; // fresh object; process.env is only read here, never mutated
+  for (const [key, value] of Object.entries(process.env)) {
+    if (!key.startsWith('GIT_')) env[key] = value; // copy everything except GIT_*-prefixed keys
+  }
+  return { ...env, ...overrides };
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)),
@@ -73,15 +85,14 @@ function createClaudeStub(
   chmodSync(shellStub, 0o755);
   return {
     stubDir,
-    env: {
-      ...process.env,
+    env: childEnv({
       PATH: `${stubDir}${path.delimiter}${process.env.PATH ?? ''}`,
       Path: `${stubDir}${path.delimiter}${process.env.Path ?? process.env.PATH ?? ''}`,
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubScript]),
       CLAUDE_STUB_STDOUT: stdout,
       CLAUDE_STUB_CWD_PATH: capture?.cwdPath,
       CLAUDE_STUB_STDIN_PATH: capture?.stdinPath
-    }
+    })
   };
 }
 
@@ -99,7 +110,7 @@ function writeTaskPayloads(
 
 function createGitCheckout(): string {
   const repoPath = mkdtempSync(path.join(tmpdir(), 'contextbench-task-repo-'));
-  execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8' });
+  execFileSync('git', ['init'], { cwd: repoPath, encoding: 'utf8', env: childEnv() });
   execFileSync(
     'git',
     [
@@ -112,7 +123,7 @@ function createGitCheckout(): string {
       '-m',
       'init'
     ],
-    { cwd: repoPath, encoding: 'utf8' }
+    { cwd: repoPath, encoding: 'utf8', env: childEnv() }
   );
   return repoPath;
 }
@@ -163,8 +174,7 @@ function createAdapterStub(
   );
   return {
     stubDir,
-    env: {
-      ...process.env,
+    env: childEnv({
       [`CONTEXTBENCH_${executor.toUpperCase()}_COMMAND`]: JSON.stringify([
         process.execPath,
         stubScript
@@ -172,7 +182,7 @@ function createAdapterStub(
       ADAPTER_STUB_EXECUTOR: executor,
       ADAPTER_STUB_CWD_PATH: capture?.cwdPath,
       ADAPTER_STUB_ARGS_PATH: capture?.argsPath
-    }
+    })
   };
 }
 
diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts
index 1061826..6ab133b 100644
--- a/tests/contextbench-baseline-snapshot.test.ts
+++ b/tests/contextbench-baseline-snapshot.test.ts
@@ -29,6 +29,10 @@ type BaselineSession = {
 
 vi.setConfig({ testTimeout: 30000 });
 
+for (const key of Object.keys(process.env)) { // Scrub GIT_*-prefixed variables from this process's own environment.
+  if (key.startsWith('GIT_')) delete process.env[key]; // prefix match is case-sensitive
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-`)),

From 0360cb97d99337438e1922bf52a76833b9d20fd6 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 18:29:24 +0200
Subject: [PATCH 04/11] fix(test): tolerate ContextBench temp cleanup races

---
 .../contextbench-baseline-schema-gate.test.ts | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
index 34b3e92..9513a7b 100644
--- a/tests/contextbench-baseline-schema-gate.test.ts
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -36,6 +36,11 @@ function childEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv {
   return { ...env, ...overrides };
 }
 
+function ignoreWindowsTempCleanupRace(error: unknown): void { // Swallows EBUSY/ENOTEMPTY/EPERM errors and rethrows anything else; no platform check is made.
+  const code = (error as NodeJS.ErrnoException).code; // cast only; a null/undefined `error` would throw a TypeError on this line
+  if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? '')) throw error;
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)),
@@ -398,10 +403,16 @@ describe('ContextBench Phase 40 schema gate', () => {
       };
       expect(trajectory.traj_data.pred_files).toContain('src/a.ts');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      try {
+        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+          recursive: true,
+          force: true,
+          maxRetries: 10,
+          retryDelay: 200
+        });
+      } catch (error) {
+        ignoreWindowsTempCleanupRace(error);
+      }
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });

From cad646d9d940c00ab96baa0ca806070722cced32 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 18:47:38 +0200
Subject: [PATCH 05/11] fix(test): relax slow Windows search timeouts

---
 tests/search-decision-card.test.ts | 18 ++++++++++--------
 tests/search-snippets.test.ts      | 14 ++++++++------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/tests/search-decision-card.test.ts b/tests/search-decision-card.test.ts
index d99b4c7..c6d77ae 100644
--- a/tests/search-decision-card.test.ts
+++ b/tests/search-decision-card.test.ts
@@ -40,6 +40,8 @@ type ToolCallResponse = {
   isError?: boolean;
 };
 
+const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; // milliseconds; passed as the timeout argument to the hooks and tests in this file
+
 function getToolCallHandler(
   server: unknown
 ): (request: ToolCallRequest) => Promise<ToolCallResponse> {
@@ -153,7 +155,7 @@ export class ProfileService {
       config: { skipEmbedding: true }
     });
     await indexer.index();
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   afterEach(async () => {
     if (originalArgv) {
@@ -170,7 +172,7 @@ export class ProfileService {
       await rmWithRetries(tempRoot);
       tempRoot = null;
     }
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('intent="edit" with multiple results returns full decision card with ready field', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -207,7 +209,7 @@ export class ProfileService {
     }
     expect(preflight.ready).toBeDefined();
     expect(typeof preflight.ready).toBe('boolean');
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('decision card has all expected fields when returned', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -259,7 +261,7 @@ export class ProfileService {
     if (preflight.whatWouldHelp) {
       expect(Array.isArray(preflight.whatWouldHelp)).toBe(true);
     }
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('intent="explore" returns lightweight preflight', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -290,7 +292,7 @@ export class ProfileService {
       expect(typeof preflight.ready).toBe('boolean');
       // Should NOT have full decision card fields for explore
     }
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('includes snippet field when includeSnippets=true', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -321,7 +323,7 @@ export class ProfileService {
     // At least some results should have a snippet
     const withSnippets = parsed.results.filter((result) => result.snippet);
     expect(withSnippets.length).toBeGreaterThan(0);
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('does not include snippet field when includeSnippets=false', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -350,7 +352,7 @@ export class ProfileService {
     parsed.results.forEach((result) => {
       expect(result.snippet).toBeUndefined();
     });
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('scope header starts snippet when includeSnippets=true', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -381,5 +383,5 @@ export class ProfileService {
       const firstLine = withSnippet.snippet.split('\n')[0].trim();
       expect(firstLine).toMatch(/^\/\//);
     }
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 });
diff --git a/tests/search-snippets.test.ts b/tests/search-snippets.test.ts
index 4b387ed..d5cf634 100644
--- a/tests/search-snippets.test.ts
+++ b/tests/search-snippets.test.ts
@@ -11,6 +11,8 @@ vi.mock('../src/core/reranker.js', () => ({
   isAmbiguous: vi.fn(() => false)
 }));
 
+const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; // milliseconds; passed as the timeout argument to the hooks and tests in this file
+
 describe('Search Snippets with Scope Headers', () => {
   let tempRoot: string | null = null;
 
@@ -98,7 +100,7 @@ export const VERSION = '1.0.0';
       config: { skipEmbedding: true }
     });
     await indexer.index();
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   afterEach(async () => {
     if (tempRoot) {
@@ -106,7 +108,7 @@ export const VERSION = '1.0.0';
       tempRoot = null;
     }
     delete process.env.CODEBASE_ROOT;
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('returns snippets when includeSnippets=true', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -136,7 +138,7 @@ export const VERSION = '1.0.0';
 
     const withSnippets = parsed.results.filter((r: any) => r.snippet);
     expect(withSnippets.length).toBeGreaterThan(0);
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('scope header is a comment line starting with //', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -167,7 +169,7 @@ export const VERSION = '1.0.0';
       // Scope header should be a comment line
       expect(firstLine).toMatch(/^\/\//);
     }
-  });
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('does not include snippet when includeSnippets=false', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -195,7 +197,7 @@ export const VERSION = '1.0.0';
     parsed.results.forEach((r: any) => {
       expect(r.snippet).toBeUndefined();
     });
-  });
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('snippet is a string starting with code or comment', async () => {
     if (!tempRoot) throw new Error('tempRoot not initialized');
@@ -225,5 +227,5 @@ export const VERSION = '1.0.0';
       expect(typeof withSnippet.snippet).toBe('string');
       expect(withSnippet.snippet.length).toBeGreaterThan(0);
     }
-  });
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 });

From 45139796f4e0cc51854de906b0b40b66beb8b4e3 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 20:08:08 +0200
Subject: [PATCH 06/11] fix(eval): align ContextBench harness evidence
 contracts

---
 scripts/contextbench-runner.mjs            |  8 +++++-
 src/eval/contextbench-artifacts.ts         | 13 ++-------
 src/eval/contextbench-scoring.ts           | 20 ++++++++++++--
 tests/contextbench-baseline-runner.test.ts |  9 +++---
 tests/contextbench-runner-contract.test.ts | 13 +++++++++
 tests/contextbench-scoring.test.ts         | 32 +++++++++++++++++++++-
 tests/impact-2hop.test.ts                  |  4 ++-
 tests/search-compact-mode.test.ts          |  4 ++-
 8 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs
index 11285f4..658af9f 100644
--- a/scripts/contextbench-runner.mjs
+++ b/scripts/contextbench-runner.mjs
@@ -995,6 +995,12 @@ function laneTelemetryOverrides() {
 
 function buildLaneIsolationEvidence(laneCard) {
   const telemetry = laneTelemetryOverrides()[laneCard.laneId];
+  const acceptedSourceKinds = new Set(['not_captured', 'env_override', 'transcript', 'proxy']);
+  const sourceKind = acceptedSourceKinds.has(telemetry?.sourceKind)
+    ? telemetry.sourceKind
+    : telemetry?.proofSource
+      ? 'env_override'
+      : 'not_captured';
   const observedTools = Array.isArray(telemetry?.observedTools)
     ? telemetry.observedTools.filter((tool) => typeof tool === 'string')
     : [];
@@ -1010,7 +1016,7 @@ function buildLaneIsolationEvidence(laneCard) {
   return {
     laneId: laneCard.laneId,
     proven,
-    sourceKind: telemetry?.proofSource ? 'env_override' : 'not_captured',
+    sourceKind,
     proofSource: typeof telemetry?.proofSource === 'string' ? telemetry.proofSource : 'not_captured',
     expectedContextTool,
     allowedTools: laneCard.allowedTools,
diff --git a/src/eval/contextbench-artifacts.ts b/src/eval/contextbench-artifacts.ts
index e888b34..bf13a14 100644
--- a/src/eval/contextbench-artifacts.ts
+++ b/src/eval/contextbench-artifacts.ts
@@ -7,6 +7,7 @@ import type {
   ContextBenchLaneSetupEvidenceRecord,
   ContextBenchLaneToolCard,
   ContextBenchRunManifestRow,
+  ContextBenchSetupIndexMetadata,
   ContextBenchTerminalStatus,
   ContextBenchTaskIdentity
 } from './contextbench-types.js';
@@ -125,6 +126,7 @@ export function buildManifestRow(params: {
   startedAt: string;
   completedAt: string;
   paths: ArtifactPathSet;
+  setupIndex: ContextBenchSetupIndexMetadata;
   hashes: Record<string, string>;
   executor: ContextBenchExecutor;
   model: string;
@@ -150,16 +152,7 @@ export function buildManifestRow(params: {
     setup_index_path: params.paths.setupIndexPath,
     prompt_path: params.paths.promptPath,
     lane_tool_card_path: params.paths.laneToolCardPath,
-    setupIndex: {
-      setupCommand: params.laneCard.setupCommand,
-      indexCommand: params.laneCard.indexCommand,
-      setupDurationMs: 0,
-      indexDurationMs: 0,
-      setupLogPath: params.paths.setupIndexPath,
-      indexLogPath: params.paths.setupIndexPath,
-      setupStatus: params.laneCard.setupCommand === 'none' ? 'not_required' : 'completed',
-      indexStatus: params.laneCard.indexCommand === 'none' ? 'not_required' : 'completed'
-    },
+    setupIndex: params.setupIndex,
     taskExecution: {
       model: params.model,
       timeoutSeconds: params.timeoutSeconds,
diff --git a/src/eval/contextbench-scoring.ts b/src/eval/contextbench-scoring.ts
index 8df61fb..e20c0f8 100644
--- a/src/eval/contextbench-scoring.ts
+++ b/src/eval/contextbench-scoring.ts
@@ -21,6 +21,7 @@ export interface OfficialEvaluatorParams {
   outputPath: string;
   cachePath?: string;
   cwd?: string;
+  claimAllowed?: boolean;
   runner: ContextBenchProcessRunner;
 }
 
@@ -32,6 +33,11 @@ export interface ContextBenchScoreResult {
   stdout: string;
   stderr: string;
   exitStatus: number | null;
+  exitCode: number | null;
+  officialEvaluatorFirst: boolean;
+  officialEvaluatorAttempted: boolean;
+  officialEvaluatorInvoked: boolean;
+  outputPath: string;
   fallbackReason?: string;
 }
 
@@ -67,11 +73,16 @@ export async function scoreWithOfficialEvaluatorFirst(
     const score = {
       status: 'completed' as const,
       mode: 'official_evaluator' as const,
-      claimBearing: true,
+      claimBearing: params.claimAllowed === true,
       command,
       stdout: result.stdout,
       stderr: result.stderr,
-      exitStatus: result.status
+      exitStatus: result.status,
+      exitCode: result.status,
+      officialEvaluatorFirst: true,
+      officialEvaluatorAttempted: true,
+      officialEvaluatorInvoked: true,
+      outputPath: params.outputPath
     };
     writeJson(params.outputPath, score);
     return score;
@@ -85,6 +96,11 @@ export async function scoreWithOfficialEvaluatorFirst(
     stdout: result.stdout,
     stderr: result.stderr,
     exitStatus: result.status,
+    exitCode: result.status,
+    officialEvaluatorFirst: true,
+    officialEvaluatorAttempted: true,
+    officialEvaluatorInvoked: true,
+    outputPath: params.outputPath,
     fallbackReason: 'official_evaluator_failed'
   };
   writeJson(params.outputPath, score);
diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts
index ab20304..e30557e 100644
--- a/tests/contextbench-baseline-runner.test.ts
+++ b/tests/contextbench-baseline-runner.test.ts
@@ -245,7 +245,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
       CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
       CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
-        'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+        'raw-native': { sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
       })
     });
     try {
@@ -299,10 +299,11 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(score.stdoutPath).toBeTruthy();
       expect(score.stderrPath).toBeTruthy();
       const rawTrace = JSON.parse(readFileSync(attempt?.raw_trace_path ?? '', 'utf8')) as {
-        laneIsolation?: { proven: boolean; proofSource: string; observedTools: string[] };
+        laneIsolation?: { proven: boolean; sourceKind: string; proofSource: string; observedTools: string[] };
       };
       expect(rawTrace.laneIsolation).toMatchObject({
         proven: true,
+        sourceKind: 'proxy',
         proofSource: 'stubbed_test_proxy',
         observedTools: ['native-read']
       });
@@ -323,7 +324,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
       CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
       CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
-        'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+        'raw-native': { sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
       })
     });
     try {
@@ -395,7 +396,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
         CONTEXTBENCH_CLAUDE_COMMAND: JSON.stringify([process.execPath, stubClaude]),
         CONTEXTBENCH_OFFICIAL_EVALUATOR_COMMAND: JSON.stringify([process.execPath, stubEvaluator]),
         CONTEXTBENCH_LANE_TELEMETRY_JSON: JSON.stringify({
-          'raw-native': { proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
+          'raw-native': { sourceKind: 'proxy', proofSource: 'stubbed_test_proxy', observedTools: ['native-read'] }
         })
       });
       try {
diff --git a/tests/contextbench-runner-contract.test.ts b/tests/contextbench-runner-contract.test.ts
index 76455ab..930afff 100644
--- a/tests/contextbench-runner-contract.test.ts
+++ b/tests/contextbench-runner-contract.test.ts
@@ -218,6 +218,16 @@ describe('ContextBench Phase 38 runner contract', () => {
       writeJsonArtifact(paths.structuredAnswerPath, { answer: 'x' });
       writeJsonArtifact(paths.trajectoryPath, { pred_files: [] });
       writeJsonArtifact(paths.scorePath, { claimBearing: false });
+      const setupIndex = {
+        setupCommand: laneCard.setupCommand,
+        indexCommand: laneCard.indexCommand,
+        setupDurationMs: 12,
+        indexDurationMs: 34,
+        setupLogPath: paths.setupIndexPath,
+        indexLogPath: paths.setupIndexPath,
+        setupStatus: 'not_required' as const,
+        indexStatus: 'not_required' as const
+      };
       const row = buildManifestRow({
         runId,
         protocolVersion: protocol.protocolVersion,
@@ -230,6 +240,7 @@ describe('ContextBench Phase 38 runner contract', () => {
         startedAt: '2026-04-27T00:00:00.000Z',
         completedAt: '2026-04-27T00:00:01.000Z',
         paths,
+        setupIndex,
         hashes: { protocol: hashJson(protocol) },
         executor: 'fake',
         model: 'fake-executor',
@@ -247,6 +258,8 @@ describe('ContextBench Phase 38 runner contract', () => {
       expect(rows).toHaveLength(2);
       expect(rows[1].status).toBe('invalid_schema');
       expect(rows[0].setupIndex.setupCommand).toBe(laneCard.setupCommand);
+      expect(rows[0].setupIndex.setupDurationMs).toBe(12);
+      expect(rows[0].setupIndex.indexDurationMs).toBe(34);
     } finally {
       rmSync(outDir, { recursive: true, force: true });
     }
diff --git a/tests/contextbench-scoring.test.ts b/tests/contextbench-scoring.test.ts
index 5e4c2a6..60bab4d 100644
--- a/tests/contextbench-scoring.test.ts
+++ b/tests/contextbench-scoring.test.ts
@@ -27,12 +27,17 @@ describe('ContextBench official-evaluator-first scoring', () => {
         predictionPath: path.join(outDir, 'trajectory.json'),
         outputPath: path.join(outDir, 'score.json'),
         cachePath: path.join(outDir, 'cache'),
+        claimAllowed: true,
         runner
       });
       expect(result).toMatchObject({
         status: 'completed',
         mode: 'official_evaluator',
-        claimBearing: true
+        claimBearing: true,
+        officialEvaluatorFirst: true,
+        officialEvaluatorAttempted: true,
+        officialEvaluatorInvoked: true,
+        exitCode: 0
       });
       expect(calls[0].command).toBe('python');
       expect(calls[0].args).toEqual(
@@ -43,6 +48,29 @@ describe('ContextBench official-evaluator-first scoring', () => {
     }
   });
 
+  it('does not mark successful official evaluator output claim-bearing without protocol permission', async () => {
+    const outDir = tempDir();
+    const runner: ContextBenchProcessRunner = async () => ({ status: 0, stdout: 'ok', stderr: '' });
+    try {
+      const result = await scoreWithOfficialEvaluatorFirst({
+        goldPath: path.join(outDir, 'gold.parquet'),
+        predictionPath: path.join(outDir, 'trajectory.json'),
+        outputPath: path.join(outDir, 'score.json'),
+        claimAllowed: false,
+        runner
+      });
+      expect(result).toMatchObject({
+        status: 'completed',
+        mode: 'official_evaluator',
+        claimBearing: false,
+        officialEvaluatorInvoked: true,
+        exitCode: 0
+      });
+    } finally {
+      rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+
   it('writes diagnostic non-claim-bearing fallback metadata when the evaluator fails', async () => {
     const outDir = tempDir();
     const runner: ContextBenchProcessRunner = async () => ({
@@ -61,6 +89,8 @@ describe('ContextBench official-evaluator-first scoring', () => {
         status: 'judge_failed',
         mode: 'diagnostic_fallback',
         claimBearing: false,
+        officialEvaluatorInvoked: true,
+        exitCode: 1,
         fallbackReason: 'official_evaluator_failed'
       });
       expect(result.stderr).toContain('No module named');
diff --git a/tests/impact-2hop.test.ts b/tests/impact-2hop.test.ts
index cf1f84f..010499e 100644
--- a/tests/impact-2hop.test.ts
+++ b/tests/impact-2hop.test.ts
@@ -15,6 +15,8 @@ import {
   RELATIONSHIPS_FILENAME
 } from '../src/constants/codebase-context.js';
 
+const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; // milliseconds; used as a per-test timeout argument in this file
+
 vi.mock('../src/core/reranker.js', () => ({
   rerank: vi.fn(async (_query: string, results: unknown) => results),
   getRerankerStatus: vi.fn(() => 'fallback'),
@@ -127,5 +129,5 @@ describe('Impact candidates (2-hop)', () => {
         `Expected hop 2 candidate src/a.ts, got impact.details=${JSON.stringify(details)}`
       );
     }
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 });
diff --git a/tests/search-compact-mode.test.ts b/tests/search-compact-mode.test.ts
index 92f0327..c4d573c 100644
--- a/tests/search-compact-mode.test.ts
+++ b/tests/search-compact-mode.test.ts
@@ -50,6 +50,8 @@ function parseSearchResponse(text: string): SearchResponse {
   return JSON.parse(text) as SearchResponse;
 }
 
+const SLOW_WINDOWS_TEST_TIMEOUT_MS = 60000; // milliseconds; used as a per-test timeout argument in this file
+
 describe('search_codebase compact/full mode', () => {
   let tempRoot: string | null = null;
   let originalArgv: string[] | null = null;
@@ -572,7 +574,7 @@ describe('search_codebase compact/full mode', () => {
     expect(results[0].filePath).toBe(actualChunk.filePath);
     expect(results[0].imports).toEqual(actualChunk.imports);
     expect(results[0].exports).toEqual(actualChunk.exports);
-  }, 30000);
+  }, SLOW_WINDOWS_TEST_TIMEOUT_MS);
 
   it('adds a warning only when the final full payload exceeds the compact budget threshold', async () => {
     const oversizedSummary = 'Token-heavy summary '.repeat(1200);

From a155d5646dbb283ffac1e71eef7fb26b8a59fa40 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 20:13:07 +0200
Subject: [PATCH 07/11] fix(test): tolerate ContextBench schema cleanup races

---
 tests/contextbench-baseline-schema-gate.test.ts | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
index 9513a7b..d5d91bf 100644
--- a/tests/contextbench-baseline-schema-gate.test.ts
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -475,10 +475,16 @@ describe('ContextBench Phase 40 schema gate', () => {
         expect.arrayContaining(['additional_root_field_unexpectedRoot'])
       );
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      try {
+        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
+          recursive: true,
+          force: true,
+          maxRetries: 10,
+          retryDelay: 200
+        });
+      } catch (error) {
+        ignoreWindowsTempCleanupRace(error);
+      }
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });

From c027703092a81c90b5c19371873858e5a87ec00c Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 20:21:18 +0200
Subject: [PATCH 08/11] fix(test): tolerate ContextBench runner cleanup races

---
 tests/contextbench-baseline-runner.test.ts | 66 ++++++++--------------
 1 file changed, 22 insertions(+), 44 deletions(-)

diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts
index e30557e..76b7264 100644
--- a/tests/contextbench-baseline-runner.test.ts
+++ b/tests/contextbench-baseline-runner.test.ts
@@ -56,6 +56,19 @@ function ignoreWindowsTempCleanupRace(error: unknown): void {
   if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? '')) throw error;
 }
 
+function cleanupSessionRoot(sessionRoot: string): void { // best-effort removal of the temp tree holding sessionRoot
+  try {
+    rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // 4 levels up; presumably the mkdtemp dir - confirm
+      recursive: true,
+      force: true, // a missing path is not an error
+      maxRetries: 10, // let rmSync retry transient failures before throwing
+      retryDelay: 200 // milliseconds between retries
+    });
+  } catch (error) { // reached only after rmSync has given up
+    ignoreWindowsTempCleanupRace(error); // rethrows unless code is EBUSY, ENOTEMPTY or EPERM
+  }
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-runner-`)),
@@ -156,12 +169,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(rows.every((row) => row.scoring.officialEvaluatorInvoked === false)).toBe(true);
       expect(rows.every((row) => !('taskWallTimeMs' in row.setupIndex))).toBe(true);
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true,
-        maxRetries: 10,
-        retryDelay: 200
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -226,10 +234,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(rawTrace.scriptedAgentDecisions).toBe(false);
       expect(rawTrace.antiScriptingBoundary).toEqual(expect.arrayContaining(['file_selection']));
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -553,10 +558,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       ) as { phase: number };
       expect(session.phase).toBe(41);
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -617,10 +619,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
         setupLogPath: measurement.setupLogPath
       });
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -656,10 +655,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement');
       expect(attempt?.setupIndex.setupStatus).toBe('setup_failed');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -740,10 +736,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
         indexLogPath
       });
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -883,10 +876,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(attempt?.status).toBe('setup_failed');
       expect(attempt?.scoring.fallbackReason).toContain('missing_setup_index_measurement');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -936,10 +926,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       ) as { reservations: unknown[] };
       expect(reservations.reservations).toHaveLength(20 * 6 * 3);
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -1102,16 +1089,7 @@ describe('ContextBench Phase 40 baseline runner', () => {
       expect(result.stdout).toContain('phase42 verification failed');
       expect(result.stderr).toContain('baseline seal blocked by Phase 42 evidence gate');
     } finally {
-      try {
-        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-          recursive: true,
-          force: true,
-          maxRetries: 10,
-          retryDelay: 200
-        });
-      } catch (error) {
-        ignoreWindowsTempCleanupRace(error);
-      }
+      cleanupSessionRoot(sessionRoot);
     }
   });
 });

From 5a5bf68302745f90b1dbdfba3ab06cfff961d4d5 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 20:24:56 +0200
Subject: [PATCH 09/11] fix(test): relax zombie guard timeout jitter

---
 tests/zombie-guard.test.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zombie-guard.test.ts b/tests/zombie-guard.test.ts
index f286d6e..07be992 100644
--- a/tests/zombie-guard.test.ts
+++ b/tests/zombie-guard.test.ts
@@ -170,8 +170,8 @@ describe('zombie process prevention', () => {
     expect(result.code).toBe(1);
     // Should still honor a short timeout (allow CI/Windows process jitter).
     expect(elapsed).toBeGreaterThan(800);
-    expect(elapsed).toBeLessThan(8_000);
-  }, 12_000);
+    expect(elapsed).toBeLessThan(12_000);
+  }, 15_000);
 
   it('exits after post-initialize idle timeout when the client stays silent', async () => {
     const rootPath = createIdleTestProjectRoot();

From 867ac700d98ad141ee180f6353784f9dab1f26fc Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 21:04:04 +0200
Subject: [PATCH 10/11] fix(eval): preserve ContextBench executor model
 provenance

---
 scripts/contextbench-runner.mjs                 | 4 ++--
 tests/contextbench-baseline-schema-gate.test.ts | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs
index 658af9f..332542c 100644
--- a/scripts/contextbench-runner.mjs
+++ b/scripts/contextbench-runner.mjs
@@ -1987,7 +1987,7 @@ function runOneBaselineAttempt(
   };
   const rawTrace = {
     executor,
-    model: executor === 'claude' ? model : 'fake-executor',
+    model: executor === 'fake' ? 'fake-executor' : model,
     runnerHash: runnerSourceHash(),
     claimBearing: false,
     stdout,
@@ -2481,7 +2481,7 @@ function runOneCodebaseContextArmAttempt(
   writeJson(paths.setupIndex, { ...setupIndex, diagnosticBaselineArm: arm });
   writeJson(paths.rawTrace, {
     executor,
-    model: executor === 'claude' ? model : 'fake-executor',
+    model: executor === 'fake' ? 'fake-executor' : model,
     runnerHash: runnerSourceHash(),
     claimBearing: false,
     baselineArmId: arm.baselineArmId,
diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
index d5d91bf..9fa3330 100644
--- a/tests/contextbench-baseline-schema-gate.test.ts
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -17,6 +17,7 @@ type ManifestRow = {
   structured_answer_path: string;
   trajectory_path: string;
   scoring: { claimBearing: boolean };
+  taskExecution: { model: string; executor: string };
 };
 
 type TaskManifest = { tasks: Array<{ instance_id: string; base_commit: string }> };
@@ -609,11 +610,15 @@ describe('ContextBench Phase 40 schema gate', () => {
         expect(row.status).toBe('completed');
         const rawTrace = JSON.parse(readFileSync(row.raw_trace_path, 'utf8')) as {
           executor: string;
+          model: string;
           executorSchemaMode: string;
           executorArgs: string[];
           taskContext: { materialized: boolean; verificationStrict: boolean };
           structuredAnswerParseErrors: string[];
         };
+        expect(rawTrace.model).toBe('stub');
+        expect(rawTrace.model).toBe(row.taskExecution.model);
+        expect(rawTrace.executor).toBe(row.taskExecution.executor);
         expect(rawTrace.taskContext).toMatchObject({
           materialized: true,
           verificationStrict: false

From c5a74afb64c65b255a363e31974fa7be6d58242d Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Wed, 29 Apr 2026 21:14:49 +0200
Subject: [PATCH 11/11] fix(test): harden ContextBench schema cleanup

---
 .../contextbench-baseline-schema-gate.test.ts | 65 ++++++-------------
 1 file changed, 21 insertions(+), 44 deletions(-)

diff --git a/tests/contextbench-baseline-schema-gate.test.ts b/tests/contextbench-baseline-schema-gate.test.ts
index 9fa3330..ad6fc0a 100644
--- a/tests/contextbench-baseline-schema-gate.test.ts
+++ b/tests/contextbench-baseline-schema-gate.test.ts
@@ -42,6 +42,19 @@ function ignoreWindowsTempCleanupRace(error: unknown): void {
   if (!['EBUSY', 'ENOTEMPTY', 'EPERM'].includes(code ?? '')) throw error;
 }
 
+function cleanupSessionRoot(sessionRoot: string): void { // removes the ancestor temp directory of sessionRoot, tolerating cleanup races
+  try {
+    rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // target is sessionRoot's 4th ancestor - verify against tempSessionRoot
+      recursive: true,
+      force: true, // do not fail when the directory no longer exists
+      maxRetries: 10, // retry count used by rmSync for transient errors
+      retryDelay: 200 // wait in ms between those retries
+    });
+  } catch (error) { // rmSync still failed after its retries
+    ignoreWindowsTempCleanupRace(error); // swallows EBUSY/ENOTEMPTY/EPERM only; anything else is rethrown
+  }
+}
+
 function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   return path.join(
     mkdtempSync(path.join(tmpdir(), `contextbench-${phase}-schema-gate-`)),
@@ -298,10 +311,7 @@ describe('ContextBench Phase 40 schema gate', () => {
       };
       expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -404,16 +414,7 @@ describe('ContextBench Phase 40 schema gate', () => {
       };
       expect(trajectory.traj_data.pred_files).toContain('src/a.ts');
     } finally {
-      try {
-        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-          recursive: true,
-          force: true,
-          maxRetries: 10,
-          retryDelay: 200
-        });
-      } catch (error) {
-        ignoreWindowsTempCleanupRace(error);
-      }
+      cleanupSessionRoot(sessionRoot);
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });
@@ -476,16 +477,7 @@ describe('ContextBench Phase 40 schema gate', () => {
         expect.arrayContaining(['additional_root_field_unexpectedRoot'])
       );
     } finally {
-      try {
-        rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-          recursive: true,
-          force: true,
-          maxRetries: 10,
-          retryDelay: 200
-        });
-      } catch (error) {
-        ignoreWindowsTempCleanupRace(error);
-      }
+      cleanupSessionRoot(sessionRoot);
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });
@@ -539,10 +531,7 @@ describe('ContextBench Phase 40 schema gate', () => {
         ])
       );
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
     }
   });
 
@@ -632,10 +621,7 @@ describe('ContextBench Phase 40 schema gate', () => {
         }
       }
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       for (const stubDir of stubs) rmSync(stubDir, { recursive: true, force: true });
@@ -707,10 +693,7 @@ describe('ContextBench Phase 40 schema gate', () => {
         repoCheckoutPath: repoPath
       });
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });
@@ -877,10 +860,7 @@ describe('ContextBench Phase 40 schema gate', () => {
       expect(rawTrace.taskContext.errors).toContain('repo_checkout_dirty');
       expect(rawTrace.taskContext.statusShort).toContain('dirty.txt');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(dirtyRepo, { recursive: true, force: true });
     }
@@ -953,10 +933,7 @@ describe('ContextBench Phase 40 schema gate', () => {
       };
       expect(fallbackAnswer.unsupportedClaims).toContain('missing_or_invalid_structured_answer');
     } finally {
-      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
-        recursive: true,
-        force: true
-      });
+      cleanupSessionRoot(sessionRoot);
       rmSync(repoPath, { recursive: true, force: true });
       rmSync(payloadDir, { recursive: true, force: true });
       rmSync(stubDir, { recursive: true, force: true });