diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index 7430bc02..cfed68de 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -1,7 +1,9 @@
+import { execFile } from 'node:child_process';
 import { createHash, randomUUID } from 'node:crypto';
 import { existsSync } from 'node:fs';
 import { copyFile, mkdir, readdir, stat } from 'node:fs/promises';
 import path from 'node:path';
+import { promisify } from 'node:util';
 
 import micromatch from 'micromatch';
 import pLimit from 'p-limit';
@@ -94,6 +96,9 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j
 
 type MaybePromise = T | Promise;
 
+const execFileAsync = promisify(execFile); // promise-returning form of execFile, used for the workspace git commands below
+const WORKSPACE_GIT_TIMEOUT_MS = 300_000; // 5 minutes; passed as the execFile `timeout` for each workspace git command
+
 function classifyQualityStatus(score: number, threshold = DEFAULT_THRESHOLD): ExecutionStatus {
   return score >= threshold ? 'ok' : 'quality_failure';
 }
@@ -152,6 +157,43 @@ function hooksEnabled(
   return workspace?.hooks?.enabled !== false;
 }
 
+function workspaceGitEnv(): Record { // Builds the environment for git child processes run inside a workspace. NOTE(review): `Record` has no type arguments in this copy of the patch - they look stripped; confirm against the repository.
+  const env = { ...process.env }; // shallow copy, so the deletes below never touch process.env itself
+  for (const key of Object.keys(env)) {
+    if (key.startsWith('GIT_') && key !== 'GIT_SSH_COMMAND') { // drop inherited GIT_* variables; GIT_SSH_COMMAND is skipped here but is replaced in the returned object anyway
+      delete env[key];
+    }
+  }
+  return {
+    ...env,
+    GIT_TERMINAL_PROMPT: '0', // per the git documentation, '0' stops git from prompting on the terminal
+    GIT_ASKPASS: '',
+    GIT_SSH_COMMAND: 'ssh -o BatchMode=yes', // listed after ...env, so it always wins over an inherited value; BatchMode makes ssh fail instead of prompting
+  };
+}
+
+async function resetWorkspaceRoot( // Returns the git work tree at workspacePath to a clean state: `git reset --hard` followed by `git clean`.
+  workspacePath: string, // directory that must directly contain a `.git` entry; also used as cwd for both commands
+  resetMode: 'fast' | 'strict', // 'strict' cleans with -fdx (per git-clean docs, ignored files too); 'fast' cleans with -fd
+  baselineRef?: string, // ref handed to `git reset --hard`; 'HEAD' is used when this is undefined or null
+): Promise { // resolves false when there is no `.git` entry (no git command is run), true after both commands succeed
+  if (!existsSync(path.join(workspacePath, '.git'))) {
+    return false;
+  }
+
+  const cleanFlag = resetMode === 'strict' ? '-fdx' : '-fd';
+  const opts = {
+    cwd: workspacePath,
+    timeout: WORKSPACE_GIT_TIMEOUT_MS,
+    env: workspaceGitEnv(),
+    maxBuffer: 50 * 1024 * 1024, // 50 MiB limit on captured stdout/stderr per command
+  };
+
+  await execFileAsync('git', ['reset', '--hard', baselineRef ?? 'HEAD'], opts); // not wrapped in try/catch: a rejected git command propagates to the caller
+  await execFileAsync('git', ['clean', cleanFlag], opts);
+  return true;
+}
+
 /**
  * Extract workspaceTemplate from a resolved target's config.
  * Returns undefined if the target doesn't support workspace templates.
@@ -1847,6 +1889,45 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {
     expect(result.error).toBeUndefined();
     expect(result.executionStatus).toBe('ok');
   });
+
+  it('applies reset-only before_each hooks to a shared workspace root', async () => {
+    const { mkdtemp, writeFile, mkdir, readFile, access } = await import('node:fs/promises');
+    const { initializeBaseline } = await import('../../src/evaluation/workspace/file-changes.js');
+
+    testDir = await mkdtemp(path.join(tmpdir(), 'agentv-orch-shared-reset-'));
+    await mkdir(testDir, { recursive: true });
+    await writeFile(path.join(testDir, 'seed.txt'), 'clean\n');
+    const sharedBaselineCommit = await initializeBaseline(testDir); // presumably records the current tree as the baseline commit - confirm in file-changes.js
+
+    await writeFile(path.join(testDir, 'seed.txt'), 'dirty\n'); // modify the seeded file after the baseline was taken
+    await writeFile(path.join(testDir, 'stale.txt'), 'stale\n'); // add a file that did not exist at baseline time
+
+    const provider = new SequenceProvider('mock', {
+      responses: [
+        {
+          output: [{ role: 'assistant', content: [{ type: 'text', text: 'answer' }] }],
+        },
+      ],
+    });
+
+    const evalCase: EvalTest = {
+      ...baseTestCase,
+      workspace: {
+        path: testDir,
+        hooks: {
+          before_each: {
+            reset: 'fast', // hook carries only a reset mode, no command
+          },
+        },
+      },
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: evaluatorRegistry,
+      evalRunId: 'test-run-shared-reset',
+      cleanupWorkspaces: false,
+      sharedWorkspacePath: testDir,
+      sharedBaselineCommit,
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(result.executionStatus).toBe('ok');
+    expect((await readFile(path.join(testDir, 'seed.txt'), 'utf8')).trim()).toBe('clean'); // modified file is back to its baseline content
+    await expect(access(path.join(testDir, 'stale.txt'))).rejects.toThrow(); // the extra file is gone, so access() must reject
+  });
+
+  it('refreshes the baseline after shared before_each scripts run', async () => {
+    const { mkdtemp, writeFile, readFile } = await import('node:fs/promises');
+    const { initializeBaseline } = await import('../../src/evaluation/workspace/file-changes.js');
+
+    testDir = await mkdtemp(path.join(tmpdir(), 'agentv-orch-shared-baseline-'));
+    const sharedBaselineCommit = await initializeBaseline(testDir);
+    const beforeEachScript = path.join(testDir, 'before-each.js');
+    writeFileSync( // hook script: parses JSON from stdin (fd 0) and writes setup.txt under payload.workspace_path
+      beforeEachScript,
+      `const fs = require('node:fs');
+const payload = JSON.parse(fs.readFileSync(0, 'utf8'));
+fs.writeFileSync(require('node:path').join(payload.workspace_path, 'setup.txt'), 'setup from hook\\n');
+`,
+      'utf8',
+    );
+
+    const provider: Provider = {
+      id: 'writer:mock',
+      kind: 'mock',
+      targetName: 'mock',
+      async invoke(request: ProviderRequest): Promise {
+        const cwd = request.cwd;
+        if (!cwd) {
+          throw new Error('cwd was not provided');
+        }
+        writeFileSync(path.join(cwd, 'agent.txt'), 'agent output\n'); // stands in for an agent that creates a file in its working directory
+        return {
+          output: [{ role: 'assistant', content: 'done' }],
+        };
+      },
+    };
+
+    const evalCase: EvalTest = {
+      ...baseTestCase,
+      workspace: {
+        path: testDir,
+        hooks: {
+          before_each: {
+            command: [process.execPath, beforeEachScript], // run the script with the current Node binary
+          },
+        },
+      },
+    };
+
+    const result = await runEvalCase({
+      evalCase,
+      provider,
+      target: baseTarget,
+      evaluators: evaluatorRegistry,
+      evalRunId: 'test-run-shared-before-each-script',
+      cleanupWorkspaces: false,
+      sharedWorkspacePath: testDir,
+      sharedBaselineCommit,
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(result.executionStatus).toBe('ok');
+    expect((await readFile(path.join(testDir, 'setup.txt'), 'utf8')).trim()).toBe(
+      'setup from hook',
+    );
+    expect(result.fileChanges).toContain('agent.txt'); // the file written by the provider is reported as a change
+    expect(result.fileChanges).not.toContain('setup.txt'); // the hook's file must not be reported, i.e. it belongs to the refreshed baseline
+  });
 });
 
 describe('deterministic assertion evaluators in orchestrator', () => {