Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 100 additions & 10 deletions packages/core/src/evaluation/orchestrator.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { execFile } from 'node:child_process';
import { createHash, randomUUID } from 'node:crypto';
import { existsSync } from 'node:fs';
import { copyFile, mkdir, readdir, stat } from 'node:fs/promises';
import path from 'node:path';
import { promisify } from 'node:util';
import micromatch from 'micromatch';
import pLimit from 'p-limit';

Expand Down Expand Up @@ -94,6 +96,9 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j

type MaybePromise<T> = T | Promise<T>;

const execFileAsync = promisify(execFile);
const WORKSPACE_GIT_TIMEOUT_MS = 300_000;

/**
 * Maps a numeric score onto an execution status.
 *
 * @param score - Score to classify.
 * @param threshold - Minimum passing score (inclusive); defaults to `DEFAULT_THRESHOLD`.
 * @returns `'ok'` when `score >= threshold`, otherwise `'quality_failure'`
 *   (which includes a `NaN` score, since the comparison is then false).
 */
function classifyQualityStatus(score: number, threshold = DEFAULT_THRESHOLD): ExecutionStatus {
  if (score >= threshold) {
    return 'ok';
  }
  return 'quality_failure';
}
Expand Down Expand Up @@ -152,6 +157,43 @@ function hooksEnabled(
return workspace?.hooks?.enabled !== false;
}

/**
 * Builds the environment passed to workspace git commands.
 *
 * Works on a copy of `process.env` (the live environment is not modified):
 * inherited `GIT_*` variables are dropped, then three git settings are pinned.
 * Note that an inherited `GIT_SSH_COMMAND` passes the filter but is always
 * replaced by the fixed value assigned in the returned object.
 *
 * @returns Environment map for child git processes.
 */
function workspaceGitEnv(): Record<string, string | undefined> {
  const inherited = Object.fromEntries(
    Object.entries(process.env).filter(
      ([name]) => !name.startsWith('GIT_') || name === 'GIT_SSH_COMMAND',
    ),
  );

  // Pinned values win over anything inherited because they are spread last.
  return {
    ...inherited,
    GIT_TERMINAL_PROMPT: '0',
    GIT_ASKPASS: '',
    GIT_SSH_COMMAND: 'ssh -o BatchMode=yes',
  };
}

/**
 * Restores a git-backed workspace directory to a known state.
 *
 * Runs `git reset --hard` against `baselineRef` (or `HEAD` when none is given),
 * then `git clean`. Nothing is executed when `workspacePath` has no `.git` entry.
 *
 * @param workspacePath - Directory to reset; used as the working directory for git.
 * @param resetMode - `'strict'` cleans with `-fdx`; `'fast'` cleans with `-fd`.
 * @param baselineRef - Ref to hard-reset to; falls back to `HEAD` when null/undefined.
 * @returns `false` when no `.git` entry exists, `true` once both git commands have completed.
 * @throws Propagates the rejection from `execFileAsync` if either git command fails.
 */
async function resetWorkspaceRoot(
  workspacePath: string,
  resetMode: 'fast' | 'strict',
  baselineRef?: string,
): Promise<boolean> {
  const gitEntry = path.join(workspacePath, '.git');
  const isGitWorkspace = existsSync(gitEntry);
  if (!isGitWorkspace) {
    return false;
  }

  let cleanFlag = '-fd';
  if (resetMode === 'strict') {
    cleanFlag = '-fdx';
  }

  // Both commands share the same working directory, timeout, environment and output limit.
  const execOptions = {
    cwd: workspacePath,
    timeout: WORKSPACE_GIT_TIMEOUT_MS,
    env: workspaceGitEnv(),
    maxBuffer: 50 * 1024 * 1024,
  };
  const targetRef = baselineRef ?? 'HEAD';

  // Order matters: reset tracked files first, then remove untracked leftovers.
  await execFileAsync('git', ['reset', '--hard', targetRef], execOptions);
  await execFileAsync('git', ['clean', cleanFlag], execOptions);
  return true;
}

/**
* Extract workspaceTemplate from a resolved target's config.
* Returns undefined if the target doesn't support workspace templates.
Expand Down Expand Up @@ -1847,6 +1889,45 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
}
}

let beforeEachNeedsFreshBaseline = false;

// Apply before_each reset before any setup scripts run.
if (
caseHooksEnabled &&
workspacePath &&
evalCase.workspace?.hooks?.before_each?.reset &&
evalCase.workspace.hooks.before_each.reset !== 'none'
) {
try {
if (repoManager && evalCase.workspace.repos?.length) {
await repoManager.reset(
evalCase.workspace.repos,
workspacePath,
evalCase.workspace.hooks.before_each.reset,
);
} else {
await resetWorkspaceRoot(
workspacePath,
evalCase.workspace.hooks.before_each.reset,
sharedBaselineCommit,
);
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return buildErrorResult(
evalCase,
target.name,
nowFn(),
new Error(`before_each reset failed: ${message}`),
promptInputs,
provider,
'setup',
'script_error',
verbose,
);
}
}

// Execute before_each hook (runs before each test for any workspace)
const caseBeforeEachHook = evalCase.workspace?.hooks?.before_each;
if (workspacePath && caseHooksEnabled && hasHookCommand(caseBeforeEachHook)) {
Expand All @@ -1864,6 +1945,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
toScriptConfig(beforeEachHook, 'before_each', `test '${evalCase.id}'`),
scriptContext,
);
beforeEachNeedsFreshBaseline = true;
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
return buildErrorResult(
Expand All @@ -1883,7 +1965,9 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
// Initialize git baseline for file-change tracking.
// Runs git init + baseline commit before the agent, then diffs after.
// Supports nested repos via --submodule=diff.
let baselineCommit: string | undefined = sharedBaselineCommit;
let baselineCommit: string | undefined = beforeEachNeedsFreshBaseline
? undefined
: sharedBaselineCommit;
if (!baselineCommit && workspacePath) {
try {
baselineCommit = await initializeBaseline(workspacePath);
Expand Down Expand Up @@ -2075,21 +2159,27 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati

const providerError = extractProviderError(providerResponse);

// Reset repos before after_each hook (if configured)
// Reset workspace state before after_each hook (if configured)
if (
caseHooksEnabled &&
repoManager &&
workspacePath &&
evalCase.workspace?.hooks?.after_each?.reset &&
evalCase.workspace.hooks.after_each.reset !== 'none' &&
evalCase.workspace.repos
evalCase.workspace.hooks.after_each.reset !== 'none'
) {
try {
await repoManager.reset(
evalCase.workspace.repos,
workspacePath,
evalCase.workspace.hooks.after_each.reset,
);
if (repoManager && evalCase.workspace.repos?.length) {
await repoManager.reset(
evalCase.workspace.repos,
workspacePath,
evalCase.workspace.hooks.after_each.reset,
);
} else {
await resetWorkspaceRoot(
workspacePath,
evalCase.workspace.hooks.after_each.reset,
baselineCommit,
);
}
} catch {
// Reset failures are non-fatal (like after_each)
}
Expand Down
12 changes: 0 additions & 12 deletions packages/core/src/evaluation/validation/eval-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -466,18 +466,6 @@ function validateWorkspaceRepoConfig(
}
}

// Reset without repos warning
if (isObject(afterEachHook) && afterEachHook.reset && afterEachHook.reset !== 'none') {
if (!Array.isArray(repos) || repos.length === 0) {
errors.push({
severity: 'warning',
filePath,
location: 'workspace.hooks.after_each',
message: `hooks.after_each.reset '${afterEachHook.reset}' has no effect without repos.`,
});
}
}

// after_each reset with per_test isolation warning
if (isObject(afterEachHook) && afterEachHook.reset && isolation === 'per_test') {
errors.push({
Expand Down
113 changes: 113 additions & 0 deletions packages/core/test/evaluation/orchestrator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1761,6 +1761,119 @@ rl.on('close', () => {
expect(result.error).toBeUndefined();
expect(result.executionStatus).toBe('ok');
});

// Checks that a before_each hook carrying only `reset: 'fast'` (no command) restores a
// shared workspace directory to its baseline before the eval case runs.
it('applies reset-only before_each hooks to a shared workspace root', async () => {
const { mkdtemp, writeFile, mkdir, readFile, access } = await import('node:fs/promises');
const { initializeBaseline } = await import('../../src/evaluation/workspace/file-changes.js');

// Create the workspace with a single seed file, then record the baseline commit for it.
testDir = await mkdtemp(path.join(tmpdir(), 'agentv-orch-shared-reset-'));
await mkdir(testDir, { recursive: true });
await writeFile(path.join(testDir, 'seed.txt'), 'clean\n');
const sharedBaselineCommit = await initializeBaseline(testDir);

// Dirty the workspace after the baseline: overwrite the seed file and add a new file.
await writeFile(path.join(testDir, 'seed.txt'), 'dirty\n');
await writeFile(path.join(testDir, 'stale.txt'), 'stale\n');

const provider = new SequenceProvider('mock', {
responses: [
{
output: [{ role: 'assistant', content: [{ type: 'text', text: 'answer' }] }],
},
],
});

// The case declares no repos and no hook command, only the reset mode.
const evalCase: EvalTest = {
...baseTestCase,
workspace: {
path: testDir,
hooks: {
before_each: {
reset: 'fast',
},
},
},
};

// Run against the pre-existing directory as a shared workspace with its recorded baseline.
const result = await runEvalCase({
evalCase,
provider,
target: baseTarget,
evaluators: evaluatorRegistry,
evalRunId: 'test-run-shared-reset',
cleanupWorkspaces: false,
sharedWorkspacePath: testDir,
sharedBaselineCommit,
});

expect(result.error).toBeUndefined();
expect(result.executionStatus).toBe('ok');
// seed.txt is back to its pre-baseline content and the file added afterwards is gone.
expect((await readFile(path.join(testDir, 'seed.txt'), 'utf8')).trim()).toBe('clean');
await expect(access(path.join(testDir, 'stale.txt'))).rejects.toThrow();
});

// Checks that files written by a before_each command in a shared workspace are not
// reported as file changes, while files written by the provider are.
it('refreshes the baseline after shared before_each scripts run', async () => {
const { mkdtemp, writeFile, readFile } = await import('node:fs/promises');
const { initializeBaseline } = await import('../../src/evaluation/workspace/file-changes.js');

testDir = await mkdtemp(path.join(tmpdir(), 'agentv-orch-shared-baseline-'));
const sharedBaselineCommit = await initializeBaseline(testDir);
// Hook script: parses a JSON payload from stdin (fd 0) and writes setup.txt into
// the directory named by payload.workspace_path.
const beforeEachScript = path.join(testDir, 'before-each.js');
writeFileSync(
beforeEachScript,
`const fs = require('node:fs');
const payload = JSON.parse(fs.readFileSync(0, 'utf8'));
fs.writeFileSync(require('node:path').join(payload.workspace_path, 'setup.txt'), 'setup from hook\\n');
`,
'utf8',
);

// Stand-in provider that writes agent.txt into the request's cwd and fails if no cwd is given.
const provider: Provider = {
id: 'writer:mock',
kind: 'mock',
targetName: 'mock',
async invoke(request: ProviderRequest): Promise<ProviderResponse> {
const cwd = request.cwd;
if (!cwd) {
throw new Error('cwd was not provided');
}
writeFileSync(path.join(cwd, 'agent.txt'), 'agent output\n');
return {
output: [{ role: 'assistant', content: 'done' }],
};
},
};

// The hook is run with the same Node binary that is executing the test.
const evalCase: EvalTest = {
...baseTestCase,
workspace: {
path: testDir,
hooks: {
before_each: {
command: [process.execPath, beforeEachScript],
},
},
},
};

const result = await runEvalCase({
evalCase,
provider,
target: baseTarget,
evaluators: evaluatorRegistry,
evalRunId: 'test-run-shared-before-each-script',
cleanupWorkspaces: false,
sharedWorkspacePath: testDir,
sharedBaselineCommit,
});

expect(result.error).toBeUndefined();
expect(result.executionStatus).toBe('ok');
// The hook's output exists on disk...
expect((await readFile(path.join(testDir, 'setup.txt'), 'utf8')).trim()).toBe(
'setup from hook',
);
// ...but only the provider's file shows up in the reported changes.
expect(result.fileChanges).toContain('agent.txt');
expect(result.fileChanges).not.toContain('setup.txt');
});
});

describe('deterministic assertion evaluators in orchestrator', () => {
Expand Down
Loading