diff --git a/apps/cli/src/commands/results/remote.ts b/apps/cli/src/commands/results/remote.ts index 3969480a..70d82552 100644 --- a/apps/cli/src/commands/results/remote.ts +++ b/apps/cli/src/commands/results/remote.ts @@ -290,7 +290,10 @@ export async function ensureRemoteRunAvailable( throw new Error(`Remote manifest path is outside the results repo clone: ${meta.path}`); } - const relativeRunPath = path.posix.relative('runs', path.posix.dirname(relativeManifestPath)); + const relativeRunPath = path.posix.relative( + '.agentv/results/runs', + path.posix.dirname(relativeManifestPath), + ); await materializeGitRun(config.path, relativeRunPath); } diff --git a/apps/cli/src/commands/results/serve.ts b/apps/cli/src/commands/results/serve.ts index 746d487e..a7632a89 100644 --- a/apps/cli/src/commands/results/serve.ts +++ b/apps/cli/src/commands/results/serve.ts @@ -376,12 +376,14 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { let target: string | undefined; let experiment = inferExperimentFromRunId(m.raw_filename); let passRate = m.passRate; + let avgScore = m.avgScore; try { const records = await loadLightweightResultsForMeta(searchDir, m); if (records.length > 0) { target = records[0].target; experiment = records[0].experiment ?? experiment; passRate = records.filter((r) => r.score >= passThreshold).length / records.length; + avgScore = records.reduce((sum, r) => sum + r.score, 0) / records.length; } else { // Run is in-progress with 0 results written yet — fall back to the // in-memory target stored when the Studio launched this run. 
@@ -402,7 +404,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) { timestamp: m.timestamp, test_count: m.testCount, pass_rate: passRate, - avg_score: m.avgScore, + avg_score: avgScore, size_bytes: m.sizeBytes, source: m.source, ...(target && { target }), diff --git a/apps/cli/test/commands/results/serve.test.ts b/apps/cli/test/commands/results/serve.test.ts index e4fd28c7..a7b0e938 100644 --- a/apps/cli/test/commands/results/serve.test.ts +++ b/apps/cli/test/commands/results/serve.test.ts @@ -107,7 +107,7 @@ function writeRemoteRunArtifact( /^(\d{4}-\d{2}-\d{2})T(\d{2})-(\d{2})-(\d{2})-(\d{3})Z$/, '$1T$2:$3:$4.$5Z', ); - const runDir = path.join(cloneDir, 'runs', experiment, timestamp); + const runDir = path.join(cloneDir, '.agentv', 'results', 'runs', experiment, timestamp); mkdirSync(runDir, { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord)); writeFileSync( @@ -653,6 +653,8 @@ describe('serve app', () => { process.env.AGENTV_HOME, 'results', 'EntityProcess-agentv-evals', + '.agentv', + 'results', 'runs', 'default', '2026-03-26T10-00-00-000Z', @@ -705,7 +707,13 @@ describe('serve app', () => { const listRes = await app.request('/api/runs'); expect(listRes.status).toBe(200); const listData = (await listRes.json()) as { - runs: Array<{ filename: string; source: string; experiment?: string; pass_rate?: number }>; + runs: Array<{ + filename: string; + source: string; + experiment?: string; + pass_rate?: number; + avg_score?: number; + }>; }; expect(listData.runs).toHaveLength(1); expect(listData.runs[0]).toMatchObject({ @@ -713,6 +721,7 @@ describe('serve app', () => { source: 'remote', experiment: 'green-uat', pass_rate: 1, + avg_score: 1, }); const detailRes = await app.request( @@ -749,6 +758,8 @@ describe('serve app', () => { const runManifestPath = path.join( cloneDir, + '.agentv', + 'results', 'runs', 'external-sync', '2026-03-26T11-00-00-000Z', diff --git 
a/apps/studio/src/components/ExperimentDetail.tsx b/apps/studio/src/components/ExperimentDetail.tsx index 2f354afe..8a8e1996 100644 --- a/apps/studio/src/components/ExperimentDetail.tsx +++ b/apps/studio/src/components/ExperimentDetail.tsx @@ -13,6 +13,7 @@ import { projectRunListOptions, runListOptions, } from '~/lib/api'; +import { dedupeSyncedRuns } from '~/lib/run-dedupe'; import { RunList } from './RunList'; @@ -45,12 +46,12 @@ export function ExperimentDetail({ experimentName, projectId }: ExperimentDetail } const experiment = experimentsData?.experiments?.find((entry) => entry.name === experimentName); - const runs = (runListData?.runs ?? []).filter( - (run) => (run.experiment ?? 'default') === experimentName, + const runs = dedupeSyncedRuns( + (runListData?.runs ?? []).filter((run) => (run.experiment ?? 'default') === experimentName), ); const passRate = experiment?.pass_rate ?? 0; - const runCount = experiment?.run_count ?? runs.length; + const runCount = runs.length; const targetCount = experiment?.target_count ?? 
0; return ( diff --git a/apps/studio/src/components/TargetsTab.tsx b/apps/studio/src/components/TargetsTab.tsx index 66540fe9..5d0c4f68 100644 --- a/apps/studio/src/components/TargetsTab.tsx +++ b/apps/studio/src/components/TargetsTab.tsx @@ -15,6 +15,7 @@ import { runListOptions, targetsOptions, } from '~/lib/api'; +import { dedupeSyncedRuns } from '~/lib/run-dedupe'; import type { RunMeta, TargetsResponse } from '~/lib/types'; import { PassRatePill } from './PassRatePill'; @@ -68,7 +69,7 @@ export function TargetsTab({ projectId }: TargetsTabProps = {}) { } return [...groups.entries()] - .map(([name, experimentRuns]) => buildExperimentGroup(name, experimentRuns)) + .map(([name, experimentRuns]) => buildExperimentGroup(name, dedupeSyncedRuns(experimentRuns))) .sort((a, b) => { if (a.latestTimestamp && b.latestTimestamp && a.latestTimestamp !== b.latestTimestamp) { return b.latestTimestamp.localeCompare(a.latestTimestamp); diff --git a/apps/studio/src/lib/run-dedupe.test.ts b/apps/studio/src/lib/run-dedupe.test.ts new file mode 100644 index 00000000..b143d4f0 --- /dev/null +++ b/apps/studio/src/lib/run-dedupe.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, it } from 'bun:test'; + +import { dedupeSyncedRuns } from './run-dedupe'; +import type { RunMeta } from './types'; + +function run(filename: string, source: RunMeta['source']): RunMeta { + return { + filename, + display_name: filename, + path: `/tmp/${filename}`, + timestamp: '2026-05-28T08:21:09.063Z', + test_count: 8, + pass_rate: 1, + avg_score: 1, + size_bytes: 1024, + source, + }; +} + +describe('dedupeSyncedRuns', () => { + it('collapses local and remote copies of the same run in all-runs views', () => { + const runs = [ + run('remote::2026-05-28T08-21-09-063Z', 'remote'), + run('2026-05-28T08-21-09-063Z', 'local'), + run('remote::2026-05-27T08-21-09-063Z', 'remote'), + ]; + + expect(dedupeSyncedRuns(runs).map((r) => r.filename)).toEqual([ + '2026-05-28T08-21-09-063Z', + 
'remote::2026-05-27T08-21-09-063Z', + ]); + }); +}); diff --git a/apps/studio/src/lib/run-dedupe.ts b/apps/studio/src/lib/run-dedupe.ts new file mode 100644 index 00000000..73ef7818 --- /dev/null +++ b/apps/studio/src/lib/run-dedupe.ts @@ -0,0 +1,23 @@ +import type { RunMeta } from './types'; + +const REMOTE_RUN_PREFIX = 'remote::'; + +function canonicalRunId(filename: string): string { + return filename.startsWith(REMOTE_RUN_PREFIX) + ? filename.slice(REMOTE_RUN_PREFIX.length) + : filename; +} + +export function dedupeSyncedRuns(runs: readonly RunMeta[]): RunMeta[] { + const byRunId = new Map(); + + for (const run of runs) { + const key = canonicalRunId(run.filename); + const existing = byRunId.get(key); + if (!existing || (existing.source === 'remote' && run.source === 'local')) { + byRunId.set(key, run); + } + } + + return [...byRunId.values()]; +} diff --git a/apps/studio/src/routes/index.tsx b/apps/studio/src/routes/index.tsx index 38603edc..0fe75565 100644 --- a/apps/studio/src/routes/index.tsx +++ b/apps/studio/src/routes/index.tsx @@ -33,6 +33,7 @@ import { resolveIndexRoute, resolveInitialProjectRedirect, } from '~/lib/navigation'; +import { dedupeSyncedRuns } from '~/lib/run-dedupe'; import type { RunMeta } from '~/lib/types'; type TabId = StudioTabId; @@ -234,7 +235,7 @@ function SingleProjectHome() { const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments'; const filteredRuns = sourceFilter === 'all' - ? (data?.runs ?? []) + ? dedupeSyncedRuns(data?.runs ?? []) : (data?.runs ?? 
[]).filter((run) => run.source === sourceFilter); async function handleSyncRemote() { diff --git a/apps/studio/src/routes/projects/$projectId.tsx b/apps/studio/src/routes/projects/$projectId.tsx index 62154143..f83b9379 100644 --- a/apps/studio/src/routes/projects/$projectId.tsx +++ b/apps/studio/src/routes/projects/$projectId.tsx @@ -22,6 +22,7 @@ import { useRemoteStatus, useStudioConfig, } from '~/lib/api'; +import { dedupeSyncedRuns } from '~/lib/run-dedupe'; type TabId = 'runs' | 'experiments' | 'analytics' | 'targets'; @@ -121,7 +122,7 @@ function ProjectRunsTab({ projectId }: { projectId: string }) { const filteredRuns = sourceFilter === 'all' - ? (data?.runs ?? []) + ? dedupeSyncedRuns(data?.runs ?? []) : (data?.runs ?? []).filter((run) => run.source === sourceFilter); async function handleSyncRemote() { diff --git a/packages/core/src/evaluation/results-repo.ts b/packages/core/src/evaluation/results-repo.ts index 9e544e77..deb69aa9 100644 --- a/packages/core/src/evaluation/results-repo.ts +++ b/packages/core/src/evaluation/results-repo.ts @@ -17,6 +17,8 @@ import { getAgentvHome } from '../paths.js'; import type { ResultsConfig } from './loaders/config-loader.js'; const execFileAsync = promisify(execFile); +const RESULTS_REPO_RESULTS_DIR = '.agentv/results'; +const RESULTS_REPO_RUNS_DIR = `${RESULTS_REPO_RESULTS_DIR}/runs`; export interface ResultsRepoLocalPaths { readonly rootDir: string; @@ -345,7 +347,7 @@ export async function stageResultsArtifacts(params: { export function resolveResultsRepoRunsDir(config: ResultsConfig): string { const normalized = normalizeResultsConfig(config); - return path.join(normalized.path, 'runs'); + return path.join(normalized.path, RESULTS_REPO_RESULTS_DIR, 'runs'); } export async function directorySizeBytes(targetPath: string): Promise { @@ -443,7 +445,12 @@ export async function directPushResults(params: { const baseBranch = await resolveDefaultBranch(repoDir); await fetchResultsRepo(repoDir); - const destinationDir = 
path.join(repoDir, 'runs', params.destinationPath); + const destinationDir = path.join( + repoDir, + RESULTS_REPO_RESULTS_DIR, + 'runs', + params.destinationPath, + ); await stageResultsArtifacts({ repoDir, sourceDir: params.sourceDir, @@ -655,9 +662,12 @@ function parseGitBatchBlobs(output: Buffer): GitBatchBlob[] { } export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise { - const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, 'runs'], { - cwd: repoDir, - }); + const { stdout: treeOut } = await runGit( + ['ls-tree', '-r', '--name-only', ref, RESULTS_REPO_RUNS_DIR], + { + cwd: repoDir, + }, + ); const benchmarkPaths = treeOut .split(/\r?\n/) @@ -679,7 +689,7 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise const benchmarkPath = benchmarkPaths[index]; const benchmark = JSON.parse(blob.content.toString('utf8')) as GitRunBenchmark; const runDir = path.posix.dirname(benchmarkPath); - const relativeRunPath = path.posix.relative('runs', runDir); + const relativeRunPath = path.posix.relative(RESULTS_REPO_RUNS_DIR, runDir); const runId = buildGitRunId(relativeRunPath); const timestamp = benchmark.metadata?.timestamp?.trim() || path.posix.basename(runDir); const targets = benchmark.metadata?.targets ?? 
[]; @@ -712,7 +722,7 @@ export async function materializeGitRun( ref = 'origin/main', ): Promise { const normalizedRunPath = relativeRunPath.split(path.sep).join('/'); - const runTreePath = path.posix.join('runs', normalizedRunPath); + const runTreePath = path.posix.join(RESULTS_REPO_RUNS_DIR, normalizedRunPath); const targetRunDir = path.join(repoDir, ...runTreePath.split('/')); const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, runTreePath], { cwd: repoDir, diff --git a/packages/core/test/evaluation/results-repo.test.ts b/packages/core/test/evaluation/results-repo.test.ts index a3507499..1e963eb3 100644 --- a/packages/core/test/evaluation/results-repo.test.ts +++ b/packages/core/test/evaluation/results-repo.test.ts @@ -98,7 +98,14 @@ describe('listGitRuns', () => { }); it('returns committed runs derived from benchmark.json blobs', async () => { - const defaultRunDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z'); + const defaultRunDir = path.join( + repoDir, + '.agentv', + 'results', + 'runs', + 'default', + '2026-05-20T10-00-00-000Z', + ); mkdirSync(defaultRunDir, { recursive: true }); writeFileSync( path.join(defaultRunDir, 'benchmark.json'), @@ -120,7 +127,14 @@ describe('listGitRuns', () => { ), ); - const experimentRunDir = path.join(repoDir, 'runs', 'with-skills', '2026-05-21T11-00-00-000Z'); + const experimentRunDir = path.join( + repoDir, + '.agentv', + 'results', + 'runs', + 'with-skills', + '2026-05-21T11-00-00-000Z', + ); mkdirSync(experimentRunDir, { recursive: true }); writeFileSync( path.join(experimentRunDir, 'benchmark.json'), @@ -146,7 +160,7 @@ describe('listGitRuns', () => { ), ); - git('git add runs && git commit -m "seed runs"', repoDir); + git('git add .agentv && git commit -m "seed runs"', repoDir); const runs = await listGitRuns(repoDir, 'HEAD'); @@ -159,8 +173,8 @@ describe('listGitRuns', () => { experiment: 'with-skills', timestamp: '2026-05-21T11:00:00.000Z', display_name: 
'2026-05-21T11-00-00-000Z', - manifest_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/index.jsonl', - benchmark_path: 'runs/with-skills/2026-05-21T11-00-00-000Z/benchmark.json', + manifest_path: '.agentv/results/runs/with-skills/2026-05-21T11-00-00-000Z/index.jsonl', + benchmark_path: '.agentv/results/runs/with-skills/2026-05-21T11-00-00-000Z/benchmark.json', test_count: 3, pass_rate: 0.75, avg_score: 0, @@ -169,7 +183,7 @@ describe('listGitRuns', () => { expect(runs[1]).toMatchObject({ experiment: 'default', target: 'gpt-4o', - manifest_path: 'runs/default/2026-05-20T10-00-00-000Z/index.jsonl', + manifest_path: '.agentv/results/runs/default/2026-05-20T10-00-00-000Z/index.jsonl', test_count: 2, pass_rate: 0.5, }); @@ -184,7 +198,14 @@ describe('listGitRuns', () => { }); it('ignores inherited git hook environment variables', async () => { - const runDir = path.join(repoDir, 'runs', 'default', '2026-05-20T10-00-00-000Z'); + const runDir = path.join( + repoDir, + '.agentv', + 'results', + 'runs', + 'default', + '2026-05-20T10-00-00-000Z', + ); mkdirSync(runDir, { recursive: true }); writeFileSync( path.join(runDir, 'benchmark.json'), @@ -205,7 +226,7 @@ describe('listGitRuns', () => { 2, ), ); - git('git add runs && git commit -m "seed run"', repoDir); + git('git add .agentv && git commit -m "seed run"', repoDir); const previousGitDir = process.env.GIT_DIR; const previousGitWorkTree = process.env.GIT_WORK_TREE; @@ -232,7 +253,14 @@ describe('listGitRuns', () => { }); it('materializes an entire run subtree atomically from git objects', async () => { - const runDir = path.join(repoDir, 'runs', 'with-files', '2026-05-22T10-00-00-000Z'); + const runDir = path.join( + repoDir, + '.agentv', + 'results', + 'runs', + 'with-files', + '2026-05-22T10-00-00-000Z', + ); mkdirSync(path.join(runDir, 'attachments'), { recursive: true }); writeFileSync(path.join(runDir, 'index.jsonl'), '{"test_id":"alpha"}\n'); writeFileSync( @@ -252,7 +280,7 @@ describe('listGitRuns', () => { }), 
); writeFileSync(path.join(runDir, 'attachments', 'response.md'), 'hello from git\n'); - git('git add runs && git commit -m "seed run with files"', repoDir); + git('git add .agentv && git commit -m "seed run with files"', repoDir); rmSync(runDir, { recursive: true, force: true }); @@ -299,6 +327,20 @@ describe('results repo write path', () => { expect(git(`git --git-dir "${remoteDir}" log -1 --pretty=%B main`, rootDir)).toContain( `Agentv-Run: with-skills::${runTimestamp}`, ); + expect( + readFileSync( + path.join( + cloneDir, + '.agentv', + 'results', + 'runs', + 'with-skills', + runTimestamp, + 'index.jsonl', + ), + 'utf8', + ), + ).toContain('"test_id":"alpha"'); const runs = await listGitRuns(cloneDir, 'HEAD'); expect(runs).toHaveLength(1); diff --git a/scripts/deploy-example-results-repos.example.yaml b/scripts/deploy-example-results-repos.example.yaml new file mode 100644 index 00000000..21bc097d --- /dev/null +++ b/scripts/deploy-example-results-repos.example.yaml @@ -0,0 +1,19 @@ +# Example config for scripts/deploy-example-results-repos.ts. +# +# Run: +# bun scripts/deploy-example-results-repos.ts scripts/deploy-example-results-repos.example.yaml --dry-run +# +# Replace the repository names before running without --dry-run. 
+defaults:
+  clone_root: ~/agentv-example-repos
+  results_clone_root: ~/data/agentv-results
+  visibility: private
+  auto_push: true
+  branch_prefix: eval-results
+  create_readmes: true
+
+repositories:
+  - eval_repo: EntityProcess/example-agentv-evals
+    eval_results_repo: EntityProcess/example-agentv-eval-results
+    eval_description: Example AgentV eval definitions
+    eval_results_description: Shared AgentV eval results
diff --git a/scripts/deploy-example-results-repos.ts b/scripts/deploy-example-results-repos.ts
new file mode 100644
index 00000000..2eec8a02
--- /dev/null
+++ b/scripts/deploy-example-results-repos.ts
@@ -0,0 +1,625 @@
+#!/usr/bin/env bun
+/**
+ * Create example eval repositories and wire each one to a dedicated AgentV
+ * eval-results repository.
+ *
+ * Usage:
+ *   bun scripts/deploy-example-results-repos.ts scripts/deploy-example-results-repos.example.yaml
+ *   bun scripts/deploy-example-results-repos.ts config.yaml --dry-run
+ */
+
+import { execFile } from 'node:child_process';
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import { promisify } from 'node:util';
+
+import { stringify as stringifyYaml } from 'yaml';
+
+import { parseYamlValue } from '../packages/core/src/evaluation/yaml-loader.js';
+
+const execFileAsync = promisify(execFile);
+
+type Visibility = 'private' | 'public' | 'internal';
+
+/** Settings shared by every repository pair unless the pair overrides them. */
+interface DeploymentDefaults {
+  readonly clone_root: string;
+  readonly results_clone_root: string;
+  readonly visibility: Visibility;
+  readonly auto_push: boolean;
+  readonly branch_prefix?: string;
+  readonly create_readmes: boolean;
+}
+
+/** One eval repository together with the results repository it pushes to. */
+interface RepoPairConfig {
+  readonly eval_repo: string;
+  readonly eval_results_repo: string;
+  readonly eval_description?: string;
+  readonly eval_results_description?: string;
+  readonly visibility?: Visibility;
+  readonly results_path?: string;
+  readonly auto_push?: boolean;
+  readonly branch_prefix?: string;
+}
+
+/** Validated shape of the deployment YAML file. */
+interface DeploymentConfig {
+  readonly defaults: DeploymentDefaults;
+  readonly repositories: readonly RepoPairConfig[];
+}
+
+/** Parsed command-line flags. */
+interface CliOptions {
+  readonly configPath: string;
+  readonly dryRun: boolean;
+  readonly push: boolean;
+}
+
+const DEFAULTS: DeploymentDefaults = {
+  clone_root: '~/agentv-example-repos',
+  results_clone_root: '~/data/agentv-results',
+  visibility: 'private',
+  auto_push: true,
+  branch_prefix: 'eval-results',
+  create_readmes: true,
+};
+
+/** Print CLI usage and the expected config shape to stdout. */
+function printHelp(): void {
+  console.log(`Usage: bun scripts/deploy-example-results-repos.ts <config.yaml> [--dry-run] [--no-push]
+
+Creates or verifies GitHub eval/eval-results repo pairs, then commits AgentV
+remote-results configuration into each eval repo.
+
+Config shape:
+  defaults:
+    clone_root: ~/agentv-example-repos
+    results_clone_root: ~/data/agentv-results
+    visibility: private
+    auto_push: true
+  repositories:
+    - eval_repo: Owner/example-evals
+      eval_results_repo: Owner/example-eval-results
+`);
+}
+
+/**
+ * Parse CLI arguments: one positional config path plus optional flags.
+ * `--help` / `-h` prints usage and exits the process with code 0.
+ *
+ * @throws Error on an unknown option, an extra positional, or a missing config path.
+ */
+function parseArgs(argv: readonly string[]): CliOptions {
+  let configPath = '';
+  let dryRun = false;
+  let push = true;
+
+  for (const arg of argv) {
+    if (arg === '--help' || arg === '-h') {
+      printHelp();
+      process.exit(0);
+    }
+    if (arg === '--dry-run') {
+      dryRun = true;
+      continue;
+    }
+    if (arg === '--no-push') {
+      push = false;
+      continue;
+    }
+    if (arg.startsWith('-')) {
+      throw new Error(`Unknown option: ${arg}`);
+    }
+    if (configPath) {
+      throw new Error(`Unexpected extra argument: ${arg}`);
+    }
+    configPath = arg;
+  }
+
+  if (!configPath) {
+    throw new Error('Missing config path. Pass --help for usage.');
+  }
+
+  return { configPath, dryRun, push };
+}
+
+/** Type guard for a plain (non-null, non-array) object. */
+function isObject(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
+
+/** Expand a leading `~` or `~/` to the current user's home directory. */
+function expandHome(value: string): string {
+  if (value === '~' || value.startsWith('~/')) {
+    return path.join(os.homedir(), value.slice(1));
+  }
+  return value;
+}
+
+/** Lowercase the repo name and replace runs of characters outside `[A-Za-z0-9._-]` with `-`. */
+function repoSlug(repo: string): string {
+  return repo
+    .trim()
+    .replace(/[^A-Za-z0-9._-]+/g, '-')
+    .toLowerCase();
+}
+
+/** @throws Error when `value` is not of the form `owner/name`. */
+function assertRepoName(value: string, field: string): void {
+  if (!/^[A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+$/.test(value)) {
+    throw new Error(`${field} must be GitHub owner/name, got: ${value}`);
+  }
+}
+
+/** Validate a visibility value, using `fallback` when it is undefined. */
+function parseVisibility(value: unknown, fallback: Visibility): Visibility {
+  if (value === undefined) return fallback;
+  if (value === 'private' || value === 'public' || value === 'internal') return value;
+  throw new Error(`visibility must be private, public, or internal; got: ${String(value)}`);
+}
+
+/** Validate a boolean value, using `fallback` when it is undefined. */
+function parseBoolean(value: unknown, fallback: boolean, field: string): boolean {
+  if (value === undefined) return fallback;
+  if (typeof value === 'boolean') return value;
+  throw new Error(`${field} must be a boolean`);
+}
+
+/** Validate a non-blank string, using `fallback` when it is undefined. */
+function parseString(value: unknown, fallback: string, field: string): string {
+  if (value === undefined) return fallback;
+  if (typeof value === 'string' && value.trim().length > 0) return value;
+  throw new Error(`${field} must be a non-empty string`);
+}
+
+/** Validate an optional non-blank string; undefined passes through. */
+function optionalString(value: unknown, field: string): string | undefined {
+  if (value === undefined) return undefined;
+  if (typeof value === 'string' && value.trim().length > 0) return value;
+  throw new Error(`${field} must be a non-empty string when provided`);
+}
+
+/**
+ * Read and validate the deployment YAML file.
+ *
+ * @throws Error when the file is not a mapping, a field has the wrong type, or
+ *   `repositories` is missing or empty.
+ */
+function loadDeploymentConfig(configPath: string): DeploymentConfig {
+  const raw = readFileSync(configPath, 'utf8');
+  const parsed = parseYamlValue(raw);
+  if (!isObject(parsed)) {
+    throw new Error(`${configPath} must contain a YAML mapping`);
+  }
+
+  const rawDefaults = isObject(parsed.defaults) ? parsed.defaults : {};
+  const defaults: DeploymentDefaults = {
+    clone_root: parseString(rawDefaults.clone_root, DEFAULTS.clone_root, 'defaults.clone_root'),
+    results_clone_root: parseString(
+      rawDefaults.results_clone_root,
+      DEFAULTS.results_clone_root,
+      'defaults.results_clone_root',
+    ),
+    visibility: parseVisibility(rawDefaults.visibility, DEFAULTS.visibility),
+    auto_push: parseBoolean(rawDefaults.auto_push, DEFAULTS.auto_push, 'defaults.auto_push'),
+    // NOTE(review): DEFAULTS.branch_prefix is never used as a fallback here — confirm intent.
+    branch_prefix: optionalString(rawDefaults.branch_prefix, 'defaults.branch_prefix'),
+    create_readmes: parseBoolean(
+      rawDefaults.create_readmes,
+      DEFAULTS.create_readmes,
+      'defaults.create_readmes',
+    ),
+  };
+
+  if (!Array.isArray(parsed.repositories) || parsed.repositories.length === 0) {
+    throw new Error('repositories must be a non-empty array');
+  }
+
+  const repositories = parsed.repositories.map((entry, index): RepoPairConfig => {
+    if (!isObject(entry)) {
+      throw new Error(`repositories[${index}] must be a mapping`);
+    }
+    const evalRepo = parseString(entry.eval_repo, '', `repositories[${index}].eval_repo`);
+    const evalResultsRepo = parseString(
+      entry.eval_results_repo,
+      '',
+      `repositories[${index}].eval_results_repo`,
+    );
+    assertRepoName(evalRepo, `repositories[${index}].eval_repo`);
+    assertRepoName(evalResultsRepo, `repositories[${index}].eval_results_repo`);
+
+    return {
+      eval_repo: evalRepo,
+      eval_results_repo: evalResultsRepo,
+      eval_description: optionalString(
+        entry.eval_description,
+        `repositories[${index}].eval_description`,
+      ),
+      eval_results_description: optionalString(
+        entry.eval_results_description,
+        `repositories[${index}].eval_results_description`,
+      ),
+      visibility: parseVisibility(entry.visibility, defaults.visibility),
+      results_path: optionalString(entry.results_path, `repositories[${index}].results_path`),
+      auto_push: parseBoolean(
+        entry.auto_push,
+        defaults.auto_push,
+        `repositories[${index}].auto_push`,
+      ),
+      branch_prefix: optionalString(entry.branch_prefix, `repositories[${index}].branch_prefix`),
+    };
+  });
+
+  return { defaults, repositories };
+}
+
+/**
+ * Run a command and capture its output.
+ *
+ * With `dryRun` the command is only printed. With `check: false` a failing
+ * command resolves with whatever stdout/stderr it produced instead of throwing.
+ */
+async function run(
+  command: string,
+  args: readonly string[],
+  options: { readonly cwd?: string; readonly dryRun?: boolean; readonly check?: boolean } = {},
+): Promise<{ stdout: string; stderr: string }> {
+  const printable = [command, ...args].join(' ');
+  if (options.dryRun) {
+    console.log(`[dry-run] ${options.cwd ? `(${options.cwd}) ` : ''}${printable}`);
+    return { stdout: '', stderr: '' };
+  }
+
+  try {
+    return await execFileAsync(command, [...args], {
+      cwd: options.cwd,
+      env: process.env,
+      maxBuffer: 10 * 1024 * 1024,
+    });
+  } catch (error) {
+    if (options.check === false && error && typeof error === 'object') {
+      const execError = error as { stdout?: string; stderr?: string };
+      return {
+        stdout: execError.stdout ?? '',
+        stderr: execError.stderr ?? '',
+      };
+    }
+    throw error;
+  }
+}
+
+/** Report whether `gh repo view` returns any output for the repository. */
+async function repoExists(repo: string): Promise<boolean> {
+  const result = await run('gh', ['repo', 'view', repo, '--json', 'nameWithOwner'], {
+    check: false,
+  });
+  return result.stdout.trim().length > 0;
+}
+
+/**
+ * Create the GitHub repository unless it already exists. In dry-run mode the
+ * existence check is skipped and the create command is only printed.
+ */
+async function ensureGitHubRepo(params: {
+  readonly repo: string;
+  readonly description: string;
+  readonly visibility: Visibility;
+  readonly dryRun: boolean;
+}): Promise<void> {
+  if (!params.dryRun && (await repoExists(params.repo))) {
+    console.log(`Repo exists: ${params.repo}`);
+    return;
+  }
+
+  const visibilityFlag = `--${params.visibility}`;
+  await run(
+    'gh',
+    [
+      'repo',
+      'create',
+      params.repo,
+      visibilityFlag,
+      '--description',
+      params.description,
+      '--clone=false',
+    ],
+    { dryRun: params.dryRun },
+  );
+  // run() already printed the command in dry-run mode; nothing was created.
+  if (!params.dryRun) {
+    console.log(`Created repo: ${params.repo}`);
+  }
+}
+
+/**
+ * Clone the repository under `cloneRoot`, or fetch and fast-forward an existing
+ * clone, and return the local directory.
+ *
+ * @throws Error when the target path exists but has no `.git` entry.
+ */
+async function cloneOrUpdateRepo(params: {
+  readonly repo: string;
+  readonly cloneRoot: string;
+  readonly dryRun: boolean;
+}): Promise<string> {
+  const repoDir = path.join(expandHome(params.cloneRoot), repoSlug(params.repo));
+  if (!existsSync(repoDir)) {
+    // A dry run must not create directories.
+    if (!params.dryRun) {
+      mkdirSync(path.dirname(repoDir), { recursive: true });
+    }
+    await run('gh', ['repo', 'clone', params.repo, repoDir], { dryRun: params.dryRun });
+    return repoDir;
+  }
+
+  if (!existsSync(path.join(repoDir, '.git'))) {
+    throw new Error(`Clone path exists but is not a git repo: ${repoDir}`);
+  }
+
+  await run('git', ['fetch', 'origin', '--prune'], { cwd: repoDir, dryRun: params.dryRun });
+  // check: false — a failed fast-forward is ignored rather than aborting the deploy.
+  await run('git', ['pull', '--ff-only'], { cwd: repoDir, dryRun: params.dryRun, check: false });
+  return repoDir;
+}
+
+/** @throws Error when the checkout has uncommitted changes (skipped in dry-run). */
+async function assertClean(repoDir: string, dryRun: boolean): Promise<void> {
+  if (dryRun) return;
+  const { stdout } = await run('git', ['status', '--porcelain'], { cwd: repoDir });
+  if (stdout.trim().length > 0) {
+    throw new Error(`Refusing to edit dirty checkout: ${repoDir}`);
+  }
+}
+
+/**
+ * Write `content` to `filePath` unless the file already holds exactly that
+ * content. Returns true when a write happened (or would happen in dry-run).
+ */
+function writeFileIfChanged(filePath: string, content: string, dryRun: boolean): boolean {
+  const current = existsSync(filePath) ? readFileSync(filePath, 'utf8') : undefined;
+  if (current === content) {
+    return false;
+  }
+
+  if (dryRun) {
+    console.log(`[dry-run] write ${filePath}`);
+    return true;
+  }
+
+  mkdirSync(path.dirname(filePath), { recursive: true });
+  writeFileSync(filePath, content, 'utf8');
+  return true;
+}
+
+/**
+ * Stage all changes, commit them with `message`, and push the current branch to
+ * `origin` when `push` is set. Outside dry-run, a clean tree is a no-op.
+ */
+async function commitAndMaybePush(params: {
+  readonly repoDir: string;
+  readonly message: string;
+  readonly push: boolean;
+  readonly dryRun: boolean;
+}): Promise<void> {
+  await run('git', ['add', '--all'], { cwd: params.repoDir, dryRun: params.dryRun });
+  const { stdout } = await run('git', ['status', '--porcelain'], {
+    cwd: params.repoDir,
+    dryRun: params.dryRun,
+  });
+  if (!params.dryRun && stdout.trim().length === 0) {
+    console.log(`No changes in ${params.repoDir}`);
+    return;
+  }
+
+  let branch = 'main';
+  if (!params.dryRun) {
+    const branchResult = await run('git', ['branch', '--show-current'], { cwd: params.repoDir });
+    branch = branchResult.stdout.trim() || 'main';
+    if (!branchResult.stdout.trim()) {
+      await run('git', ['checkout', '-B', branch], { cwd: params.repoDir });
+    }
+  }
+
+  await run('git', ['commit', '-m', params.message], {
+    cwd: params.repoDir,
+    dryRun: params.dryRun,
+  });
+
+  if (params.push) {
+    await run('git', ['push', '-u', 'origin', branch], {
+      cwd: params.repoDir,
+      dryRun: params.dryRun,
+    });
+  }
+}
+
+/**
+ * Load a YAML file as a mapping. A missing file, or null/undefined content, yields `{}`.
+ *
+ * @throws Error when the file holds something other than a mapping.
+ */
+function readYamlObject(filePath: string): Record<string, unknown> {
+  if (!existsSync(filePath)) {
+    return {};
+  }
+  const parsed = parseYamlValue(readFileSync(filePath, 'utf8'));
+  if (parsed === null || parsed === undefined) {
+    return {};
+  }
+  if (!isObject(parsed)) {
+    throw new Error(`${filePath} must contain a YAML mapping`);
+  }
+  return parsed;
+}
+
+/**
+ * Write the README (optional) and the `results` section of
+ * `.agentv/config.yaml` in the eval repo. Returns true when any file changed.
+ */
+function configureEvalRepo(params: {
+  readonly repoDir: string;
+  readonly evalRepo: string;
+  readonly resultsRepo: string;
+  readonly resultsPath: string;
+  readonly autoPush: boolean;
+  readonly branchPrefix?: string;
+  readonly createReadmes: boolean;
+  readonly dryRun: boolean;
+}): boolean {
+  let changed = false;
+
+  if (params.createReadmes) {
+    const readme = `# ${params.evalRepo}
+
+Example AgentV eval repository.
+
+This repo is configured to push evaluation results to \`${params.resultsRepo}\`.
+`;
+    changed =
+      writeFileIfChanged(path.join(params.repoDir, 'README.md'), readme, params.dryRun) || changed;
+  }
+
+  const configPath = path.join(params.repoDir, '.agentv', 'config.yaml');
+  const config = readYamlObject(configPath);
+  config.results = {
+    mode: 'github',
+    repo: params.resultsRepo,
+    path: params.resultsPath,
+    auto_push: params.autoPush,
+    ...(params.branchPrefix ? { branch_prefix: params.branchPrefix } : {}),
+  };
+
+  const nextConfig = stringifyYaml(config, { lineWidth: 100 });
+  changed = writeFileIfChanged(configPath, nextConfig, params.dryRun) || changed;
+
+  return changed;
+}
+
+/**
+ * Write the README (optional) and the `.agentv/results/runs/.gitkeep`
+ * placeholder in the results repo. Returns true when any file changed.
+ */
+function initializeResultsRepo(params: {
+  readonly repoDir: string;
+  readonly resultsRepo: string;
+  readonly createReadmes: boolean;
+  readonly dryRun: boolean;
+}): boolean {
+  let changed = false;
+
+  if (params.createReadmes) {
+    const readme = `# ${params.resultsRepo}
+
+Shared AgentV evaluation results repository.
+
+AgentV stores run artifacts under \`.agentv/results/runs/\` and syncs them through normal git fetch/push operations.
+`;
+    changed =
+      writeFileIfChanged(path.join(params.repoDir, 'README.md'), readme, params.dryRun) || changed;
+  }
+
+  changed =
+    writeFileIfChanged(
+      path.join(params.repoDir, '.agentv', 'results', 'runs', '.gitkeep'),
+      '',
+      params.dryRun,
+    ) || changed;
+
+  return changed;
+}
+
+/**
+ * Provision one eval/results repository pair: ensure both GitHub repos exist,
+ * clone or update them, seed the results repo, then commit the remote-results
+ * configuration into the eval repo.
+ */
+async function deployPair(
+  defaults: DeploymentDefaults,
+  pair: RepoPairConfig,
+  options: Pick<CliOptions, 'dryRun' | 'push'>,
+): Promise<void> {
+  const visibility = pair.visibility ?? defaults.visibility;
+  const autoPush = pair.auto_push ?? defaults.auto_push;
+  const branchPrefix = pair.branch_prefix ?? defaults.branch_prefix;
+  const resultsPath =
+    pair.results_path ?? path.join(defaults.results_clone_root, repoSlug(pair.eval_results_repo));
+
+  console.log(`\n== ${pair.eval_repo} -> ${pair.eval_results_repo} ==`);
+
+  await ensureGitHubRepo({
+    repo: pair.eval_repo,
+    description: pair.eval_description ?? 'Example AgentV eval definitions',
+    visibility,
+    dryRun: options.dryRun,
+  });
+  await ensureGitHubRepo({
+    repo: pair.eval_results_repo,
+    description: pair.eval_results_description ?? 'Shared AgentV eval results',
+    visibility,
+    dryRun: options.dryRun,
+  });
+
+  const evalRepoDir = await cloneOrUpdateRepo({
+    repo: pair.eval_repo,
+    cloneRoot: defaults.clone_root,
+    dryRun: options.dryRun,
+  });
+  const resultsRepoDir = await cloneOrUpdateRepo({
+    repo: pair.eval_results_repo,
+    cloneRoot: defaults.clone_root,
+    dryRun: options.dryRun,
+  });
+
+  await assertClean(evalRepoDir, options.dryRun);
+  await assertClean(resultsRepoDir, options.dryRun);
+
+  initializeResultsRepo({
+    repoDir: resultsRepoDir,
+    resultsRepo: pair.eval_results_repo,
+    createReadmes: defaults.create_readmes,
+    dryRun: options.dryRun,
+  });
+  await commitAndMaybePush({
+    repoDir: resultsRepoDir,
+    message: 'chore: initialize AgentV results repo',
+    push: options.push,
+    dryRun: options.dryRun,
+  });
+
+  configureEvalRepo({
+    repoDir: evalRepoDir,
+    evalRepo: pair.eval_repo,
+    resultsRepo: pair.eval_results_repo,
+    resultsPath,
+    autoPush,
+    branchPrefix,
+    createReadmes: defaults.create_readmes,
+    dryRun: options.dryRun,
+  });
+  await commitAndMaybePush({
+    repoDir: evalRepoDir,
+    message: 'chore: configure AgentV remote results',
+    push: options.push,
+    dryRun: options.dryRun,
+  });
+}
+
+/** Entry point: parse flags, load the config, and deploy each pair in order. */
+async function main(): Promise<void> {
+  const options = parseArgs(process.argv.slice(2));
+  const configPath = path.resolve(options.configPath);
+  const config = loadDeploymentConfig(configPath);
+
+  for (const pair of config.repositories) {
+    await deployPair(config.defaults, pair, options);
+  }
+
+  console.log('\nDeployment complete.');
+}
+
+main().catch((error: unknown) => {
+  console.error(error instanceof Error ? error.message : error);
+  process.exit(1);
+});