Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion apps/cli/src/commands/results/remote.ts
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,10 @@ export async function ensureRemoteRunAvailable(
throw new Error(`Remote manifest path is outside the results repo clone: ${meta.path}`);
}

const relativeRunPath = path.posix.relative('runs', path.posix.dirname(relativeManifestPath));
const relativeRunPath = path.posix.relative(
'.agentv/results/runs',
path.posix.dirname(relativeManifestPath),
);
await materializeGitRun(config.path, relativeRunPath);
}

Expand Down
4 changes: 3 additions & 1 deletion apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -376,12 +376,14 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
let target: string | undefined;
let experiment = inferExperimentFromRunId(m.raw_filename);
let passRate = m.passRate;
let avgScore = m.avgScore;
try {
const records = await loadLightweightResultsForMeta(searchDir, m);
if (records.length > 0) {
target = records[0].target;
experiment = records[0].experiment ?? experiment;
passRate = records.filter((r) => r.score >= passThreshold).length / records.length;
avgScore = records.reduce((sum, r) => sum + r.score, 0) / records.length;
} else {
// Run is in-progress with 0 results written yet — fall back to the
// in-memory target stored when the Studio launched this run.
Expand All @@ -402,7 +404,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
timestamp: m.timestamp,
test_count: m.testCount,
pass_rate: passRate,
avg_score: m.avgScore,
avg_score: avgScore,
size_bytes: m.sizeBytes,
source: m.source,
...(target && { target }),
Expand Down
15 changes: 13 additions & 2 deletions apps/cli/test/commands/results/serve.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ function writeRemoteRunArtifact(
/^(\d{4}-\d{2}-\d{2})T(\d{2})-(\d{2})-(\d{2})-(\d{3})Z$/,
'$1T$2:$3:$4.$5Z',
);
const runDir = path.join(cloneDir, 'runs', experiment, timestamp);
const runDir = path.join(cloneDir, '.agentv', 'results', 'runs', experiment, timestamp);
mkdirSync(runDir, { recursive: true });
writeFileSync(path.join(runDir, 'index.jsonl'), toJsonl(resultRecord));
writeFileSync(
Expand Down Expand Up @@ -653,6 +653,8 @@ describe('serve app', () => {
process.env.AGENTV_HOME,
'results',
'EntityProcess-agentv-evals',
'.agentv',
'results',
'runs',
'default',
'2026-03-26T10-00-00-000Z',
Expand Down Expand Up @@ -705,14 +707,21 @@ describe('serve app', () => {
const listRes = await app.request('/api/runs');
expect(listRes.status).toBe(200);
const listData = (await listRes.json()) as {
runs: Array<{ filename: string; source: string; experiment?: string; pass_rate?: number }>;
runs: Array<{
filename: string;
source: string;
experiment?: string;
pass_rate?: number;
avg_score?: number;
}>;
};
expect(listData.runs).toHaveLength(1);
expect(listData.runs[0]).toMatchObject({
filename: `remote::${runId}`,
source: 'remote',
experiment: 'green-uat',
pass_rate: 1,
avg_score: 1,
});

const detailRes = await app.request(
Expand Down Expand Up @@ -749,6 +758,8 @@ describe('serve app', () => {

const runManifestPath = path.join(
cloneDir,
'.agentv',
'results',
'runs',
'external-sync',
'2026-03-26T11-00-00-000Z',
Expand Down
7 changes: 4 additions & 3 deletions apps/studio/src/components/ExperimentDetail.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
projectRunListOptions,
runListOptions,
} from '~/lib/api';
import { dedupeSyncedRuns } from '~/lib/run-dedupe';

import { RunList } from './RunList';

Expand Down Expand Up @@ -45,12 +46,12 @@ export function ExperimentDetail({ experimentName, projectId }: ExperimentDetail
}

const experiment = experimentsData?.experiments?.find((entry) => entry.name === experimentName);
const runs = (runListData?.runs ?? []).filter(
(run) => (run.experiment ?? 'default') === experimentName,
const runs = dedupeSyncedRuns(
(runListData?.runs ?? []).filter((run) => (run.experiment ?? 'default') === experimentName),
);

const passRate = experiment?.pass_rate ?? 0;
const runCount = experiment?.run_count ?? runs.length;
const runCount = runs.length;
const targetCount = experiment?.target_count ?? 0;

return (
Expand Down
3 changes: 2 additions & 1 deletion apps/studio/src/components/TargetsTab.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
runListOptions,
targetsOptions,
} from '~/lib/api';
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
import type { RunMeta, TargetsResponse } from '~/lib/types';

import { PassRatePill } from './PassRatePill';
Expand Down Expand Up @@ -68,7 +69,7 @@ export function TargetsTab({ projectId }: TargetsTabProps = {}) {
}

return [...groups.entries()]
.map(([name, experimentRuns]) => buildExperimentGroup(name, experimentRuns))
.map(([name, experimentRuns]) => buildExperimentGroup(name, dedupeSyncedRuns(experimentRuns)))
.sort((a, b) => {
if (a.latestTimestamp && b.latestTimestamp && a.latestTimestamp !== b.latestTimestamp) {
return b.latestTimestamp.localeCompare(a.latestTimestamp);
Expand Down
33 changes: 33 additions & 0 deletions apps/studio/src/lib/run-dedupe.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { describe, expect, it } from 'bun:test';

import { dedupeSyncedRuns } from './run-dedupe';
import type { RunMeta } from './types';

/**
 * Test fixture factory: builds a `RunMeta` whose `filename`, `display_name`
 * and `path` are derived from `filename`, tagged with the given `source`.
 * Every other field is a fixed placeholder so the dedupe test only varies
 * the two fields it cares about.
 */
function run(filename: string, source: RunMeta['source']): RunMeta {
  const fixture: RunMeta = {
    filename,
    display_name: filename,
    path: `/tmp/${filename}`,
    timestamp: '2026-05-28T08:21:09.063Z',
    test_count: 8,
    pass_rate: 1,
    avg_score: 1,
    size_bytes: 1024,
    source,
  };
  return fixture;
}

describe('dedupeSyncedRuns', () => {
  it('collapses local and remote copies of the same run in all-runs views', () => {
    // The same run seen twice: first as the remote copy, then as the local one.
    const syncedRemoteCopy = run('remote::2026-05-28T08-21-09-063Z', 'remote');
    const syncedLocalCopy = run('2026-05-28T08-21-09-063Z', 'local');
    // A run that only exists remotely and must survive untouched.
    const remoteOnly = run('remote::2026-05-27T08-21-09-063Z', 'remote');

    const keptFilenames = dedupeSyncedRuns([syncedRemoteCopy, syncedLocalCopy, remoteOnly]).map(
      (entry) => entry.filename,
    );

    // The local copy wins and takes the slot where the run was first seen.
    expect(keptFilenames).toEqual([
      '2026-05-28T08-21-09-063Z',
      'remote::2026-05-27T08-21-09-063Z',
    ]);
  });
});
23 changes: 23 additions & 0 deletions apps/studio/src/lib/run-dedupe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import type { RunMeta } from './types';

/** Filename prefix that marks a run entry as the remote copy (`remote::<run id>`). */
const REMOTE_RUN_PREFIX = 'remote::';

/**
 * Returns the run id shared by the local and remote copies of a run.
 *
 * A filename that starts with the remote prefix has that single leading
 * prefix removed; any other filename is returned unchanged.
 */
function canonicalRunId(filename: string): string {
  if (!filename.startsWith(REMOTE_RUN_PREFIX)) {
    return filename;
  }
  return filename.slice(REMOTE_RUN_PREFIX.length);
}

/**
 * Collapses entries that refer to the same run into a single entry.
 *
 * Runs are grouped by their canonical id (the filename without the remote
 * prefix). The first entry seen for an id is kept, except that a `local`
 * entry replaces a previously kept `remote` one. The result is a new array
 * ordered by where each id was first seen; the input is not modified.
 *
 * @param runs - Run list that may contain both copies of a synced run.
 * @returns One entry per canonical run id.
 */
export function dedupeSyncedRuns(runs: readonly RunMeta[]): RunMeta[] {
  const preferredById = runs.reduce((kept, candidate) => {
    const runId = canonicalRunId(candidate.filename);
    const current = kept.get(runId);
    const localReplacesRemote = current?.source === 'remote' && candidate.source === 'local';

    if (!current || localReplacesRemote) {
      // Re-setting an existing key keeps its original position in the Map,
      // so a replacing local copy stays where the run was first seen.
      kept.set(runId, candidate);
    }
    return kept;
  }, new Map<string, RunMeta>());

  return Array.from(preferredById.values());
}
3 changes: 2 additions & 1 deletion apps/studio/src/routes/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
resolveIndexRoute,
resolveInitialProjectRedirect,
} from '~/lib/navigation';
import { dedupeSyncedRuns } from '~/lib/run-dedupe';
import type { RunMeta } from '~/lib/types';
type TabId = StudioTabId;

Expand Down Expand Up @@ -234,7 +235,7 @@ function SingleProjectHome() {
const activeTab: TabId = tabs.some((t) => t.id === tab) ? (tab as TabId) : 'experiments';
const filteredRuns =
sourceFilter === 'all'
? (data?.runs ?? [])
? dedupeSyncedRuns(data?.runs ?? [])
: (data?.runs ?? []).filter((run) => run.source === sourceFilter);

async function handleSyncRemote() {
Expand Down
3 changes: 2 additions & 1 deletion apps/studio/src/routes/projects/$projectId.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import {
useRemoteStatus,
useStudioConfig,
} from '~/lib/api';
import { dedupeSyncedRuns } from '~/lib/run-dedupe';

type TabId = 'runs' | 'experiments' | 'analytics' | 'targets';

Expand Down Expand Up @@ -121,7 +122,7 @@ function ProjectRunsTab({ projectId }: { projectId: string }) {

const filteredRuns =
sourceFilter === 'all'
? (data?.runs ?? [])
? dedupeSyncedRuns(data?.runs ?? [])
: (data?.runs ?? []).filter((run) => run.source === sourceFilter);

async function handleSyncRemote() {
Expand Down
24 changes: 17 additions & 7 deletions packages/core/src/evaluation/results-repo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import { getAgentvHome } from '../paths.js';
import type { ResultsConfig } from './loaders/config-loader.js';

const execFileAsync = promisify(execFile);
const RESULTS_REPO_RESULTS_DIR = '.agentv/results';
const RESULTS_REPO_RUNS_DIR = `${RESULTS_REPO_RESULTS_DIR}/runs`;

export interface ResultsRepoLocalPaths {
readonly rootDir: string;
Expand Down Expand Up @@ -345,7 +347,7 @@ export async function stageResultsArtifacts(params: {

/**
 * Resolves the directory that holds run artifacts for a results-repo config.
 *
 * The config is normalized first, then the runs directory is located under
 * the `.agentv/results` subtree of `normalized.path` (presumably the local
 * clone root; confirm against `normalizeResultsConfig`). This matches the
 * `.agentv/results/runs` layout used by the other results-repo helpers in
 * this file.
 *
 * @param config - Results configuration to resolve against.
 * @returns Path of the `runs` directory inside the results subtree.
 */
export function resolveResultsRepoRunsDir(config: ResultsConfig): string {
  const normalized = normalizeResultsConfig(config);
  // Only one return: the earlier `path.join(normalized.path, 'runs')` pointed
  // at the old top-level layout and made this statement unreachable.
  return path.join(normalized.path, RESULTS_REPO_RESULTS_DIR, 'runs');
}

export async function directorySizeBytes(targetPath: string): Promise<number> {
Expand Down Expand Up @@ -443,7 +445,12 @@ export async function directPushResults(params: {
const baseBranch = await resolveDefaultBranch(repoDir);
await fetchResultsRepo(repoDir);

const destinationDir = path.join(repoDir, 'runs', params.destinationPath);
const destinationDir = path.join(
repoDir,
RESULTS_REPO_RESULTS_DIR,
'runs',
params.destinationPath,
);
await stageResultsArtifacts({
repoDir,
sourceDir: params.sourceDir,
Expand Down Expand Up @@ -655,9 +662,12 @@ function parseGitBatchBlobs(output: Buffer): GitBatchBlob[] {
}

export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise<GitListedRun[]> {
const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, 'runs'], {
cwd: repoDir,
});
const { stdout: treeOut } = await runGit(
['ls-tree', '-r', '--name-only', ref, RESULTS_REPO_RUNS_DIR],
{
cwd: repoDir,
},
);

const benchmarkPaths = treeOut
.split(/\r?\n/)
Expand All @@ -679,7 +689,7 @@ export async function listGitRuns(repoDir: string, ref = 'origin/main'): Promise
const benchmarkPath = benchmarkPaths[index];
const benchmark = JSON.parse(blob.content.toString('utf8')) as GitRunBenchmark;
const runDir = path.posix.dirname(benchmarkPath);
const relativeRunPath = path.posix.relative('runs', runDir);
const relativeRunPath = path.posix.relative(RESULTS_REPO_RUNS_DIR, runDir);
const runId = buildGitRunId(relativeRunPath);
const timestamp = benchmark.metadata?.timestamp?.trim() || path.posix.basename(runDir);
const targets = benchmark.metadata?.targets ?? [];
Expand Down Expand Up @@ -712,7 +722,7 @@ export async function materializeGitRun(
ref = 'origin/main',
): Promise<void> {
const normalizedRunPath = relativeRunPath.split(path.sep).join('/');
const runTreePath = path.posix.join('runs', normalizedRunPath);
const runTreePath = path.posix.join(RESULTS_REPO_RUNS_DIR, normalizedRunPath);
const targetRunDir = path.join(repoDir, ...runTreePath.split('/'));
const { stdout: treeOut } = await runGit(['ls-tree', '-r', '--name-only', ref, runTreePath], {
cwd: repoDir,
Expand Down
Loading
Loading