diff --git a/.changeset/all-goats-double.md b/.changeset/all-goats-double.md new file mode 100644 index 000000000..02d5bd4f4 --- /dev/null +++ b/.changeset/all-goats-double.md @@ -0,0 +1,5 @@ +--- +"braintrust": minor +--- + +feat: Add reporter for `vitest-evals` diff --git a/e2e/config/pr-comment-scenarios.json b/e2e/config/pr-comment-scenarios.json index ee98b2fba..6184821ba 100644 --- a/e2e/config/pr-comment-scenarios.json +++ b/e2e/config/pr-comment-scenarios.json @@ -196,6 +196,28 @@ { "variantKey": "github-copilot-v0-auto", "label": "Auto-hook" } ] }, + { + "scenarioDirName": "test-framework-evals-vitest", + "label": "Vitest Test Framework Evals", + "metadataScenario": "test-framework-evals-vitest", + "variants": [ + { "variantKey": "v2", "label": "v2" }, + { "variantKey": "v3", "label": "v3" }, + { "variantKey": "v4.1", "label": "v4.1" }, + { + "variantKey": "vitest-evals-reporter", + "label": "vitest-evals reporter" + } + ], + "evals": [ + { + "entry": "scenario.vitest-evals-reporter.ts", + "experimentNameTemplate": "vitest-evals-reporter-{testRunId}", + "label": "vitest-evals reporter", + "variantKey": "vitest-evals-reporter" + } + ] + }, { "scenarioDirName": "openai-agents-instrumentation", "label": "OpenAI Agents Instrumentation", diff --git a/e2e/helpers/mock-braintrust-server.ts b/e2e/helpers/mock-braintrust-server.ts index 1dc9e0dc9..d6f053011 100644 --- a/e2e/helpers/mock-braintrust-server.ts +++ b/e2e/helpers/mock-braintrust-server.ts @@ -72,6 +72,19 @@ interface StartMockBraintrustServerOptions { } const DEFAULT_API_KEY = "mock-braintrust-api-key"; +const PROD_FORWARDING_SKIPPED_HEADERS = new Set([ + "authorization", + "connection", + "content-length", + "host", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailer", + "transfer-encoding", + "upgrade", +]); function isRecord(value: unknown): value is Record { return typeof value === "object" && value !== null && !Array.isArray(value); @@ -265,6 +278,7 @@ export async 
function startMockBraintrustServer( const events: CapturedLogEvent[] = []; const mergedRows = new Map(); const projectsByName = new Map(); + const prodForwardingErrors: string[] = []; const experimentsByProjectAndName = new Map< string, { @@ -382,18 +396,55 @@ export async function startMockBraintrustServer( return created; } - function trackProdForwarding(promise: Promise): void { + function recordProdForwardingError(context: string, error: unknown): void { + prodForwardingErrors.push( + `${context}: ${error instanceof Error ? error.message : String(error)}`, + ); + } + + function trackProdForwarding(context: string, promise: Promise): void { pendingProdForwarding.add(promise); void promise.then( () => { pendingProdForwarding.delete(promise); }, - () => { + (error) => { + recordProdForwardingError(context, error); pendingProdForwarding.delete(promise); }, ); } + function requestForProdForwarding( + capturedRequest: CapturedRequest, + ): CapturedRequest { + if (!prodForwarding || !isRecord(capturedRequest.jsonBody)) { + return capturedRequest; + } + + const jsonBody = clone(capturedRequest.jsonBody) as Record; + let changed = false; + + if ("org_id" in jsonBody) { + jsonBody.org_id = prodForwarding.orgId; + changed = true; + } + if ("org_name" in jsonBody) { + jsonBody.org_name = prodForwarding.orgName; + changed = true; + } + + if (!changed) { + return capturedRequest; + } + + return { + ...capturedRequest, + jsonBody: jsonBody as JsonValue, + rawBody: JSON.stringify(jsonBody), + }; + } + async function forwardProdRequest( capturedRequest: CapturedRequest, options: { drainResponseBody?: boolean } = {}, @@ -402,22 +453,18 @@ export async function startMockBraintrustServer( throw new Error("prodForwarding is not enabled"); } - const baseUrl = capturedRequest.path.startsWith("/api/") + const prodRequest = requestForProdForwarding(capturedRequest); + const baseUrl = prodRequest.path.startsWith("/api/") ? 
prodForwarding.appUrl : prodForwarding.apiUrl; - const url = new URL(capturedRequest.path, baseUrl); - for (const [key, value] of Object.entries(capturedRequest.query)) { + const url = new URL(prodRequest.path, baseUrl); + for (const [key, value] of Object.entries(prodRequest.query)) { url.searchParams.set(key, value); } const headers = new Headers(); - for (const [key, value] of Object.entries(capturedRequest.headers)) { - if ( - key === "authorization" || - key === "connection" || - key === "content-length" || - key === "host" - ) { + for (const [key, value] of Object.entries(prodRequest.headers)) { + if (PROD_FORWARDING_SKIPPED_HEADERS.has(key)) { continue; } @@ -427,16 +474,19 @@ export async function startMockBraintrustServer( const response = await fetch(url, { body: - capturedRequest.method === "GET" || capturedRequest.method === "HEAD" + prodRequest.method === "GET" || prodRequest.method === "HEAD" ? undefined - : capturedRequest.rawBody, + : prodRequest.rawBody, headers, - method: capturedRequest.method, + method: prodRequest.method, }); if (!response.ok) { + const responseText = await response.text().catch(() => ""); throw new Error( - `prodForwarding failed for ${capturedRequest.method} ${capturedRequest.path}: ${response.status} ${response.statusText}`, + `prodForwarding failed for ${capturedRequest.method} ${capturedRequest.path}: ${response.status} ${response.statusText}${ + responseText ? `: ${responseText.slice(0, 500)}` : "" + }`, ); } @@ -523,7 +573,8 @@ export async function startMockBraintrustServer( }); return; } - } catch { + } catch (error) { + recordProdForwardingError("POST /api/project/register", error); // Fall back to local registration so e2e assertions still run. } } @@ -583,7 +634,8 @@ export async function startMockBraintrustServer( }); return; } - } catch { + } catch (error) { + recordProdForwardingError("POST /api/experiment/register", error); // Fall back to local registration so e2e assertions still run. 
} } @@ -668,11 +720,10 @@ export async function startMockBraintrustServer( } if (prodForwarding) { trackProdForwarding( + "POST /logs3", forwardProdRequest(capturedRequest, { drainResponseBody: true, - }) - .then(() => undefined) - .catch(() => undefined), + }).then(() => undefined), ); } respondJson(res, 200, { ok: true }); @@ -685,11 +736,10 @@ export async function startMockBraintrustServer( ) { if (prodForwarding) { trackProdForwarding( + "POST /otel/v1/traces", forwardProdRequest(capturedRequest, { drainResponseBody: true, - }) - .then(() => undefined) - .catch(() => undefined), + }).then(() => undefined), ); } respondJson(res, 200, { ok: true }); @@ -723,6 +773,14 @@ export async function startMockBraintrustServer( while (pendingProdForwarding.size > 0) { await Promise.allSettled([...pendingProdForwarding]); } + if (prodForwardingErrors.length > 0) { + throw new Error( + [ + "Braintrust prod forwarding failed:", + ...prodForwardingErrors.map((message) => `- ${message}`), + ].join("\n"), + ); + } }, events, payloads, diff --git a/e2e/helpers/prod-forwarding.ts b/e2e/helpers/prod-forwarding.ts index 3c4a88e0d..522aedbe6 100644 --- a/e2e/helpers/prod-forwarding.ts +++ b/e2e/helpers/prod-forwarding.ts @@ -6,6 +6,8 @@ export interface ProdForwarding { apiKey: string; apiUrl: string; appUrl: string; + orgId: string; + orgName: string; projectId: string; projectName: string; } @@ -36,7 +38,7 @@ export async function initializeProdForwarding(): Promise { const projectId = await logger.id; const state = logger.loggingState; - if (!state.apiUrl || !state.appUrl) { + if (!state.apiUrl || !state.appUrl || !state.orgId || !state.orgName) { throw new Error("Braintrust login did not resolve prodForwarding URLs"); } @@ -44,6 +46,8 @@ export async function initializeProdForwarding(): Promise { apiKey, apiUrl: state.apiUrl, appUrl: state.appUrl, + orgId: state.orgId, + orgName: state.orgName, projectId, projectName, }; diff --git 
a/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json new file mode 100644 index 000000000..7150b7c62 --- /dev/null +++ b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json @@ -0,0 +1,98 @@ +{ + "span_tree": [ + { + "name": "vitest-evals braintrust reporter > approves refundable invoice", + "type": "eval", + "children": [ + { + "name": "classify refund", + "type": "llm", + "children": [ + { + "name": "lookupInvoice", + "type": "tool", + "children": [], + "attributes": { + "external_parent_id": "model-span", + "external_span_id": "tool-span", + "gen_ai.tool.name": "lookupInvoice", + "trace_id": "refund-trace", + "vitest_evals_kind": "tool" + }, + "metadata": { + "traceName": "refund trace" + } + } + ], + "attributes": { + "external_span_id": "model-span", + "gen_ai.request.model": "deterministic-refund-model", + "trace_id": "refund-trace", + "vitest_evals_kind": "model" + }, + "metadata": { + "traceName": "refund trace" + } + } + ], + "attributes": { + "framework": "vitest", + "reporter": "vitest-evals" + }, + "input": { + "input": "Refund invoice inv_123", + "test": "vitest-evals braintrust reporter > approves refundable invoice" + }, + "output": { + "message": "Invoice inv_123 is refundable and the refund is approved.", + "status": "approved" + }, + "scores": { + "StatusJudge": 1, + "avg_score": 1, + "pass": 1 + }, + "metadata": { + "artifacts": { + "case": "vitest-evals-reporter", + "scenario": "test-framework-evals-vitest", + "testRunId": "" + }, + "errors": [], + "failureMessages": [], + "file": "/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts", + "fullName": "vitest-evals braintrust reporter > approves refundable invoice", + "harnessName": "braintrust-refund-harness", + "relativeFile": "runner.vitest-evals-reporter.case.ts", + "scoreMetadata": { + 
"StatusJudge": { + "expectedStatus": "approved", + "observedStatus": "approved" + } + }, + "session": { + "messages": [ + { + "content": "Refund invoice inv_123", + "role": "user" + }, + { + "content": "Invoice inv_123 is refundable and the refund is approved.", + "role": "assistant" + } + ] + }, + "status": "passed", + "testId": "-2057137040_0_0", + "thresholdFailed": false + }, + "metrics": { + "duration_ms": 0, + "input_tokens": 11, + "output_tokens": 13, + "tool_calls": 1, + "total_tokens": 24 + } + } + ] +} diff --git a/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt new file mode 100644 index 000000000..7e9d3df6c --- /dev/null +++ b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt @@ -0,0 +1,81 @@ +span_tree: +└── vitest-evals braintrust reporter > approves refundable invoice [eval] + attributes: { + "framework": "vitest", + "reporter": "vitest-evals" + } + input: { + "input": "Refund invoice inv_123", + "test": "vitest-evals braintrust reporter > approves refundable invoice" + } + output: { + "message": "Invoice inv_123 is refundable and the refund is approved.", + "status": "approved" + } + scores: { + "StatusJudge": 1, + "avg_score": 1, + "pass": 1 + } + metadata: { + "artifacts": { + "case": "vitest-evals-reporter", + "scenario": "test-framework-evals-vitest", + "testRunId": "" + }, + "errors": [], + "failureMessages": [], + "file": "/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts", + "fullName": "vitest-evals braintrust reporter > approves refundable invoice", + "harnessName": "braintrust-refund-harness", + "relativeFile": "runner.vitest-evals-reporter.case.ts", + "scoreMetadata": { + "StatusJudge": { + "expectedStatus": "approved", + "observedStatus": "approved" + } + }, + "session": { + "messages": [ + { + "content": "Refund invoice inv_123", 
+ "role": "user" + }, + { + "content": "Invoice inv_123 is refundable and the refund is approved.", + "role": "assistant" + } + ] + }, + "status": "passed", + "testId": "-2057137040_0_0", + "thresholdFailed": false + } + metrics: { + "duration_ms": 0, + "input_tokens": 11, + "output_tokens": 13, + "tool_calls": 1, + "total_tokens": 24 + } + └── classify refund [llm] + attributes: { + "external_span_id": "model-span", + "gen_ai.request.model": "deterministic-refund-model", + "trace_id": "refund-trace", + "vitest_evals_kind": "model" + } + metadata: { + "traceName": "refund trace" + } + └── lookupInvoice [tool] + attributes: { + "external_parent_id": "model-span", + "external_span_id": "tool-span", + "gen_ai.tool.name": "lookupInvoice", + "trace_id": "refund-trace", + "vitest_evals_kind": "tool" + } + metadata: { + "traceName": "refund trace" + } diff --git a/e2e/scenarios/test-framework-evals-vitest/package.json b/e2e/scenarios/test-framework-evals-vitest/package.json index df55e7ecb..45a61be58 100644 --- a/e2e/scenarios/test-framework-evals-vitest/package.json +++ b/e2e/scenarios/test-framework-evals-vitest/package.json @@ -4,6 +4,7 @@ "braintrustScenario": { "canary": { "dependencies": { + "vitest": "vitest@4", "vitest-v2": "vitest@2.1.9", "vitest-v3": "vitest@3", "vitest-v4": "vitest@4" @@ -11,6 +12,9 @@ } }, "dependencies": { + "tinyrainbow": "3.1.0", + "vitest": "4.1.5", + "vitest-evals": "0.13.1", "vitest-v2": "npm:vitest@2.1.9", "vitest-v3": "npm:vitest@3.2.4", "vitest-v4": "npm:vitest@4.1.5" diff --git a/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml b/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml index af7130e4f..ffffecaaf 100644 --- a/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml +++ b/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml @@ -8,6 +8,15 @@ importers: .: dependencies: + tinyrainbow: + specifier: 3.1.0 + version: 3.1.0 + vitest: + specifier: 4.1.5 + version: 4.1.5(vite@7.3.1) + vitest-evals: + specifier: 
0.13.1 + version: 0.13.1(tinyrainbow@3.1.0)(vitest@4.1.5(vite@7.3.1))(zod@4.4.3) vitest-v2: specifier: npm:vitest@2.1.9 version: vitest@2.1.9 @@ -467,6 +476,12 @@ packages: '@types/estree@1.0.8': resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==} + '@vitest-evals/core@0.13.1': + resolution: {integrity: sha512-YX5bRG+J0GCzwJiNoq7UHJVRrtqx07lF3cYUrHnvfRLrn/R5nfBkFkm9eluAYlMFbWehFw+fFIW7bPuyL+3pMg==} + + '@vitest-evals/report-ui@0.13.1': + resolution: {integrity: sha512-uA0OSe8UFhSP8i92hUNSFbdJ7Lwi0b06DVfvPb9lnEADgZrExv8IiHy9mkRuU+aMwo7zQI75ZZz1qx07XzPczA==} + '@vitest/expect@2.1.9': resolution: {integrity: sha512-UJCIkTBenHeKT1TTlKMJWy1laZewsRIzYighyYiJKZreqtdxSos/S1t+ktRMQWu2CKqaarrkeszJx1cgC5tGZw==} @@ -811,6 +826,20 @@ packages: yaml: optional: true + vitest-evals@0.13.1: + resolution: {integrity: sha512-UCA3drMFVxtYB3F/0AjQEBSp7EPc2Du2Au85kLHtQg4V6p2mpifP4m5VEfwgxVXq8UfrnsMk8SJvOB/5EiDC0g==} + hasBin: true + peerDependencies: + ai: '>=4 <7' + tinyrainbow: '>=2 <4' + vitest: '>=4 <5' + zod: '>=3 <5' + peerDependenciesMeta: + ai: + optional: true + zod: + optional: true + vitest@2.1.9: resolution: {integrity: sha512-MSmPM9REYqDGBI8439mA4mWhV5sKmDlBKWIYbA3lRb2PTHACE0mgKwA8yQ2xq9vxDTuk4iPrECBAEW2aoFXY0Q==} engines: {node: ^18.0.0 || >=20.0.0} @@ -910,6 +939,9 @@ packages: engines: {node: '>=8'} hasBin: true + zod@4.4.3: + resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==} + snapshots: '@esbuild/aix-ppc64@0.21.5': @@ -1147,6 +1179,14 @@ snapshots: '@types/estree@1.0.8': {} + '@vitest-evals/core@0.13.1': + dependencies: + zod: 4.4.3 + + '@vitest-evals/report-ui@0.13.1': + dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest/expect@2.1.9': dependencies: '@vitest/spy': 2.1.9 @@ -1522,6 +1562,15 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + vitest-evals@0.13.1(tinyrainbow@3.1.0)(vitest@4.1.5(vite@7.3.1))(zod@4.4.3): + 
dependencies: + '@vitest-evals/core': 0.13.1 + '@vitest-evals/report-ui': 0.13.1 + tinyrainbow: 3.1.0 + vitest: 4.1.5(vite@7.3.1) + optionalDependencies: + zod: 4.4.3 + vitest@2.1.9: dependencies: '@vitest/expect': 2.1.9 @@ -1623,3 +1672,5 @@ snapshots: dependencies: siginfo: 2.0.0 stackback: 0.0.2 + + zod@4.4.3: {} diff --git a/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts b/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts new file mode 100644 index 000000000..0b48fb046 --- /dev/null +++ b/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts @@ -0,0 +1,102 @@ +import { expect } from "vitest"; +import { createHarness, createJudge, describeEval } from "vitest-evals"; + +const testRunId = process.env.BRAINTRUST_E2E_RUN_ID; +if (!testRunId) { + throw new Error("BRAINTRUST_E2E_RUN_ID is not set"); +} + +type RefundOutput = { + message: string; + status: "approved" | "denied"; +}; + +const scenario = "test-framework-evals-vitest"; + +const refundHarness = createHarness({ + name: "braintrust-refund-harness", + run: async ({ input }) => ({ + artifacts: { + case: "vitest-evals-reporter", + scenario, + testRunId, + }, + messages: [ + { role: "user", content: input }, + { + role: "assistant", + content: "Invoice inv_123 is refundable and the refund is approved.", + }, + ], + output: { + message: "Invoice inv_123 is refundable and the refund is approved.", + status: "approved", + }, + toolCalls: [ + { + name: "lookupInvoice", + arguments: { invoiceId: "inv_123" }, + result: { refundable: true }, + }, + ], + traces: [ + { + id: "refund-trace", + name: "refund trace", + spans: [ + { + id: "model-span", + kind: "model", + name: "classify refund", + attributes: { + "gen_ai.request.model": "deterministic-refund-model", + }, + }, + { + id: "tool-span", + kind: "tool", + name: "lookupInvoice", + parentId: "model-span", + attributes: { + "gen_ai.tool.name": "lookupInvoice", + }, + }, + ], + }, + 
], + usage: { + inputTokens: 11, + outputTokens: 13, + totalTokens: 24, + toolCalls: 1, + }, + }), +}); + +const StatusJudge = createJudge< + string, + RefundOutput, + { expectedStatus: RefundOutput["status"] } +>("StatusJudge", async ({ output, expectedStatus }) => ({ + metadata: { + expectedStatus, + observedStatus: output.status, + }, + score: output.status === expectedStatus ? 1 : 0, +})); + +describeEval( + "vitest-evals braintrust reporter", + { harness: refundHarness }, + (it) => { + it("approves refundable invoice", async ({ run }) => { + const result = await run("Refund invoice inv_123"); + + expect(result.output.status).toBe("approved"); + await expect(result).toSatisfyJudge(StatusJudge, { + expectedStatus: "approved", + threshold: 1, + }); + }); + }, +); diff --git a/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts b/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts index 969c9965b..ffb2203ab 100644 --- a/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts +++ b/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts @@ -16,12 +16,13 @@ const TIMEOUT_MS = 90_000; interface VitestScenario { entry: string; label: string; + variantKey: string; } const scenarios: VitestScenario[] = [ - { entry: "scenario.ts", label: "v2" }, - { entry: "scenario.vitest-v3.ts", label: "v3" }, - { entry: "scenario.vitest-v4.ts", label: "v4.1" }, + { entry: "scenario.ts", label: "v2", variantKey: "v2" }, + { entry: "scenario.vitest-v3.ts", label: "v3", variantKey: "v3" }, + { entry: "scenario.vitest-v4.ts", label: "v4.1", variantKey: "v4.1" }, ]; for (const scenario of scenarios) { @@ -35,6 +36,10 @@ for (const scenario of scenarios) { async ({ runScenarioDir, testRunEvents, testRunId }) => { await runScenarioDir({ entry: scenario.entry, + runContext: { + cassette: false, + variantKey: scenario.variantKey, + }, scenarioDir, timeoutMs: TIMEOUT_MS, }); @@ -113,3 +118,73 @@ for (const scenario of scenarios) { }, ); } + +test( + 
"test-framework-evals-vitest captures vitest-evals reporter spans", + { + timeout: TIMEOUT_MS, + }, + async () => { + await withScenarioHarness(async ({ events, runScenarioDir, testRunId }) => { + await runScenarioDir({ + entry: "scenario.vitest-evals-reporter.ts", + runContext: { + cassette: false, + variantKey: "vitest-evals-reporter", + }, + scenarioDir, + timeoutMs: TIMEOUT_MS, + }); + + const capturedEvents = events(); + const evalRoot = findLatestSpan( + capturedEvents, + "vitest-evals braintrust reporter > approves refundable invoice", + ); + const modelSpan = findLatestSpan(capturedEvents, "classify refund"); + const toolSpan = findLatestSpan(capturedEvents, "lookupInvoice"); + + expect(evalRoot).toBeDefined(); + expect(evalRoot?.span.type).toBe("eval"); + expect(evalRoot?.input).toMatchObject({ + input: "Refund invoice inv_123", + test: "vitest-evals braintrust reporter > approves refundable invoice", + }); + expect(evalRoot?.output).toMatchObject({ + status: "approved", + }); + expect(evalRoot?.scores).toMatchObject({ + StatusJudge: 1, + avg_score: 1, + pass: 1, + }); + expect(evalRoot?.metrics).toMatchObject({ + input_tokens: 11, + output_tokens: 13, + total_tokens: 24, + tool_calls: 1, + }); + expect(evalRoot?.row.metadata).toMatchObject({ + artifacts: { + case: "vitest-evals-reporter", + scenario: "test-framework-evals-vitest", + testRunId, + }, + harnessName: "braintrust-refund-harness", + status: "passed", + }); + + expect(modelSpan?.span.type).toBe("llm"); + expect(toolSpan?.span.type).toBe("tool"); + expect(toolSpan?.span.parentIds).toEqual([modelSpan?.span.id ?? 
""]); + + await matchSpanTreeSnapshot( + capturedEvents, + resolveFileSnapshotPath( + import.meta.url, + "vitest-evals-reporter.span-tree.json", + ), + ); + }); + }, +); diff --git a/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts b/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts new file mode 100644 index 000000000..5c2726fb0 --- /dev/null +++ b/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts @@ -0,0 +1,49 @@ +import { createRequire } from "node:module"; +import { promises as fs } from "node:fs"; +import * as path from "node:path"; +import { resolveScenarioDir } from "../../helpers/scenario-harness"; +import { + getTestRunId, + runMain, + runNodeSubprocess, +} from "../../helpers/scenario-runtime"; + +const require = createRequire(import.meta.url); +const scenarioDir = resolveScenarioDir(import.meta.url); + +async function findVitestBin(): Promise { + const entryPath = require.resolve("vitest"); + let dir = path.dirname(entryPath); + while (dir !== path.dirname(dir)) { + const candidate = path.join(dir, "vitest.mjs"); + try { + await fs.access(candidate); + return candidate; + } catch { + // Keep walking upward. 
+ } + dir = path.dirname(dir); + } + throw new Error("Could not find vitest.mjs"); +} + +async function main() { + const vitestCliPath = await findVitestBin(); + const testRunId = getTestRunId(); + + await runNodeSubprocess({ + args: [ + vitestCliPath, + "run", + "--config", + "vitest.runner-evals-reporter.config.mts", + ], + cwd: scenarioDir, + env: { + BRAINTRUST_E2E_RUN_ID: testRunId, + }, + timeoutMs: 60_000, + }); +} + +runMain(main); diff --git a/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts b/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts new file mode 100644 index 000000000..f2a1afe65 --- /dev/null +++ b/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts @@ -0,0 +1,35 @@ +import * as path from "node:path"; +import { pathToFileURL } from "node:url"; +import { defineConfig } from "vitest/config"; + +const repoRoot = process.env.BRAINTRUST_E2E_REPO_ROOT; +if (!repoRoot) { + throw new Error("BRAINTRUST_E2E_REPO_ROOT is not set"); +} + +const testRunId = process.env.BRAINTRUST_E2E_RUN_ID; +if (!testRunId) { + throw new Error("BRAINTRUST_E2E_RUN_ID is not set"); +} + +const { default: BraintrustVitestEvalsReporter } = await import( + pathToFileURL(path.join(repoRoot, "js/dist/vitest-evals-reporter.mjs")).href +); + +export default defineConfig({ + test: { + hookTimeout: 30_000, + include: ["runner.vitest-evals-reporter.case.ts"], + reporters: [ + "vitest-evals/reporter", + new BraintrustVitestEvalsReporter({ + displaySummary: false, + experimentName: `vitest-evals-reporter-${testRunId}`, + projectName: + process.env.BRAINTRUST_E2E_PROJECT_NAME || + `e2e-vitest-evals-reporter-${testRunId}`, + }), + ], + testTimeout: 20_000, + }, +}); diff --git a/e2e/scripts/build-pr-e2e-links-comment.mjs b/e2e/scripts/build-pr-e2e-links-comment.mjs index 7478a809a..f93fe97c9 100644 --- a/e2e/scripts/build-pr-e2e-links-comment.mjs +++ b/e2e/scripts/build-pr-e2e-links-comment.mjs 
@@ -114,8 +114,50 @@ async function readScenarioConfig(configPath) { }; }) : []; + const evals = Array.isArray(entry.evals) + ? entry.evals.map((evalEntry, evalIndex) => { + if ( + !evalEntry || + typeof evalEntry !== "object" || + typeof evalEntry.label !== "string" || + typeof evalEntry.experimentNameTemplate !== "string" + ) { + throw new Error( + `Invalid eval at scenario index ${index}, eval index ${evalIndex} in ${configPath}`, + ); + } + + const evalLabel = evalEntry.label.trim(); + const experimentNameTemplate = + evalEntry.experimentNameTemplate.trim(); + const variantKey = + typeof evalEntry.variantKey === "string" + ? evalEntry.variantKey.trim() + : null; + const entry = + typeof evalEntry.entry === "string" ? evalEntry.entry.trim() : null; + if (!evalLabel || !experimentNameTemplate) { + throw new Error( + `Eval label/experimentNameTemplate must be non-empty at scenario index ${index}, eval index ${evalIndex} in ${configPath}`, + ); + } + if (!experimentNameTemplate.includes("{testRunId}")) { + throw new Error( + `Eval experimentNameTemplate must include {testRunId} at scenario index ${index}, eval index ${evalIndex} in ${configPath}`, + ); + } + + return { + entry: entry || null, + experimentNameTemplate, + label: evalLabel, + variantKey: variantKey || null, + }; + }) + : []; return { + evals, label: entry.label, metadataScenario: entry.metadataScenario, scenarioDirName: entry.scenarioDirName, @@ -126,8 +168,9 @@ async function readScenarioConfig(configPath) { async function readRunContextRecords(runContextDir) { const runIdsByScenarioAndVariant = new Map(); + const records = []; if (!runContextDir) { - return runIdsByScenarioAndVariant; + return { records, runIdsByScenarioAndVariant }; } const entries = await readdir(runContextDir, { withFileTypes: true }); @@ -171,6 +214,12 @@ async function readRunContextRecords(runContextDir) { typeof parsed.variantKey === "string" && parsed.variantKey.trim() ? 
parsed.variantKey.trim() : DEFAULT_VARIANT_KEY; + records.push({ + entry: typeof parsed.entry === "string" ? parsed.entry : null, + scenarioDirName, + testRunId: parsed.testRunId, + variantKey, + }); if (!runIdsByScenarioAndVariant.has(scenarioDirName)) { runIdsByScenarioAndVariant.set(scenarioDirName, new Map()); } @@ -183,7 +232,7 @@ async function readRunContextRecords(runContextDir) { } } - return runIdsByScenarioAndVariant; + return { records, runIdsByScenarioAndVariant }; } async function resolveOrgName() { @@ -265,6 +314,41 @@ function buildLogsUrl({ appUrl, orgName, projectName, search }) { return url.toString(); } +function buildExperimentUrl({ appUrl, orgName, projectName, experimentName }) { + return new URL( + `/app/${encodeURIComponent(orgName)}/p/${encodeURIComponent(projectName)}/experiments/${encodeURIComponent(experimentName)}`, + appUrl, + ).toString(); +} + +function observedRunIdsForEval(runContextRecords, scenario, evalConfig) { + return [ + ...new Set( + runContextRecords + .filter((record) => { + if (record.scenarioDirName !== scenario.scenarioDirName) { + return false; + } + if ( + evalConfig.variantKey && + record.variantKey !== evalConfig.variantKey + ) { + return false; + } + if (evalConfig.entry && record.entry !== evalConfig.entry) { + return false; + } + return true; + }) + .map((record) => record.testRunId), + ), + ].sort(); +} + +function experimentNameForRunId(evalConfig, testRunId) { + return evalConfig.experimentNameTemplate.replaceAll("{testRunId}", testRunId); +} + function buildCommentBody(options) { const includeCommentMarker = process.env.BRAINTRUST_E2E_INCLUDE_COMMENT_MARKER === "1"; @@ -388,6 +472,59 @@ function buildCommentBody(options) { } } + const evalConfigs = options.scenarios.flatMap((scenario) => + (scenario.evals ?? 
[]).map((evalConfig) => ({ evalConfig, scenario })), + ); + if (evalConfigs.length > 0) { + lines.push(""); + lines.push("## E2E Braintrust Evals"); + lines.push(""); + lines.push("| Eval | Braintrust Eval | Status |"); + lines.push("| --- | --- | --- |"); + + for (const { evalConfig, scenario } of evalConfigs) { + const observedRunIds = observedRunIdsForEval( + options.runContextRecords, + scenario, + evalConfig, + ); + const rowLabel = `${scenario.label} (${evalConfig.label})`; + + if (observedRunIds.length === 0) { + lines.push(`| ${rowLabel} | N/A | Not observed in this run |`); + continue; + } + + if (!options.orgName) { + lines.push(`| ${rowLabel} | N/A | Observed (link unavailable) |`); + continue; + } + + const links = observedRunIds + .map((testRunId, index) => { + const experimentName = experimentNameForRunId(evalConfig, testRunId); + const experimentUrl = buildExperimentUrl({ + appUrl: options.appPublicUrl, + experimentName, + orgName: options.orgName, + projectName: options.projectName, + }); + const linkLabel = + observedRunIds.length === 1 + ? "Open eval" + : `Open eval ${index + 1}`; + return `[${linkLabel}](${experimentUrl})`; + }) + .join("
"); + const runCount = observedRunIds.length; + const runWord = runCount === 1 ? "run" : "runs"; + + lines.push( + `| ${rowLabel} | ${links} | Observed (${runCount} ${runWord}) |`, + ); + } + } + lines.push(""); return lines.join("\n"); } @@ -405,11 +542,12 @@ async function main() { ); } - const [scenarios, runIdsByScenarioAndVariant, orgResult] = await Promise.all([ + const [scenarios, runContext, orgResult] = await Promise.all([ readScenarioConfig(configPath), readRunContextRecords(runContextDir), resolveOrgName(), ]); + const { records: runContextRecords, runIdsByScenarioAndVariant } = runContext; const recordsFound = [...runIdsByScenarioAndVariant.values()].reduce( (count, variants) => @@ -433,6 +571,7 @@ async function main() { projectName, recordsFound, runIdsByScenarioAndVariant, + runContextRecords, scenarios, warning: orgResult.warning, }); diff --git a/js/package.json b/js/package.json index 123468198..aad229bb7 100644 --- a/js/package.json +++ b/js/package.json @@ -65,6 +65,12 @@ "require": "./dist/apply-auto-instrumentation.js", "default": "./dist/apply-auto-instrumentation.mjs" }, + "./vitest-evals-reporter": { + "types": "./dist/vitest-evals-reporter.d.ts", + "import": "./dist/vitest-evals-reporter.mjs", + "module": "./dist/vitest-evals-reporter.mjs", + "require": "./dist/vitest-evals-reporter.js" + }, "./node": { "types": "./dist/index.d.ts", "import": "./dist/index.mjs", diff --git a/js/src/wrappers/vitest-evals/reporter.test.ts b/js/src/wrappers/vitest-evals/reporter.test.ts new file mode 100644 index 000000000..8d4d3c97a --- /dev/null +++ b/js/src/wrappers/vitest-evals/reporter.test.ts @@ -0,0 +1,404 @@ +import { beforeAll, beforeEach, describe, expect, test, vi } from "vitest"; +import BraintrustVitestEvalsReporter from "./reporter"; +import { configureNode } from "../../node/config"; +import { + _exportsForTestingOnly, + type TestBackgroundLogger, +} from "../../logger"; +import * as logger from "../../logger"; + +configureNode(); + +let 
// One-time setup: switch the SDK into its test state, simulate a login, and
// route logging through the in-memory test background logger. `initExperiment`
// is replaced with a stub that builds a test experiment instead.
beforeAll(async () => {
  _exportsForTestingOnly.setInitialTestState();
  await _exportsForTestingOnly.simulateLoginForTests();
  backgroundLogger = _exportsForTestingOnly.useTestBackgroundLogger();

  vi.spyOn(logger, "initExperiment").mockImplementation(
    (projectOrOptions: string | any, options?: any) => {
      // Accept both call shapes: (project, options) and (options).
      const experimentOptions =
        typeof projectOrOptions === "string" ? options : projectOrOptions;
      const projectName =
        typeof projectOrOptions === "string"
          ? projectOrOptions
          : (projectOrOptions.project ??
            projectOrOptions.projectId ??
            "test-project");

      return _exportsForTestingOnly.initTestExperiment(
        experimentOptions?.experiment || "test-experiment",
        projectName,
      );
    },
  );
});

// Drain before every test — presumably so rows captured by an earlier test do
// not leak into the next one's assertions (confirm against the test logger).
beforeEach(async () => {
  await backgroundLogger.drain();
});

describe("Braintrust vitest-evals reporter", () => {
  // A test without `eval`/`harness` meta must not produce any logged rows.
  test("does nothing when no eval metadata is present", async () => {
    const reporter = new BraintrustVitestEvalsReporter();

    await reporter.onTestRunEnd([
      fakeModule([fakeTest({ meta: {}, name: "plain test" })]),
    ] as any);

    await backgroundLogger.flush();
    expect(await backgroundLogger.drain()).toHaveLength(0);
  });

  // With eval meta present and no project configured, the reporter rejects.
  test("requires a project only when eval cases are reported", async () => {
    const reporter = new BraintrustVitestEvalsReporter();

    await expect(
      reporter.onTestRunEnd([
        fakeModule([
          fakeTest({
            meta: { eval: { avgScore: 1 } },
            name: "eval test",
          }),
        ]),
      ] as any),
    ).rejects.toThrow("projectName or projectId");
  });

  // Full happy path: scores, usage metrics, metadata, and a two-span trace
  // (a model span with a tool span as its child).
  test("logs eval metadata, usage metrics, and normalized traces", async () => {
    const reporter = new BraintrustVitestEvalsReporter({
      displaySummary: false,
      experimentName: "vitest-evals-unit-test",
      projectName: "vitest-evals-tests",
    });
    const module = fakeModule([
      fakeTest({
        diagnostic: { duration: 125, startTime: 1_700_000_000_000 },
        fullName: "refund eval > approves refund",
        location: { line: 42, column: 7 },
        meta: {
          eval: {
            avgScore: 0.9,
            output: { status: "approved" },
            scores: [
              {
                name: "FactualityJudge",
                score: 0.8,
                metadata: { rationale: "close enough" },
              },
            ],
          },
          harness: {
            name: "refund-harness",
            run: {
              session: {
                messages: [
                  { role: "user", content: "Refund invoice inv_123" },
                  {
                    role: "assistant",
                    content: { status: "approved" },
                  },
                ],
              },
              usage: {
                inputTokens: 10,
                outputTokens: 15,
                reasoningTokens: 2,
                totalTokens: 27,
                toolCalls: 1,
              },
              artifacts: { invoiceId: "inv_123" },
              traces: [
                {
                  id: "trace-1",
                  name: "refund trace",
                  spans: [
                    {
                      id: "model-1",
                      kind: "model",
                      name: "classify refund",
                      startedAt: "2026-01-01T00:00:00.000Z",
                      finishedAt: "2026-01-01T00:00:00.050Z",
                      // Keys such as name/status/type/trace_id collide with
                      // attributes the reporter sets itself; the assertions
                      // below expect the reporter's values to win.
                      attributes: {
                        "custom.attribute": "preserved",
                        "gen_ai.request.model": "gpt-test",
                        external_span_id: "user-external-span-id",
                        name: "user span name",
                        status: "user-status",
                        trace_id: "user-trace-id",
                        type: "custom",
                        vitest_evals_kind: "custom",
                      },
                    },
                    {
                      id: "tool-1",
                      parentId: "model-1",
                      kind: "tool",
                      name: "lookupInvoice",
                      attributes: { "gen_ai.tool.name": "lookupInvoice" },
                    },
                  ],
                },
              ],
            },
          },
        },
        name: "approves refund",
        tags: ["refund", "happy-path"],
      }),
    ]);

    await reporter.onTestRunEnd([module] as any);
    await backgroundLogger.flush();
    const rows = await backgroundLogger.drain();

    // The root eval row is located by its named score.
    const root = rows.find((row: any) => row.scores?.FactualityJudge === 0.8);
    expect(root).toMatchObject({
      input: {
        input: "Refund invoice inv_123",
        test: "refund eval > approves refund",
      },
      metrics: {
        duration_ms: 125,
        input_tokens: 10,
        output_tokens: 15,
        reasoning_tokens: 2,
        total_tokens: 27,
        tool_calls: 1,
      },
      output: { status: "approved" },
      scores: {
        avg_score: 0.9,
        FactualityJudge: 0.8,
        pass: 1,
      },
      tags: ["refund", "happy-path"],
    });
    expect(root?.metadata).toMatchObject({
      artifacts: { invoiceId: "inv_123" },
      file: "/repo/evals/refund.eval.ts",
      harnessName: "refund-harness",
      location: { line: 42, column: 7 },
      status: "passed",
      scoreMetadata: {
        FactualityJudge: { rationale: "close enough" },
      },
    });

    const modelSpan = rows.find(
      (row: any) => row.span_attributes?.name === "classify refund",
    );
    const toolSpan = rows.find(
      (row: any) => row.span_attributes?.name === "lookupInvoice",
    );

    expect(modelSpan?.span_attributes).toMatchObject({
      "custom.attribute": "preserved",
      "gen_ai.request.model": "gpt-test",
      name: "classify refund",
      type: "llm",
      vitest_evals_kind: "model",
      trace_id: "trace-1",
      external_span_id: "model-1",
    });
    expect(toolSpan?.span_attributes).toMatchObject({
      type: "tool",
      vitest_evals_kind: "tool",
      external_parent_id: "model-1",
    });
    // The tool span must be parented to the model span, not to the root.
    expect(toolSpan?.span_parents).toEqual([modelSpan?.span_id]);
  });

  // A failed test: pass score is 0, failure details land in metadata, and the
  // serialized (plain-object) error is logged as readable text.
  test("logs failed eval scores and failure metadata", async () => {
    const reporter = new BraintrustVitestEvalsReporter({
      displaySummary: false,
      projectName: "vitest-evals-tests",
    });

    await reporter.onTestRunEnd([
      fakeModule([
        fakeTest({
          meta: {
            eval: {
              avgScore: 0.4,
              output: { status: "denied" },
              scores: [{ name: "StatusJudge", score: 0 }],
              thresholdFailed: true,
            },
            harness: {
              run: {
                errors: [{ message: "application run failed" }],
                session: {
                  messages: [{ role: "user", content: "Refund inv_bad" }],
                },
                usage: {},
              },
            },
          },
          name: "failed eval",
          result: {
            errors: [
              {
                message: "expected score to meet threshold",
                stack: "AssertionError: expected score to meet threshold",
              },
            ],
            state: "failed",
          },
        }),
      ]),
    ] as any);

    await backgroundLogger.flush();
    const rows = await backgroundLogger.drain();
    const root = rows.find((row: any) => row.scores?.StatusJudge === 0);

    expect(root?.scores).toMatchObject({
      StatusJudge: 0,
      avg_score: 0.4,
      pass: 0,
    });
    expect(root?.metadata).toMatchObject({
      errors: [{ message: "application run failed" }],
      failureMessages: ["expected score to meet threshold"],
      status: "failed",
      thresholdFailed: true,
    });
    const errorRow = rows.find((row: any) => typeof row.error === "string");
    expect(errorRow?.error).toContain("expected score to meet threshold");
    expect(errorRow?.error).toContain(
      "AssertionError: expected score to meet threshold",
    );
    expect(errorRow?.error).not.toContain("[object Object]");
  });

  // Without normalized traces, tool spans come from message tool calls, or
  // from `eval.toolCalls` when the messages carry none.
  test("logs fallback tool spans when no normalized traces are present", async () => {
    const reporter = new BraintrustVitestEvalsReporter({
      displaySummary: false,
      projectId: "project-id",
    });

    await reporter.onTestRunEnd([
      fakeModule([
        fakeTest({
          meta: {
            eval: { avgScore: 1 },
            harness: {
              run: {
                output: "done",
                session: {
                  messages: [
                    {
                      role: "assistant",
                      toolCalls: [
                        {
                          name: "searchDocs",
                          arguments: { query: "refunds" },
                          result: { count: 2 },
                          durationMs: 12,
                        },
                      ],
                    },
                  ],
                },
                usage: {},
              },
            },
          },
          name: "tool fallback",
        }),
        fakeTest({
          meta: {
            eval: {
              avgScore: 1,
              toolCalls: [
                {
                  name: "lookupLegacy",
                  arguments: { id: "legacy" },
                  result: { ok: true },
                },
              ],
            },
            harness: {
              run: {
                output: "done",
                session: { messages: [] },
                usage: {},
              },
            },
          },
          name: "eval tool fallback",
        }),
      ]),
    ] as any);

    await backgroundLogger.flush();
    const rows = await backgroundLogger.drain();
    const toolSpan = rows.find(
      (row: any) => row.span_attributes?.name === "searchDocs",
    );
    const evalToolSpan = rows.find(
      (row: any) => row.span_attributes?.name === "lookupLegacy",
    );

    expect(toolSpan).toMatchObject({
      input: { query: "refunds" },
      metrics: { duration_ms: 12 },
      output: { count: 2 },
      span_attributes: { type: "tool" },
    });
    expect(evalToolSpan).toMatchObject({
      input: { id: "legacy" },
      output: { ok: true },
      span_attributes: { type: "tool" },
    });
  });
});
/**
 * Builds a minimal stand-in for a Vitest test module that yields `tests` from
 * `children.allTests()`. Each test is mutated to point back at the module via
 * its `module` property.
 */
function fakeModule(tests: any[]) {
  const module = {
    children: {
      *allTests() {
        yield* tests;
      },
    },
    moduleId: "/repo/evals/refund.eval.ts",
    relativeModuleId: "evals/refund.eval.ts",
  };

  tests.forEach((test) => {
    test.module = module;
  });

  return module;
}

/**
 * Builds a minimal stand-in for a Vitest test case. Only `meta` and `name` are
 * required; everything else defaults to a passing test that started at a fixed
 * timestamp and ran for 50 ms.
 */
function fakeTest(options: {
  diagnostic?: { duration: number; startTime: number };
  fullName?: string;
  location?: { line: number; column: number };
  meta: Record<string, unknown>;
  name: string;
  result?: { state: string; errors?: unknown[] };
  tags?: string[];
}) {
  const {
    diagnostic = { duration: 50, startTime: 1_700_000_000_000 },
    fullName,
    location = { line: 1, column: 1 },
    meta,
    name,
    result = { state: "passed" },
    tags = [],
  } = options;

  return {
    diagnostic: () => diagnostic,
    fullName: fullName ?? name,
    id: `test:${name}`,
    location,
    meta: () => meta,
    name,
    result: () => result,
    tags,
  };
}
/**
 * Vitest reporter that logs `vitest-evals` results to a Braintrust experiment
 * once a test run has finished.
 *
 * Only tests whose `meta()` yields eval/harness data (and that were neither
 * skipped nor pending) are reported. A run without such tests never creates an
 * experiment, so `projectName` / `projectId` are required only when there is
 * something to report.
 */
export default class BraintrustVitestEvalsReporter implements Reporter {
  private experiment?: Experiment;

  constructor(
    private readonly options: BraintrustVitestEvalsReporterOptions = {},
  ) {}

  onInit(_vitest: Vitest): void {
    // Vitest calls this before a run; keeping the hook declares reporter intent
    // while all data we need is available from onTestRunEnd.
  }

  /**
   * Collects every runnable eval test from `testModules`, logs each one to the
   * experiment, then summarizes and flushes.
   *
   * @throws Error when eval tests exist but neither `projectName` nor
   *   `projectId` was configured.
   */
  async onTestRunEnd(testModules: ReadonlyArray<TestModule>): Promise<void> {
    const evalTests: RunnableEvalTest[] = [];
    for (const testModule of testModules) {
      for (const test of testModule.children.allTests()) {
        const candidate = { test, meta: readEvalTaskMeta(test.meta()) };
        if (isRunnableEvalTest(candidate)) {
          evalTests.push(candidate);
        }
      }
    }

    if (evalTests.length === 0) {
      return;
    }

    const experiment = this.getOrCreateExperiment();

    try {
      for (const { test, meta } of evalTests) {
        logEvalTest(experiment, test, meta);
      }

      await summarizeAndFlush(experiment, {
        displaySummary: this.options.displaySummary,
      });
    } finally {
      // Always drop the cached experiment, even when logging or flushing
      // throws; otherwise the next run (e.g. a watch-mode rerun) would append
      // to the half-written experiment from the failed run.
      this.experiment = undefined;
    }
  }

  /** Returns the cached experiment, creating it from the options on first use. */
  private getOrCreateExperiment(): Experiment {
    if (this.experiment) {
      return this.experiment;
    }

    const { projectId, projectName } = this.options;
    if (!projectId && !projectName) {
      throw new Error(
        "Braintrust vitest-evals reporter requires projectName or projectId when eval cases are reported.",
      );
    }

    const experimentName =
      this.options.experimentName ?? `vitest-evals-${new Date().toISOString()}`;

    this.experiment = initExperiment({
      ...(projectId ? { projectId } : { project: projectName }),
      experiment: experimentName,
      metadata: this.options.metadata,
      tags: this.options.tags,
      baseExperiment: this.options.baseExperiment,
      baseExperimentId: this.options.baseExperimentId,
    });

    return this.experiment;
  }
}

/**
 * Narrows a candidate to one that should be reported: it must carry eval or
 * harness metadata and must have actually run (not skipped, not pending).
 */
function isRunnableEvalTest(
  candidate: EvalTestCandidate,
): candidate is RunnableEvalTest {
  if (!candidate.meta) return false;

  const state = candidate.test.result().state;
  return state !== "skipped" && state !== "pending";
}

/**
 * Logs one eval test as a root span on `experiment`, followed by its errors
 * (for failed tests) and its child spans: normalized traces when the harness
 * supplied any, otherwise spans derived from recorded tool calls.
 */
function logEvalTest(
  experiment: Experiment,
  test: TestLike,
  meta: EvalTaskMeta,
): void {
  const result = test.result();
  const diagnostic = test.diagnostic();
  const run = meta.harness?.run;
  const output = meta.eval?.output ?? run?.output;
  const scores = buildScores(result.state, meta.eval);
  const metrics = buildMetrics(diagnostic?.duration, run);
  const metadata = buildMetadata(test, meta, run);

  const displayName = test.fullName || test.name;
  const startTime = startTimeSeconds(diagnostic);
  // Guard against `tags` being absent at runtime — presumably the case on
  // Vitest versions that predate test tags (TODO confirm).
  const tags = Array.isArray(test.tags) ? test.tags : [];

  const rootSpan = experiment.startSpan({
    name: displayName,
    spanAttributes: {
      type: SpanTypeAttribute.EVAL,
      framework: "vitest",
      reporter: "vitest-evals",
    },
    startTime,
    event: {
      input: {
        test: displayName,
        input: firstUserMessageContent(run),
      },
      ...(output !== undefined ? { output } : {}),
      scores,
      metrics,
      metadata,
      ...(tags.length > 0 ? { tags } : {}),
    },
  });

  if (result.state === "failed") {
    for (const error of result.errors ?? []) {
      logReporterError(rootSpan, error);
    }
  }

  if (run?.traces?.length) {
    logNormalizedTraces(rootSpan, run.traces);
  } else {
    logToolCallSpans(rootSpan, toolCallsFromMeta(meta.eval, run));
  }

  // Vitest reports the duration in milliseconds; span times are in seconds.
  rootSpan.end({
    endTime:
      startTime !== undefined && diagnostic?.duration !== undefined
        ? startTime + diagnostic.duration / 1000
        : undefined,
  });
}
/**
 * Assembles the score map for one eval test: a `pass` flag derived from the
 * test state, the optional `avg_score`, and every named scorer result whose
 * value is a number or `null`.
 */
function buildScores(
  state: ReturnType<TestCase["result"]>["state"],
  evalMeta: EvalMeta | undefined,
): Record<string, number | null> {
  const collected: Record<string, number | null> = {
    pass: state === "passed" ? 1 : 0,
  };

  const average = evalMeta?.avgScore;
  if (average === null || typeof average === "number") {
    collected.avg_score = average;
  }

  evalMeta?.scores?.forEach(({ name, score }) => {
    if (name && (score === null || typeof score === "number")) {
      collected[name] = score;
    }
  });

  return collected;
}

/**
 * Assembles numeric metrics: the test duration (when known) plus each usage
 * counter from the harness run that is actually a number.
 */
function buildMetrics(
  durationMs: number | undefined,
  run: HarnessRun | undefined,
): Record<string, number> {
  const collected: Record<string, number> = {};
  if (durationMs !== undefined) {
    collected.duration_ms = durationMs;
  }

  const usage = run?.usage;
  const usageFields: Array<[string, unknown]> = [
    ["input_tokens", usage?.inputTokens],
    ["output_tokens", usage?.outputTokens],
    ["reasoning_tokens", usage?.reasoningTokens],
    ["total_tokens", usage?.totalTokens],
    ["tool_calls", usage?.toolCalls],
    ["retries", usage?.retries],
  ];
  for (const [metricName, value] of usageFields) {
    if (typeof value === "number") {
      collected[metricName] = value;
    }
  }

  return collected;
}

/**
 * Assembles the metadata record for one eval test from the test itself, its
 * eval/harness meta, and the harness run. Keys whose value is `undefined` are
 * omitted from the result.
 */
function buildMetadata(
  test: TestLike,
  meta: EvalTaskMeta,
  run: HarnessRun | undefined,
): Record<string, unknown> {
  const result = test.result();

  // Per-scorer metadata, keyed by scorer name; unnamed scorers are skipped.
  const scoreMetadata = Object.fromEntries(
    (meta.eval?.scores ?? []).flatMap((score) =>
      score.name && score.metadata ? [[score.name, score.metadata]] : [],
    ),
  );

  const candidates: Array<[string, unknown]> = [
    ["file", test.module?.moduleId],
    ["relativeFile", test.module?.relativeModuleId],
    ["fullName", test.fullName],
    ["testId", test.id],
    ["location", test.location],
    ["status", result.state],
    ["failureMessages", (result.errors ?? []).map(formatErrorMessage)],
    ["harnessName", meta.harness?.name],
    ["thresholdFailed", meta.eval?.thresholdFailed],
    ["session", run?.session],
    ["artifacts", run?.artifacts],
    ["timings", run?.timings],
    ["errors", run?.errors],
    ["scoreMetadata", scoreMetadata],
  ];

  const defined: Record<string, unknown> = {};
  for (const [key, value] of candidates) {
    if (value !== undefined) {
      defined[key] = value;
    }
  }
  return defined;
}
/**
 * Logs every span of every normalized trace beneath `rootSpan`, rebuilding the
 * parent/child structure from each span's `parentId`.
 *
 * Spans may arrive in any order. Each pass logs the spans whose parent is
 * already logged. When a pass makes no progress, only the spans whose parent
 * does not exist in the trace are attached to `rootSpan`, so their descendants
 * still resolve beneath them on the next pass. If the remaining spans form a
 * cycle, the first one is attached to `rootSpan` to break it.
 */
function logNormalizedTraces(rootSpan: Span, traces: NormalizedTrace[]): void {
  for (const trace of traces) {
    const loggedById = new Map<string, Span>();
    const attach = (parent: Span, normalized: NormalizedSpan): void => {
      const span = logNormalizedSpan(parent, normalized, trace);
      if (normalized.id) {
        loggedById.set(normalized.id, span);
      }
    };

    // Trace data comes from test metadata; tolerate a missing or malformed
    // span list instead of throwing in the middle of reporting.
    const spans: unknown = trace?.spans;
    let pending: NormalizedSpan[] = Array.isArray(spans)
      ? spans.filter(
          (span): span is NormalizedSpan =>
            typeof span === "object" && span !== null,
        )
      : [];

    while (pending.length > 0) {
      const unresolved: NormalizedSpan[] = [];

      for (const normalized of pending) {
        const parent =
          normalized.parentId === undefined
            ? rootSpan
            : loggedById.get(normalized.parentId);
        if (parent) {
          attach(parent, normalized);
        } else {
          unresolved.push(normalized);
        }
      }

      if (unresolved.length < pending.length) {
        pending = unresolved;
        continue;
      }

      // No progress: every remaining span waits on a parent that is not
      // logged. Promote true orphans (parent absent from the trace); if there
      // are none the rest is a cycle, so promote one span to break it.
      const pendingIds = new Set(unresolved.map((span) => span.id));
      const orphans = unresolved.filter(
        (span) => !pendingIds.has(span.parentId),
      );
      const promoted = new Set(orphans.length > 0 ? orphans : [unresolved[0]]);
      for (const normalized of promoted) {
        attach(rootSpan, normalized);
      }
      pending = unresolved.filter((span) => !promoted.has(span));
    }
  }
}

/**
 * Logs a single normalized span as a child of `parent` and returns it.
 *
 * Reporter-owned attributes are written after the user-supplied ones so they
 * cannot be overridden. When `finishedAt` is missing or unparseable, the end
 * time falls back to `startedAt + durationMs` if both are available.
 */
function logNormalizedSpan(
  parent: Span,
  normalized: NormalizedSpan,
  trace: NormalizedTrace,
): Span {
  const startTime = epochSeconds(normalized.startedAt);

  const span = parent.startSpan({
    name: normalized.name ?? normalized.kind ?? "harness span",
    spanAttributes: {
      ...filteredNormalizedSpanAttributes(normalized.attributes),
      type: spanTypeForNormalizedKind(normalized.kind),
      vitest_evals_kind: normalized.kind,
      trace_id: normalized.traceId ?? trace.id,
      external_span_id: normalized.id,
      external_parent_id: normalized.parentId,
      status: normalized.status,
    },
    startTime,
  });

  const metadata: Record<string, unknown> = {
    traceName: trace.name,
    traceMetadata: trace.metadata,
    events: normalized.events,
  };

  if (Object.values(metadata).some((value) => value !== undefined)) {
    span.log({
      metadata: Object.fromEntries(
        Object.entries(metadata).filter(([, value]) => value !== undefined),
      ),
    });
  }
  if (normalized.error !== undefined) {
    logReporterError(span, normalized.error);
  }

  const durationMs = normalized.durationMs;
  const endTime =
    epochSeconds(normalized.finishedAt) ??
    (startTime !== undefined &&
    typeof durationMs === "number" &&
    Number.isFinite(durationMs)
      ? startTime + durationMs / 1000
      : undefined);

  span.end({ endTime });
  return span;
}
/**
 * Logs one tool span beneath `rootSpan` for every named tool call.
 *
 * Tool calls originate from test metadata that is not validated upstream, so
 * entries that are not objects or have no name are skipped rather than allowed
 * to throw. When `finishedAt` is missing or unparseable, the end time falls
 * back to `startedAt + durationMs` if both are available.
 */
function logToolCallSpans(rootSpan: Span, calls: ToolCallRecord[]): void {
  for (const call of calls) {
    if (!isObject(call) || !call.name) continue;

    const startTime = epochSeconds(call.startedAt);
    const span = rootSpan.startSpan({
      name: call.name,
      spanAttributes: {
        type: SpanTypeAttribute.TOOL,
        tool_call_id: call.id,
      },
      startTime,
      event: {
        input: call.arguments,
        ...(call.result !== undefined ? { output: call.result } : {}),
        metadata: call.metadata,
        metrics:
          call.durationMs !== undefined
            ? { duration_ms: call.durationMs }
            : undefined,
      },
    });

    if (call.error !== undefined) {
      logReporterError(span, call.error);
    }

    const durationMs = call.durationMs;
    const endTime =
      epochSeconds(call.finishedAt) ??
      (startTime !== undefined &&
      typeof durationMs === "number" &&
      Number.isFinite(durationMs)
        ? startTime + durationMs / 1000
        : undefined);
    span.end({ endTime });
  }
}

/**
 * Extracts the `eval` / `harness` sections from a test's raw `meta()` value.
 *
 * @returns `undefined` when the input is not an object or carries neither
 *   section, which marks the test as "not an eval test".
 */
function readEvalTaskMeta(input: unknown): EvalTaskMeta | undefined {
  if (!isObject(input)) return undefined;

  const evalMeta = readEvalMeta(input.eval);
  const harnessMeta = readHarnessMeta(input.harness);

  if (!evalMeta && !harnessMeta) return undefined;
  return {
    ...(evalMeta ? { eval: evalMeta } : {}),
    ...(harnessMeta ? { harness: harnessMeta } : {}),
  };
}

/**
 * Reads the `eval` section, keeping only fields of the expected type. Any
 * object input yields a result (possibly empty); non-objects yield `undefined`.
 */
function readEvalMeta(input: unknown): EvalMeta | undefined {
  if (!isObject(input)) return undefined;

  const avgScore = readFiniteOrNull(input.avgScore);
  const scores = Array.isArray(input.scores)
    ? input.scores.map(readEvalScore).filter(isDefined)
    : undefined;
  const toolCalls = Array.isArray(input.toolCalls)
    ? input.toolCalls.map(readToolCall).filter(isDefined)
    : undefined;

  return {
    ...(scores ? { scores } : {}),
    ...(avgScore !== undefined ? { avgScore } : {}),
    ...(input.output !== undefined ? { output: input.output } : {}),
    ...(typeof input.thresholdFailed === "boolean"
      ? { thresholdFailed: input.thresholdFailed }
      : {}),
    ...(toolCalls ? { toolCalls } : {}),
  };
}
/**
 * Reads one scorer result, keeping `name`, `score` and `metadata` only when
 * each has the expected type. Non-object input yields `undefined`.
 */
function readEvalScore(input: unknown): EvalScore | undefined {
  if (!isObject(input)) return undefined;

  const parsed: EvalScore = {};
  if (typeof input.name === "string") {
    parsed.name = input.name;
  }
  const value = readFiniteOrNull(input.score);
  if (value !== undefined) {
    parsed.score = value;
  }
  if (isObject(input.metadata)) {
    parsed.metadata = input.metadata;
  }
  return parsed;
}

/**
 * Reads the `harness` section: a string `name` and an object `run`. The run is
 * passed through as-is, without validating its contents.
 */
function readHarnessMeta(input: unknown): HarnessMeta | undefined {
  if (!isObject(input)) return undefined;

  const parsed: HarnessMeta = {};
  if (typeof input.name === "string") {
    parsed.name = input.name;
  }
  if (isObject(input.run)) {
    parsed.run = input.run;
  }
  return parsed;
}
/** Accepts any object as a tool-call record; everything else is rejected. */
function readToolCall(input: unknown): ToolCallRecord | undefined {
  return isObject(input) ? input : undefined;
}

/**
 * Returns the user-supplied span attributes minus the keys the reporter sets
 * itself, so user data can never masquerade as reporter-owned attributes.
 * Anything that is not an object yields an empty record.
 */
function filteredNormalizedSpanAttributes(
  attributes: Record<string, unknown> | undefined,
): Record<string, unknown> {
  if (!isObject(attributes)) return {};

  return Object.fromEntries(
    Object.entries(attributes).filter(
      ([key]) => !RESERVED_NORMALIZED_SPAN_ATTRIBUTE_KEYS.has(key),
    ),
  );
}

/**
 * Returns `value` when it is `null` or a finite number, otherwise `undefined`
 * (this rejects `NaN`, `±Infinity` and non-numbers).
 */
function readFiniteOrNull(value: unknown): number | null | undefined {
  if (value === null) return null;
  if (typeof value === "number" && Number.isFinite(value)) {
    return value;
  }
  return undefined;
}

/** Type guard that filters `undefined` out of a list. */
function isDefined<T>(value: T | undefined): value is T {
  return value !== undefined;
}

/**
 * Returns the content of the first message with role `"user"`, used as the
 * eval's input. The run is not validated upstream, so a missing or non-array
 * message list and non-object entries are tolerated.
 */
function firstUserMessageContent(run: HarnessRun | undefined): unknown {
  const messages: unknown = run?.session?.messages;
  if (!Array.isArray(messages)) return undefined;

  return messages.find(
    (message) => isObject(message) && message.role === "user",
  )?.content;
}

/**
 * Collects the tool calls recorded on the run's session messages, in message
 * order. Malformed messages and non-object tool-call entries are skipped.
 */
function toolCallsFromRun(run: HarnessRun | undefined): ToolCallRecord[] {
  const messages: unknown = run?.session?.messages;
  if (!Array.isArray(messages)) return [];

  return messages.flatMap((message) =>
    isObject(message) && Array.isArray(message.toolCalls)
      ? message.toolCalls.map(readToolCall).filter(isDefined)
      : [],
  );
}

/**
 * Picks the tool calls to report: those from the run's messages when there are
 * any, otherwise the ones recorded on the eval meta.
 */
function toolCallsFromMeta(
  evalMeta: EvalMeta | undefined,
  run: HarnessRun | undefined,
): ToolCallRecord[] {
  const runCalls = toolCallsFromRun(run);
  return runCalls.length > 0 ? runCalls : (evalMeta?.toolCalls ?? []);
}

/** Maps a normalized span kind onto a span type; unknown kinds become FUNCTION. */
function spanTypeForNormalizedKind(
  kind: string | undefined,
): SpanTypeAttribute {
  switch (kind) {
    case "model":
      return SpanTypeAttribute.LLM;
    case "tool":
      return SpanTypeAttribute.TOOL;
    case "agent":
    case "run":
      return SpanTypeAttribute.TASK;
    default:
      return SpanTypeAttribute.FUNCTION;
  }
}

/** Converts the diagnostic's millisecond start time to seconds, if present. */
function startTimeSeconds(
  diagnostic: ReturnType<TestCase["diagnostic"]> | undefined,
): number | undefined {
  return diagnostic?.startTime === undefined
    ? undefined
    : diagnostic.startTime / 1000;
}
/**
 * Parses a timestamp string into epoch seconds.
 *
 * @returns `undefined` when the value is missing, is not a string, or cannot
 *   be parsed by `Date.parse`.
 */
function epochSeconds(value: string | undefined): number | undefined {
  // The value comes from unvalidated metadata; `Date.parse` would coerce a
  // non-string (e.g. a number) to text and could return a bogus date.
  if (typeof value !== "string") return undefined;
  const ms = Date.parse(value);
  return Number.isFinite(ms) ? ms / 1000 : undefined;
}

/**
 * Logs `error` on `span` as readable text.
 *
 * Real `Error` instances go through `logError`. Error-like plain objects are
 * logged from their `message` / `stack` strings: the stack alone when it
 * already contains the message, otherwise the message followed by the stack.
 * Anything else is handed to `logError` unchanged.
 */
function logReporterError(span: Span, error: unknown): void {
  if (error instanceof Error) {
    logError(span, error);
    return;
  }

  if (isObject(error)) {
    const message =
      typeof error.message === "string" && error.message.length > 0
        ? error.message
        : undefined;
    const stack =
      typeof error.stack === "string" && error.stack.length > 0
        ? error.stack
        : undefined;

    // Avoid repeating the message when the stack already includes it, and
    // never emit a dangling separator when only one of the two is present.
    const text =
      message !== undefined && stack !== undefined
        ? stack.includes(message)
          ? stack
          : `${message}\n\n${stack}`
        : (message ?? stack);

    if (text !== undefined) {
      span.log({ error: text });
      return;
    }
  }

  logError(span, error);
}

/**
 * Produces a one-line description of a failure for metadata: the error's
 * `message`, else its `stack`, else a JSON rendering for other objects, else
 * `String(error)`.
 */
function formatErrorMessage(error: unknown): string {
  if (isObject(error)) {
    if (typeof error.message === "string") return error.message;
    if (typeof error.stack === "string") return error.stack;
  }
  if (error instanceof Error) return error.message;

  if (typeof error === "object" && error !== null) {
    // `String(obj)` would yield "[object Object]"; prefer JSON when possible.
    try {
      const json = JSON.stringify(error);
      if (typeof json === "string") return json;
    } catch {
      // Cyclic or otherwise unserializable: fall through to String().
    }
  }
  return String(error);
}