diff --git a/.changeset/all-goats-double.md b/.changeset/all-goats-double.md
new file mode 100644
index 000000000..02d5bd4f4
--- /dev/null
+++ b/.changeset/all-goats-double.md
@@ -0,0 +1,5 @@
+---
+"braintrust": minor
+---
+
+feat: Add reporter for `vitest-evals`
diff --git a/e2e/config/pr-comment-scenarios.json b/e2e/config/pr-comment-scenarios.json
index ee98b2fba..6184821ba 100644
--- a/e2e/config/pr-comment-scenarios.json
+++ b/e2e/config/pr-comment-scenarios.json
@@ -196,6 +196,28 @@
       { "variantKey": "github-copilot-v0-auto", "label": "Auto-hook" }
     ]
   },
+  {
+    "scenarioDirName": "test-framework-evals-vitest",
+    "label": "Vitest Test Framework Evals",
+    "metadataScenario": "test-framework-evals-vitest",
+    "variants": [
+      { "variantKey": "v2", "label": "v2" },
+      { "variantKey": "v3", "label": "v3" },
+      { "variantKey": "v4.1", "label": "v4.1" },
+      {
+        "variantKey": "vitest-evals-reporter",
+        "label": "vitest-evals reporter"
+      }
+    ],
+    "evals": [
+      {
+        "entry": "scenario.vitest-evals-reporter.ts",
+        "experimentNameTemplate": "vitest-evals-reporter-{testRunId}",
+        "label": "vitest-evals reporter",
+        "variantKey": "vitest-evals-reporter"
+      }
+    ]
+  },
   {
     "scenarioDirName": "openai-agents-instrumentation",
     "label": "OpenAI Agents Instrumentation",
diff --git a/e2e/helpers/mock-braintrust-server.ts b/e2e/helpers/mock-braintrust-server.ts
index 1dc9e0dc9..d6f053011 100644
--- a/e2e/helpers/mock-braintrust-server.ts
+++ b/e2e/helpers/mock-braintrust-server.ts
@@ -72,6 +72,19 @@ interface StartMockBraintrustServerOptions {
 }
 
 const DEFAULT_API_KEY = "mock-braintrust-api-key";
+const PROD_FORWARDING_SKIPPED_HEADERS = new Set([
+  "authorization",
+  "connection",
+  "content-length",
+  "host",
+  "keep-alive",
+  "proxy-authenticate",
+  "proxy-authorization",
+  "te",
+  "trailer",
+  "transfer-encoding",
+  "upgrade",
+]);
 
 function isRecord(value: unknown): value is Record<string, unknown> {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -265,6 +278,7 @@ export async function startMockBraintrustServer(
   const events: CapturedLogEvent[] = [];
   const mergedRows = new Map<string, CapturedLogRow>();
   const projectsByName = new Map<string, { id: string; name: string }>();
+  const prodForwardingErrors: string[] = [];
   const experimentsByProjectAndName = new Map<
     string,
     {
@@ -382,18 +396,55 @@ export async function startMockBraintrustServer(
     return created;
   }
 
-  function trackProdForwarding(promise: Promise<void>): void {
+  function recordProdForwardingError(context: string, error: unknown): void {
+    prodForwardingErrors.push(
+      `${context}: ${error instanceof Error ? error.message : String(error)}`,
+    );
+  }
+
+  function trackProdForwarding(context: string, promise: Promise<void>): void {
     pendingProdForwarding.add(promise);
     void promise.then(
       () => {
         pendingProdForwarding.delete(promise);
       },
-      () => {
+      (error) => {
+        recordProdForwardingError(context, error);
         pendingProdForwarding.delete(promise);
       },
     );
   }
 
+  function requestForProdForwarding(
+    capturedRequest: CapturedRequest,
+  ): CapturedRequest {
+    if (!prodForwarding || !isRecord(capturedRequest.jsonBody)) {
+      return capturedRequest;
+    }
+
+    const jsonBody = clone(capturedRequest.jsonBody) as Record<string, unknown>;
+    let changed = false;
+
+    if ("org_id" in jsonBody) {
+      jsonBody.org_id = prodForwarding.orgId;
+      changed = true;
+    }
+    if ("org_name" in jsonBody) {
+      jsonBody.org_name = prodForwarding.orgName;
+      changed = true;
+    }
+
+    if (!changed) {
+      return capturedRequest;
+    }
+
+    return {
+      ...capturedRequest,
+      jsonBody: jsonBody as JsonValue,
+      rawBody: JSON.stringify(jsonBody),
+    };
+  }
+
   async function forwardProdRequest(
     capturedRequest: CapturedRequest,
     options: { drainResponseBody?: boolean } = {},
@@ -402,22 +453,18 @@ export async function startMockBraintrustServer(
       throw new Error("prodForwarding is not enabled");
     }
 
-    const baseUrl = capturedRequest.path.startsWith("/api/")
+    const prodRequest = requestForProdForwarding(capturedRequest);
+    const baseUrl = prodRequest.path.startsWith("/api/")
       ? prodForwarding.appUrl
       : prodForwarding.apiUrl;
-    const url = new URL(capturedRequest.path, baseUrl);
-    for (const [key, value] of Object.entries(capturedRequest.query)) {
+    const url = new URL(prodRequest.path, baseUrl);
+    for (const [key, value] of Object.entries(prodRequest.query)) {
       url.searchParams.set(key, value);
     }
 
     const headers = new Headers();
-    for (const [key, value] of Object.entries(capturedRequest.headers)) {
-      if (
-        key === "authorization" ||
-        key === "connection" ||
-        key === "content-length" ||
-        key === "host"
-      ) {
+    for (const [key, value] of Object.entries(prodRequest.headers)) {
+      if (PROD_FORWARDING_SKIPPED_HEADERS.has(key)) {
         continue;
       }
 
@@ -427,16 +474,19 @@ export async function startMockBraintrustServer(
 
     const response = await fetch(url, {
       body:
-        capturedRequest.method === "GET" || capturedRequest.method === "HEAD"
+        prodRequest.method === "GET" || prodRequest.method === "HEAD"
           ? undefined
-          : capturedRequest.rawBody,
+          : prodRequest.rawBody,
       headers,
-      method: capturedRequest.method,
+      method: prodRequest.method,
     });
 
     if (!response.ok) {
+      const responseText = await response.text().catch(() => "");
       throw new Error(
-        `prodForwarding failed for ${capturedRequest.method} ${capturedRequest.path}: ${response.status} ${response.statusText}`,
+        `prodForwarding failed for ${capturedRequest.method} ${capturedRequest.path}: ${response.status} ${response.statusText}${
+          responseText ? `: ${responseText.slice(0, 500)}` : ""
+        }`,
       );
     }
 
@@ -523,7 +573,8 @@ export async function startMockBraintrustServer(
                 });
                 return;
               }
-            } catch {
+            } catch (error) {
+              recordProdForwardingError("POST /api/project/register", error);
               // Fall back to local registration so e2e assertions still run.
             }
           }
@@ -583,7 +634,8 @@ export async function startMockBraintrustServer(
                 });
                 return;
               }
-            } catch {
+            } catch (error) {
+              recordProdForwardingError("POST /api/experiment/register", error);
               // Fall back to local registration so e2e assertions still run.
             }
           }
@@ -668,11 +720,10 @@ export async function startMockBraintrustServer(
           }
           if (prodForwarding) {
             trackProdForwarding(
+              "POST /logs3",
               forwardProdRequest(capturedRequest, {
                 drainResponseBody: true,
-              })
-                .then(() => undefined)
-                .catch(() => undefined),
+              }).then(() => undefined),
             );
           }
           respondJson(res, 200, { ok: true });
@@ -685,11 +736,10 @@ export async function startMockBraintrustServer(
         ) {
           if (prodForwarding) {
             trackProdForwarding(
+              "POST /otel/v1/traces",
               forwardProdRequest(capturedRequest, {
                 drainResponseBody: true,
-              })
-                .then(() => undefined)
-                .catch(() => undefined),
+              }).then(() => undefined),
             );
           }
           respondJson(res, 200, { ok: true });
@@ -723,6 +773,14 @@ export async function startMockBraintrustServer(
       while (pendingProdForwarding.size > 0) {
         await Promise.allSettled([...pendingProdForwarding]);
       }
+      if (prodForwardingErrors.length > 0) {
+        throw new Error(
+          [
+            "Braintrust prod forwarding failed:",
+            ...prodForwardingErrors.map((message) => `- ${message}`),
+          ].join("\n"),
+        );
+      }
     },
     events,
     payloads,
diff --git a/e2e/helpers/prod-forwarding.ts b/e2e/helpers/prod-forwarding.ts
index 3c4a88e0d..522aedbe6 100644
--- a/e2e/helpers/prod-forwarding.ts
+++ b/e2e/helpers/prod-forwarding.ts
@@ -6,6 +6,8 @@ export interface ProdForwarding {
   apiKey: string;
   apiUrl: string;
   appUrl: string;
+  orgId: string;
+  orgName: string;
   projectId: string;
   projectName: string;
 }
@@ -36,7 +38,7 @@ export async function initializeProdForwarding(): Promise<void> {
     const projectId = await logger.id;
     const state = logger.loggingState;
 
-    if (!state.apiUrl || !state.appUrl) {
-      throw new Error("Braintrust login did not resolve prodForwarding URLs");
+    if (!state.apiUrl || !state.appUrl || !state.orgId || !state.orgName) {
+      throw new Error("Braintrust login lacks prodForwarding URLs or org info");
     }
 
@@ -44,6 +46,8 @@ export async function initializeProdForwarding(): Promise<void> {
       apiKey,
       apiUrl: state.apiUrl,
       appUrl: state.appUrl,
+      orgId: state.orgId,
+      orgName: state.orgName,
       projectId,
       projectName,
     };
diff --git a/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json
new file mode 100644
index 000000000..7150b7c62
--- /dev/null
+++ b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.json
@@ -0,0 +1,98 @@
+{
+  "span_tree": [
+    {
+      "name": "vitest-evals braintrust reporter > approves refundable invoice",
+      "type": "eval",
+      "children": [
+        {
+          "name": "classify refund",
+          "type": "llm",
+          "children": [
+            {
+              "name": "lookupInvoice",
+              "type": "tool",
+              "children": [],
+              "attributes": {
+                "external_parent_id": "model-span",
+                "external_span_id": "tool-span",
+                "gen_ai.tool.name": "lookupInvoice",
+                "trace_id": "refund-trace",
+                "vitest_evals_kind": "tool"
+              },
+              "metadata": {
+                "traceName": "refund trace"
+              }
+            }
+          ],
+          "attributes": {
+            "external_span_id": "model-span",
+            "gen_ai.request.model": "deterministic-refund-model",
+            "trace_id": "refund-trace",
+            "vitest_evals_kind": "model"
+          },
+          "metadata": {
+            "traceName": "refund trace"
+          }
+        }
+      ],
+      "attributes": {
+        "framework": "vitest",
+        "reporter": "vitest-evals"
+      },
+      "input": {
+        "input": "Refund invoice inv_123",
+        "test": "vitest-evals braintrust reporter > approves refundable invoice"
+      },
+      "output": {
+        "message": "Invoice inv_123 is refundable and the refund is approved.",
+        "status": "approved"
+      },
+      "scores": {
+        "StatusJudge": 1,
+        "avg_score": 1,
+        "pass": 1
+      },
+      "metadata": {
+        "artifacts": {
+          "case": "vitest-evals-reporter",
+          "scenario": "test-framework-evals-vitest",
+          "testRunId": "<run:1>"
+        },
+        "errors": [],
+        "failureMessages": [],
+        "file": "<repo>/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts",
+        "fullName": "vitest-evals braintrust reporter > approves refundable invoice",
+        "harnessName": "braintrust-refund-harness",
+        "relativeFile": "runner.vitest-evals-reporter.case.ts",
+        "scoreMetadata": {
+          "StatusJudge": {
+            "expectedStatus": "approved",
+            "observedStatus": "approved"
+          }
+        },
+        "session": {
+          "messages": [
+            {
+              "content": "Refund invoice inv_123",
+              "role": "user"
+            },
+            {
+              "content": "Invoice inv_123 is refundable and the refund is approved.",
+              "role": "assistant"
+            }
+          ]
+        },
+        "status": "passed",
+        "testId": "-2057137040_0_0",
+        "thresholdFailed": false
+      },
+      "metrics": {
+        "duration_ms": 0,
+        "input_tokens": 11,
+        "output_tokens": 13,
+        "tool_calls": 1,
+        "total_tokens": 24
+      }
+    }
+  ]
+}
diff --git a/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt
new file mode 100644
index 000000000..7e9d3df6c
--- /dev/null
+++ b/e2e/scenarios/test-framework-evals-vitest/__snapshots__/vitest-evals-reporter.span-tree.txt
@@ -0,0 +1,81 @@
+span_tree:
+└── vitest-evals braintrust reporter > approves refundable invoice [eval]
+    attributes: {
+      "framework": "vitest",
+      "reporter": "vitest-evals"
+    }
+    input: {
+      "input": "Refund invoice inv_123",
+      "test": "vitest-evals braintrust reporter > approves refundable invoice"
+    }
+    output: {
+      "message": "Invoice inv_123 is refundable and the refund is approved.",
+      "status": "approved"
+    }
+    scores: {
+      "StatusJudge": 1,
+      "avg_score": 1,
+      "pass": 1
+    }
+    metadata: {
+      "artifacts": {
+        "case": "vitest-evals-reporter",
+        "scenario": "test-framework-evals-vitest",
+        "testRunId": "<run:1>"
+      },
+      "errors": [],
+      "failureMessages": [],
+      "file": "<repo>/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts",
+      "fullName": "vitest-evals braintrust reporter > approves refundable invoice",
+      "harnessName": "braintrust-refund-harness",
+      "relativeFile": "runner.vitest-evals-reporter.case.ts",
+      "scoreMetadata": {
+        "StatusJudge": {
+          "expectedStatus": "approved",
+          "observedStatus": "approved"
+        }
+      },
+      "session": {
+        "messages": [
+          {
+            "content": "Refund invoice inv_123",
+            "role": "user"
+          },
+          {
+            "content": "Invoice inv_123 is refundable and the refund is approved.",
+            "role": "assistant"
+          }
+        ]
+      },
+      "status": "passed",
+      "testId": "-2057137040_0_0",
+      "thresholdFailed": false
+    }
+    metrics: {
+      "duration_ms": 0,
+      "input_tokens": 11,
+      "output_tokens": 13,
+      "tool_calls": 1,
+      "total_tokens": 24
+    }
+    └── classify refund [llm]
+        attributes: {
+          "external_span_id": "model-span",
+          "gen_ai.request.model": "deterministic-refund-model",
+          "trace_id": "refund-trace",
+          "vitest_evals_kind": "model"
+        }
+        metadata: {
+          "traceName": "refund trace"
+        }
+        └── lookupInvoice [tool]
+            attributes: {
+              "external_parent_id": "model-span",
+              "external_span_id": "tool-span",
+              "gen_ai.tool.name": "lookupInvoice",
+              "trace_id": "refund-trace",
+              "vitest_evals_kind": "tool"
+            }
+            metadata: {
+              "traceName": "refund trace"
+            }
diff --git a/e2e/scenarios/test-framework-evals-vitest/package.json b/e2e/scenarios/test-framework-evals-vitest/package.json
index df55e7ecb..45a61be58 100644
--- a/e2e/scenarios/test-framework-evals-vitest/package.json
+++ b/e2e/scenarios/test-framework-evals-vitest/package.json
@@ -4,6 +4,7 @@
   "braintrustScenario": {
     "canary": {
       "dependencies": {
+        "vitest": "vitest@4",
         "vitest-v2": "vitest@2.1.9",
         "vitest-v3": "vitest@3",
         "vitest-v4": "vitest@4"
@@ -11,6 +12,9 @@
     }
   },
   "dependencies": {
+    "tinyrainbow": "3.1.0",
+    "vitest": "4.1.5",
+    "vitest-evals": "0.13.1",
     "vitest-v2": "npm:vitest@2.1.9",
     "vitest-v3": "npm:vitest@3.2.4",
     "vitest-v4": "npm:vitest@4.1.5"
diff --git a/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml b/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml
index af7130e4f..ffffecaaf 100644
--- a/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml
+++ b/e2e/scenarios/test-framework-evals-vitest/pnpm-lock.yaml
@@ -8,6 +8,15 @@ importers:
 
   .:
     dependencies:
+      tinyrainbow:
+        specifier: 3.1.0
+        version: 3.1.0
+      vitest:
+        specifier: 4.1.5
+        version: 4.1.5(vite@7.3.1)
+      vitest-evals:
+        specifier: 0.13.1
+        version: 0.13.1(tinyrainbow@3.1.0)(vitest@4.1.5(vite@7.3.1))(zod@4.4.3)
       vitest-v2:
         specifier: npm:vitest@2.1.9
         version: vitest@2.1.9
@@ -467,6 +476,12 @@ packages:
   '@types/estree@1.0.8':
     resolution: {integrity: sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==}
 
+  '@vitest-evals/core@0.13.1':
+    resolution: {integrity: sha512-YX5bRG+J0GCzwJiNoq7UHJVRrtqx07lF3cYUrHnvfRLrn/R5nfBkFkm9eluAYlMFbWehFw+fFIW7bPuyL+3pMg==}
+
+  '@vitest-evals/report-ui@0.13.1':
+    resolution: {integrity: sha512-uA0OSe8UFhSP8i92hUNSFbdJ7Lwi0b06DVfvPb9lnEADgZrExv8IiHy9mkRuU+aMwo7zQI75ZZz1qx07XzPczA==}
+
   '@vitest/expect@2.1.9':
     resolution: {integrity: sha512-UJCIkTBenHeKT1TTlKMJWy1laZewsRIzYighyYiJKZreqtdxSos/S1t+ktRMQWu2CKqaarrkeszJx1cgC5tGZw==}
 
@@ -811,6 +826,20 @@ packages:
       yaml:
         optional: true
 
+  vitest-evals@0.13.1:
+    resolution: {integrity: sha512-UCA3drMFVxtYB3F/0AjQEBSp7EPc2Du2Au85kLHtQg4V6p2mpifP4m5VEfwgxVXq8UfrnsMk8SJvOB/5EiDC0g==}
+    hasBin: true
+    peerDependencies:
+      ai: '>=4 <7'
+      tinyrainbow: '>=2 <4'
+      vitest: '>=4 <5'
+      zod: '>=3 <5'
+    peerDependenciesMeta:
+      ai:
+        optional: true
+      zod:
+        optional: true
+
   vitest@2.1.9:
     resolution: {integrity: sha512-MSmPM9REYqDGBI8439mA4mWhV5sKmDlBKWIYbA3lRb2PTHACE0mgKwA8yQ2xq9vxDTuk4iPrECBAEW2aoFXY0Q==}
     engines: {node: ^18.0.0 || >=20.0.0}
@@ -910,6 +939,9 @@ packages:
     engines: {node: '>=8'}
     hasBin: true
 
+  zod@4.4.3:
+    resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==}
+
 snapshots:
 
   '@esbuild/aix-ppc64@0.21.5':
@@ -1147,6 +1179,14 @@ snapshots:
 
   '@types/estree@1.0.8': {}
 
+  '@vitest-evals/core@0.13.1':
+    dependencies:
+      zod: 4.4.3
+
+  '@vitest-evals/report-ui@0.13.1':
+    dependencies:
+      '@vitest-evals/core': 0.13.1
+
   '@vitest/expect@2.1.9':
     dependencies:
       '@vitest/spy': 2.1.9
@@ -1522,6 +1562,15 @@ snapshots:
     optionalDependencies:
       fsevents: 2.3.3
 
+  vitest-evals@0.13.1(tinyrainbow@3.1.0)(vitest@4.1.5(vite@7.3.1))(zod@4.4.3):
+    dependencies:
+      '@vitest-evals/core': 0.13.1
+      '@vitest-evals/report-ui': 0.13.1
+      tinyrainbow: 3.1.0
+      vitest: 4.1.5(vite@7.3.1)
+    optionalDependencies:
+      zod: 4.4.3
+
   vitest@2.1.9:
     dependencies:
       '@vitest/expect': 2.1.9
@@ -1623,3 +1672,5 @@ snapshots:
     dependencies:
       siginfo: 2.0.0
       stackback: 0.0.2
+
+  zod@4.4.3: {}
diff --git a/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts b/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts
new file mode 100644
index 000000000..0b48fb046
--- /dev/null
+++ b/e2e/scenarios/test-framework-evals-vitest/runner.vitest-evals-reporter.case.ts
@@ -0,0 +1,102 @@
+import { expect } from "vitest";
+import { createHarness, createJudge, describeEval } from "vitest-evals";
+
+const testRunId = process.env.BRAINTRUST_E2E_RUN_ID;
+if (!testRunId) {
+  throw new Error("BRAINTRUST_E2E_RUN_ID is not set");
+}
+
+type RefundOutput = {
+  message: string;
+  status: "approved" | "denied";
+};
+
+const scenario = "test-framework-evals-vitest";
+
+const refundHarness = createHarness<string, RefundOutput>({
+  name: "braintrust-refund-harness",
+  run: async ({ input }) => ({
+    artifacts: {
+      case: "vitest-evals-reporter",
+      scenario,
+      testRunId,
+    },
+    messages: [
+      { role: "user", content: input },
+      {
+        role: "assistant",
+        content: "Invoice inv_123 is refundable and the refund is approved.",
+      },
+    ],
+    output: {
+      message: "Invoice inv_123 is refundable and the refund is approved.",
+      status: "approved",
+    },
+    toolCalls: [
+      {
+        name: "lookupInvoice",
+        arguments: { invoiceId: "inv_123" },
+        result: { refundable: true },
+      },
+    ],
+    traces: [
+      {
+        id: "refund-trace",
+        name: "refund trace",
+        spans: [
+          {
+            id: "model-span",
+            kind: "model",
+            name: "classify refund",
+            attributes: {
+              "gen_ai.request.model": "deterministic-refund-model",
+            },
+          },
+          {
+            id: "tool-span",
+            kind: "tool",
+            name: "lookupInvoice",
+            parentId: "model-span",
+            attributes: {
+              "gen_ai.tool.name": "lookupInvoice",
+            },
+          },
+        ],
+      },
+    ],
+    usage: {
+      inputTokens: 11,
+      outputTokens: 13,
+      totalTokens: 24,
+      toolCalls: 1,
+    },
+  }),
+});
+
+const StatusJudge = createJudge<
+  string,
+  RefundOutput,
+  { expectedStatus: RefundOutput["status"] }
+>("StatusJudge", async ({ output, expectedStatus }) => ({
+  metadata: {
+    expectedStatus,
+    observedStatus: output.status,
+  },
+  score: output.status === expectedStatus ? 1 : 0,
+}));
+
+describeEval(
+  "vitest-evals braintrust reporter",
+  { harness: refundHarness },
+  (it) => {
+    it("approves refundable invoice", async ({ run }) => {
+      const result = await run("Refund invoice inv_123");
+
+      expect(result.output.status).toBe("approved");
+      await expect(result).toSatisfyJudge(StatusJudge, {
+        expectedStatus: "approved",
+        threshold: 1,
+      });
+    });
+  },
+);
diff --git a/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts b/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts
index 969c9965b..ffb2203ab 100644
--- a/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts
+++ b/e2e/scenarios/test-framework-evals-vitest/scenario.test.ts
@@ -16,12 +16,13 @@ const TIMEOUT_MS = 90_000;
 interface VitestScenario {
   entry: string;
   label: string;
+  variantKey: string;
 }
 
 const scenarios: VitestScenario[] = [
-  { entry: "scenario.ts", label: "v2" },
-  { entry: "scenario.vitest-v3.ts", label: "v3" },
-  { entry: "scenario.vitest-v4.ts", label: "v4.1" },
+  { entry: "scenario.ts", label: "v2", variantKey: "v2" },
+  { entry: "scenario.vitest-v3.ts", label: "v3", variantKey: "v3" },
+  { entry: "scenario.vitest-v4.ts", label: "v4.1", variantKey: "v4.1" },
 ];
 
 for (const scenario of scenarios) {
@@ -35,6 +36,10 @@ for (const scenario of scenarios) {
         async ({ runScenarioDir, testRunEvents, testRunId }) => {
           await runScenarioDir({
             entry: scenario.entry,
+            runContext: {
+              cassette: false,
+              variantKey: scenario.variantKey,
+            },
             scenarioDir,
             timeoutMs: TIMEOUT_MS,
           });
@@ -113,3 +118,73 @@ for (const scenario of scenarios) {
     },
   );
 }
+
+test(
+  "test-framework-evals-vitest captures vitest-evals reporter spans",
+  {
+    timeout: TIMEOUT_MS,
+  },
+  async () => {
+    await withScenarioHarness(async ({ events, runScenarioDir, testRunId }) => {
+      await runScenarioDir({
+        entry: "scenario.vitest-evals-reporter.ts",
+        runContext: {
+          cassette: false,
+          variantKey: "vitest-evals-reporter",
+        },
+        scenarioDir,
+        timeoutMs: TIMEOUT_MS,
+      });
+
+      const capturedEvents = events();
+      const evalRoot = findLatestSpan(
+        capturedEvents,
+        "vitest-evals braintrust reporter > approves refundable invoice",
+      );
+      const modelSpan = findLatestSpan(capturedEvents, "classify refund");
+      const toolSpan = findLatestSpan(capturedEvents, "lookupInvoice");
+
+      expect(evalRoot).toBeDefined();
+      expect(evalRoot?.span.type).toBe("eval");
+      expect(evalRoot?.input).toMatchObject({
+        input: "Refund invoice inv_123",
+        test: "vitest-evals braintrust reporter > approves refundable invoice",
+      });
+      expect(evalRoot?.output).toMatchObject({
+        status: "approved",
+      });
+      expect(evalRoot?.scores).toMatchObject({
+        StatusJudge: 1,
+        avg_score: 1,
+        pass: 1,
+      });
+      expect(evalRoot?.metrics).toMatchObject({
+        input_tokens: 11,
+        output_tokens: 13,
+        total_tokens: 24,
+        tool_calls: 1,
+      });
+      expect(evalRoot?.row.metadata).toMatchObject({
+        artifacts: {
+          case: "vitest-evals-reporter",
+          scenario: "test-framework-evals-vitest",
+          testRunId,
+        },
+        harnessName: "braintrust-refund-harness",
+        status: "passed",
+      });
+
+      expect(modelSpan?.span.type).toBe("llm");
+      expect(toolSpan?.span.type).toBe("tool");
+      expect(toolSpan?.span.parentIds).toEqual([modelSpan?.span.id ?? ""]);
+
+      await matchSpanTreeSnapshot(
+        capturedEvents,
+        resolveFileSnapshotPath(
+          import.meta.url,
+          "vitest-evals-reporter.span-tree.json",
+        ),
+      );
+    });
+  },
+);
diff --git a/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts b/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts
new file mode 100644
index 000000000..5c2726fb0
--- /dev/null
+++ b/e2e/scenarios/test-framework-evals-vitest/scenario.vitest-evals-reporter.ts
@@ -0,0 +1,49 @@
+import { createRequire } from "node:module";
+import { promises as fs } from "node:fs";
+import * as path from "node:path";
+import { resolveScenarioDir } from "../../helpers/scenario-harness";
+import {
+  getTestRunId,
+  runMain,
+  runNodeSubprocess,
+} from "../../helpers/scenario-runtime";
+
+const require = createRequire(import.meta.url);
+const scenarioDir = resolveScenarioDir(import.meta.url);
+
+async function findVitestBin(): Promise<string> {
+  const entryPath = require.resolve("vitest");
+  let dir = path.dirname(entryPath);
+  while (dir !== path.dirname(dir)) {
+    const candidate = path.join(dir, "vitest.mjs");
+    try {
+      await fs.access(candidate);
+      return candidate;
+    } catch {
+      // Keep walking upward.
+    }
+    dir = path.dirname(dir);
+  }
+  throw new Error("Could not find vitest.mjs");
+}
+
+async function main() {
+  const vitestCliPath = await findVitestBin();
+  const testRunId = getTestRunId();
+
+  await runNodeSubprocess({
+    args: [
+      vitestCliPath,
+      "run",
+      "--config",
+      "vitest.runner-evals-reporter.config.mts",
+    ],
+    cwd: scenarioDir,
+    env: {
+      BRAINTRUST_E2E_RUN_ID: testRunId,
+    },
+    timeoutMs: 60_000,
+  });
+}
+
+runMain(main);
diff --git a/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts b/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts
new file mode 100644
index 000000000..f2a1afe65
--- /dev/null
+++ b/e2e/scenarios/test-framework-evals-vitest/vitest.runner-evals-reporter.config.mts
@@ -0,0 +1,35 @@
+import * as path from "node:path";
+import { pathToFileURL } from "node:url";
+import { defineConfig } from "vitest/config";
+
+const repoRoot = process.env.BRAINTRUST_E2E_REPO_ROOT;
+if (!repoRoot) {
+  throw new Error("BRAINTRUST_E2E_REPO_ROOT is not set");
+}
+
+const testRunId = process.env.BRAINTRUST_E2E_RUN_ID;
+if (!testRunId) {
+  throw new Error("BRAINTRUST_E2E_RUN_ID is not set");
+}
+
+const { default: BraintrustVitestEvalsReporter } = await import(
+  pathToFileURL(path.join(repoRoot, "js/dist/vitest-evals-reporter.mjs")).href
+);
+
+export default defineConfig({
+  test: {
+    hookTimeout: 30_000,
+    include: ["runner.vitest-evals-reporter.case.ts"],
+    reporters: [
+      "vitest-evals/reporter",
+      new BraintrustVitestEvalsReporter({
+        displaySummary: false,
+        experimentName: `vitest-evals-reporter-${testRunId}`,
+        projectName:
+          process.env.BRAINTRUST_E2E_PROJECT_NAME ||
+          `e2e-vitest-evals-reporter-${testRunId}`,
+      }),
+    ],
+    testTimeout: 20_000,
+  },
+});
diff --git a/e2e/scripts/build-pr-e2e-links-comment.mjs b/e2e/scripts/build-pr-e2e-links-comment.mjs
index 7478a809a..f93fe97c9 100644
--- a/e2e/scripts/build-pr-e2e-links-comment.mjs
+++ b/e2e/scripts/build-pr-e2e-links-comment.mjs
@@ -114,8 +114,50 @@ async function readScenarioConfig(configPath) {
           };
         })
       : [];
+    const evals = Array.isArray(entry.evals)
+      ? entry.evals.map((evalEntry, evalIndex) => {
+          if (
+            !evalEntry ||
+            typeof evalEntry !== "object" ||
+            typeof evalEntry.label !== "string" ||
+            typeof evalEntry.experimentNameTemplate !== "string"
+          ) {
+            throw new Error(
+              `Invalid eval at scenario index ${index}, eval index ${evalIndex} in ${configPath}`,
+            );
+          }
+          // Trim all fields; `entryFile` must not shadow the scenario `entry`.
+          const evalLabel = evalEntry.label.trim();
+          const experimentNameTemplate =
+            evalEntry.experimentNameTemplate.trim();
+          const variantKey =
+            typeof evalEntry.variantKey === "string"
+              ? evalEntry.variantKey.trim()
+              : null;
+          const entryFile =
+            typeof evalEntry.entry === "string" ? evalEntry.entry.trim() : null;
+          if (!evalLabel || !experimentNameTemplate) {
+            throw new Error(
+              `Eval label/experimentNameTemplate must be non-empty at scenario index ${index}, eval index ${evalIndex} in ${configPath}`,
+            );
+          }
+          if (!experimentNameTemplate.includes("{testRunId}")) {
+            throw new Error(
+              `Eval experimentNameTemplate must include {testRunId} at scenario index ${index}, eval index ${evalIndex} in ${configPath}`,
+            );
+          }
+
+          return {
+            entry: entryFile || null,
+            experimentNameTemplate,
+            label: evalLabel,
+            variantKey: variantKey || null,
+          };
+        })
+      : [];
 
     return {
+      evals,
       label: entry.label,
       metadataScenario: entry.metadataScenario,
       scenarioDirName: entry.scenarioDirName,
@@ -126,8 +168,9 @@ async function readScenarioConfig(configPath) {
 
 async function readRunContextRecords(runContextDir) {
   const runIdsByScenarioAndVariant = new Map();
+  const records = [];
   if (!runContextDir) {
-    return runIdsByScenarioAndVariant;
+    return { records, runIdsByScenarioAndVariant };
   }
 
   const entries = await readdir(runContextDir, { withFileTypes: true });
@@ -171,6 +214,12 @@ async function readRunContextRecords(runContextDir) {
         typeof parsed.variantKey === "string" && parsed.variantKey.trim()
           ? parsed.variantKey.trim()
           : DEFAULT_VARIANT_KEY;
+      records.push({
+        entry: typeof parsed.entry === "string" ? parsed.entry : null,
+        scenarioDirName,
+        testRunId: parsed.testRunId,
+        variantKey,
+      });
       if (!runIdsByScenarioAndVariant.has(scenarioDirName)) {
         runIdsByScenarioAndVariant.set(scenarioDirName, new Map());
       }
@@ -183,7 +232,7 @@ async function readRunContextRecords(runContextDir) {
     }
   }
 
-  return runIdsByScenarioAndVariant;
+  return { records, runIdsByScenarioAndVariant };
 }
 
 async function resolveOrgName() {
@@ -265,6 +314,41 @@ function buildLogsUrl({ appUrl, orgName, projectName, search }) {
   return url.toString();
 }
 
+/**
+ * Builds the absolute app URL of an experiment page.
+ *
+ * Org, project and experiment names are each percent-encoded before being
+ * placed in the path, and the path is resolved against `appUrl`.
+ */
+function buildExperimentUrl({ appUrl, orgName, projectName, experimentName }) {
+  const pathname = [
+    "app",
+    encodeURIComponent(orgName),
+    "p",
+    encodeURIComponent(projectName),
+    "experiments",
+    encodeURIComponent(experimentName),
+  ].join("/");
+  const url = new URL(`/${pathname}`, appUrl);
+  return url.toString();
+}
+
+/**
+ * Returns the sorted, de-duplicated test run ids recorded for one eval.
+ *
+ * A record matches when its scenario directory equals the scenario's; the
+ * eval's `variantKey` and `entry` restrict the match further only when set.
+ */
+function observedRunIdsForEval(runContextRecords, scenario, evalConfig) {
+  const uniqueRunIds = new Set();
+  for (const record of runContextRecords) {
+    const sameScenario = record.scenarioDirName === scenario.scenarioDirName;
+    const variantOk =
+      !evalConfig.variantKey || record.variantKey === evalConfig.variantKey;
+    const entryOk = !evalConfig.entry || record.entry === evalConfig.entry;
+    if (sameScenario && variantOk && entryOk) {
+      uniqueRunIds.add(record.testRunId);
+    }
+  }
+  return Array.from(uniqueRunIds).sort();
+}
+
+/**
+ * Expands an eval's `experimentNameTemplate` for one test run.
+ *
+ * Every `{testRunId}` placeholder is replaced with `testRunId`. A replacer
+ * function is used so that `$`-sequences in the run id (`$&`, `$1`, `$$`, ...)
+ * are inserted literally instead of being read as replacement patterns.
+ */
+function experimentNameForRunId(evalConfig, testRunId) {
+  return evalConfig.experimentNameTemplate.replaceAll(
+    "{testRunId}",
+    () => testRunId,
+  );
+}
+
 function buildCommentBody(options) {
   const includeCommentMarker =
     process.env.BRAINTRUST_E2E_INCLUDE_COMMENT_MARKER === "1";
@@ -388,6 +472,59 @@ function buildCommentBody(options) {
     }
   }
 
+  const evalConfigs = options.scenarios.flatMap((scenario) =>
+    (scenario.evals ?? []).map((evalConfig) => ({ evalConfig, scenario })),
+  );
+  if (evalConfigs.length > 0) {
+    lines.push("");
+    lines.push("## E2E Braintrust Evals");
+    lines.push("");
+    lines.push("| Eval | Braintrust Eval | Status |");
+    lines.push("| --- | --- | --- |");
+
+    for (const { evalConfig, scenario } of evalConfigs) {
+      const observedRunIds = observedRunIdsForEval(
+        options.runContextRecords,
+        scenario,
+        evalConfig,
+      );
+      const rowLabel = `${scenario.label} (${evalConfig.label})`;
+
+      if (observedRunIds.length === 0) {
+        lines.push(`| ${rowLabel} | N/A | Not observed in this run |`);
+        continue;
+      }
+
+      if (!options.orgName) {
+        lines.push(`| ${rowLabel} | N/A | Observed (link unavailable) |`);
+        continue;
+      }
+
+      const links = observedRunIds
+        .map((testRunId, index) => {
+          const experimentName = experimentNameForRunId(evalConfig, testRunId);
+          const experimentUrl = buildExperimentUrl({
+            appUrl: options.appPublicUrl,
+            experimentName,
+            orgName: options.orgName,
+            projectName: options.projectName,
+          });
+          const linkLabel =
+            observedRunIds.length === 1
+              ? "Open eval"
+              : `Open eval ${index + 1}`;
+          return `[${linkLabel}](${experimentUrl})`;
+        })
+        .join("<br>");
+      const runCount = observedRunIds.length;
+      const runWord = runCount === 1 ? "run" : "runs";
+
+      lines.push(
+        `| ${rowLabel} | ${links} | Observed (${runCount} ${runWord}) |`,
+      );
+    }
+  }
+
   lines.push("");
   return lines.join("\n");
 }
@@ -405,11 +542,12 @@ async function main() {
     );
   }
 
-  const [scenarios, runIdsByScenarioAndVariant, orgResult] = await Promise.all([
+  const [scenarios, runContext, orgResult] = await Promise.all([
     readScenarioConfig(configPath),
     readRunContextRecords(runContextDir),
     resolveOrgName(),
   ]);
+  const { records: runContextRecords, runIdsByScenarioAndVariant } = runContext;
 
   const recordsFound = [...runIdsByScenarioAndVariant.values()].reduce(
     (count, variants) =>
@@ -433,6 +571,7 @@ async function main() {
     projectName,
     recordsFound,
     runIdsByScenarioAndVariant,
+    runContextRecords,
     scenarios,
     warning: orgResult.warning,
   });
diff --git a/js/package.json b/js/package.json
index 123468198..aad229bb7 100644
--- a/js/package.json
+++ b/js/package.json
@@ -65,6 +65,12 @@
       "require": "./dist/apply-auto-instrumentation.js",
       "default": "./dist/apply-auto-instrumentation.mjs"
     },
+    "./vitest-evals-reporter": {
+      "types": "./dist/vitest-evals-reporter.d.ts",
+      "import": "./dist/vitest-evals-reporter.mjs",
+      "module": "./dist/vitest-evals-reporter.mjs",
+      "require": "./dist/vitest-evals-reporter.js"
+    },
     "./node": {
       "types": "./dist/index.d.ts",
       "import": "./dist/index.mjs",
diff --git a/js/src/wrappers/vitest-evals/reporter.test.ts b/js/src/wrappers/vitest-evals/reporter.test.ts
new file mode 100644
index 000000000..8d4d3c97a
--- /dev/null
+++ b/js/src/wrappers/vitest-evals/reporter.test.ts
@@ -0,0 +1,404 @@
+import { beforeAll, beforeEach, describe, expect, test, vi } from "vitest";
+import BraintrustVitestEvalsReporter from "./reporter";
+import { configureNode } from "../../node/config";
+import {
+  _exportsForTestingOnly,
+  type TestBackgroundLogger,
+} from "../../logger";
+import * as logger from "../../logger";
+
+configureNode();
+
+let backgroundLogger: TestBackgroundLogger;
+
+// One-time setup: reset logger test state, simulate a login, route logs to the
+// test background logger, and stub `initExperiment` so it returns a test
+// experiment.
+beforeAll(async () => {
+  _exportsForTestingOnly.setInitialTestState();
+  await _exportsForTestingOnly.simulateLoginForTests();
+  backgroundLogger = _exportsForTestingOnly.useTestBackgroundLogger();
+
+  const initExperimentSpy = vi.spyOn(logger, "initExperiment");
+  initExperimentSpy.mockImplementation(
+    (projectOrOptions: string | any, options?: any) => {
+      // Accept both call shapes: (project, options) and (options).
+      let experimentOptions: any;
+      let projectName: any;
+      if (typeof projectOrOptions === "string") {
+        experimentOptions = options;
+        projectName = projectOrOptions;
+      } else {
+        experimentOptions = projectOrOptions;
+        projectName =
+          projectOrOptions.project ??
+          projectOrOptions.projectId ??
+          "test-project";
+      }
+
+      return _exportsForTestingOnly.initTestExperiment(
+        experimentOptions?.experiment || "test-experiment",
+        projectName,
+      );
+    },
+  );
+});
+
+// Drains the test background logger before every test and discards the
+// result, presumably so rows logged by one test do not leak into the next.
+beforeEach(async () => {
+  await backgroundLogger.drain();
+});
+
+describe("Braintrust vitest-evals reporter", () => {
+  // A test whose meta has neither `eval` nor `harness` is not reported, so the
+  // reporter returns early (no project option is needed) and no rows are logged.
+  test("does nothing when no eval metadata is present", async () => {
+    const reporter = new BraintrustVitestEvalsReporter();
+
+    await reporter.onTestRunEnd([
+      fakeModule([fakeTest({ meta: {}, name: "plain test" })]),
+    ] as any);
+
+    await backgroundLogger.flush();
+    expect(await backgroundLogger.drain()).toHaveLength(0);
+  });
+
+  // With eval metadata present and no projectName/projectId configured, the
+  // reporter must reject with its configuration error.
+  test("requires a project only when eval cases are reported", async () => {
+    const reporter = new BraintrustVitestEvalsReporter();
+
+    await expect(
+      reporter.onTestRunEnd([
+        fakeModule([
+          fakeTest({
+            meta: { eval: { avgScore: 1 } },
+            name: "eval test",
+          }),
+        ]),
+      ] as any),
+    ).rejects.toThrow("projectName or projectId");
+  });
+
+  // Happy path: one passing eval with scores, usage, artifacts and a two-span
+  // trace. Verifies the root row (input/output/scores/metrics/tags/metadata)
+  // and that normalized spans are typed, linked to their parent, and cannot
+  // have reporter-owned attribute keys overridden by user attributes.
+  test("logs eval metadata, usage metrics, and normalized traces", async () => {
+    const reporter = new BraintrustVitestEvalsReporter({
+      displaySummary: false,
+      experimentName: "vitest-evals-unit-test",
+      projectName: "vitest-evals-tests",
+    });
+    const module = fakeModule([
+      fakeTest({
+        diagnostic: { duration: 125, startTime: 1_700_000_000_000 },
+        fullName: "refund eval > approves refund",
+        location: { line: 42, column: 7 },
+        meta: {
+          eval: {
+            avgScore: 0.9,
+            output: { status: "approved" },
+            scores: [
+              {
+                name: "FactualityJudge",
+                score: 0.8,
+                metadata: { rationale: "close enough" },
+              },
+            ],
+          },
+          harness: {
+            name: "refund-harness",
+            run: {
+              session: {
+                messages: [
+                  { role: "user", content: "Refund invoice inv_123" },
+                  {
+                    role: "assistant",
+                    content: { status: "approved" },
+                  },
+                ],
+              },
+              usage: {
+                inputTokens: 10,
+                outputTokens: 15,
+                reasoningTokens: 2,
+                totalTokens: 27,
+                toolCalls: 1,
+              },
+              artifacts: { invoiceId: "inv_123" },
+              traces: [
+                {
+                  id: "trace-1",
+                  name: "refund trace",
+                  spans: [
+                    {
+                      id: "model-1",
+                      kind: "model",
+                      name: "classify refund",
+                      startedAt: "2026-01-01T00:00:00.000Z",
+                      finishedAt: "2026-01-01T00:00:00.050Z",
+                      // The reserved keys below (name, type, status, ...) are
+                      // expected to be dropped in favour of reporter values.
+                      attributes: {
+                        "custom.attribute": "preserved",
+                        "gen_ai.request.model": "gpt-test",
+                        external_span_id: "user-external-span-id",
+                        name: "user span name",
+                        status: "user-status",
+                        trace_id: "user-trace-id",
+                        type: "custom",
+                        vitest_evals_kind: "custom",
+                      },
+                    },
+                    {
+                      id: "tool-1",
+                      parentId: "model-1",
+                      kind: "tool",
+                      name: "lookupInvoice",
+                      attributes: { "gen_ai.tool.name": "lookupInvoice" },
+                    },
+                  ],
+                },
+              ],
+            },
+          },
+        },
+        name: "approves refund",
+        tags: ["refund", "happy-path"],
+      }),
+    ]);
+
+    await reporter.onTestRunEnd([module] as any);
+    await backgroundLogger.flush();
+    const rows = await backgroundLogger.drain();
+
+    // The root row is identified by the per-scorer score it carries.
+    const root = rows.find((row: any) => row.scores?.FactualityJudge === 0.8);
+    expect(root).toMatchObject({
+      input: {
+        input: "Refund invoice inv_123",
+        test: "refund eval > approves refund",
+      },
+      metrics: {
+        duration_ms: 125,
+        input_tokens: 10,
+        output_tokens: 15,
+        reasoning_tokens: 2,
+        total_tokens: 27,
+        tool_calls: 1,
+      },
+      output: { status: "approved" },
+      scores: {
+        avg_score: 0.9,
+        FactualityJudge: 0.8,
+        pass: 1,
+      },
+      tags: ["refund", "happy-path"],
+    });
+    expect(root?.metadata).toMatchObject({
+      artifacts: { invoiceId: "inv_123" },
+      file: "/repo/evals/refund.eval.ts",
+      harnessName: "refund-harness",
+      location: { line: 42, column: 7 },
+      status: "passed",
+      scoreMetadata: {
+        FactualityJudge: { rationale: "close enough" },
+      },
+    });
+
+    const modelSpan = rows.find(
+      (row: any) => row.span_attributes?.name === "classify refund",
+    );
+    const toolSpan = rows.find(
+      (row: any) => row.span_attributes?.name === "lookupInvoice",
+    );
+
+    expect(modelSpan?.span_attributes).toMatchObject({
+      "custom.attribute": "preserved",
+      "gen_ai.request.model": "gpt-test",
+      name: "classify refund",
+      type: "llm",
+      vitest_evals_kind: "model",
+      trace_id: "trace-1",
+      external_span_id: "model-1",
+    });
+    expect(toolSpan?.span_attributes).toMatchObject({
+      type: "tool",
+      vitest_evals_kind: "tool",
+      external_parent_id: "model-1",
+    });
+    // The tool span must be a child of the model span, not of the root.
+    expect(toolSpan?.span_parents).toEqual([modelSpan?.span_id]);
+  });
+
+  // Failed eval: `pass` must be 0, harness errors and Vitest failure messages
+  // must land in metadata, and the plain-object Vitest error must be logged as
+  // readable text (message plus stack) rather than "[object Object]".
+  test("logs failed eval scores and failure metadata", async () => {
+    const reporter = new BraintrustVitestEvalsReporter({
+      displaySummary: false,
+      projectName: "vitest-evals-tests",
+    });
+
+    await reporter.onTestRunEnd([
+      fakeModule([
+        fakeTest({
+          meta: {
+            eval: {
+              avgScore: 0.4,
+              output: { status: "denied" },
+              scores: [{ name: "StatusJudge", score: 0 }],
+              thresholdFailed: true,
+            },
+            harness: {
+              run: {
+                errors: [{ message: "application run failed" }],
+                session: {
+                  messages: [{ role: "user", content: "Refund inv_bad" }],
+                },
+                usage: {},
+              },
+            },
+          },
+          name: "failed eval",
+          result: {
+            errors: [
+              {
+                message: "expected score to meet threshold",
+                stack: "AssertionError: expected score to meet threshold",
+              },
+            ],
+            state: "failed",
+          },
+        }),
+      ]),
+    ] as any);
+
+    await backgroundLogger.flush();
+    const rows = await backgroundLogger.drain();
+    const root = rows.find((row: any) => row.scores?.StatusJudge === 0);
+
+    expect(root?.scores).toMatchObject({
+      StatusJudge: 0,
+      avg_score: 0.4,
+      pass: 0,
+    });
+    expect(root?.metadata).toMatchObject({
+      errors: [{ message: "application run failed" }],
+      failureMessages: ["expected score to meet threshold"],
+      status: "failed",
+      thresholdFailed: true,
+    });
+    const errorRow = rows.find((row: any) => typeof row.error === "string");
+    expect(errorRow?.error).toContain("expected score to meet threshold");
+    expect(errorRow?.error).toContain(
+      "AssertionError: expected score to meet threshold",
+    );
+    expect(errorRow?.error).not.toContain("[object Object]");
+  });
+
+  // Without normalized traces, tool spans are derived from tool calls: first
+  // from session messages, and from `eval.toolCalls` when the session has none.
+  test("logs fallback tool spans when no normalized traces are present", async () => {
+    const reporter = new BraintrustVitestEvalsReporter({
+      displaySummary: false,
+      projectId: "project-id",
+    });
+
+    await reporter.onTestRunEnd([
+      fakeModule([
+        // Case 1: tool call recorded on an assistant session message.
+        fakeTest({
+          meta: {
+            eval: { avgScore: 1 },
+            harness: {
+              run: {
+                output: "done",
+                session: {
+                  messages: [
+                    {
+                      role: "assistant",
+                      toolCalls: [
+                        {
+                          name: "searchDocs",
+                          arguments: { query: "refunds" },
+                          result: { count: 2 },
+                          durationMs: 12,
+                        },
+                      ],
+                    },
+                  ],
+                },
+                usage: {},
+              },
+            },
+          },
+          name: "tool fallback",
+        }),
+        // Case 2: empty session, tool call only present on the eval meta.
+        fakeTest({
+          meta: {
+            eval: {
+              avgScore: 1,
+              toolCalls: [
+                {
+                  name: "lookupLegacy",
+                  arguments: { id: "legacy" },
+                  result: { ok: true },
+                },
+              ],
+            },
+            harness: {
+              run: {
+                output: "done",
+                session: { messages: [] },
+                usage: {},
+              },
+            },
+          },
+          name: "eval tool fallback",
+        }),
+      ]),
+    ] as any);
+
+    await backgroundLogger.flush();
+    const rows = await backgroundLogger.drain();
+    const toolSpan = rows.find(
+      (row: any) => row.span_attributes?.name === "searchDocs",
+    );
+    const evalToolSpan = rows.find(
+      (row: any) => row.span_attributes?.name === "lookupLegacy",
+    );
+
+    expect(toolSpan).toMatchObject({
+      input: { query: "refunds" },
+      metrics: { duration_ms: 12 },
+      output: { count: 2 },
+      span_attributes: { type: "tool" },
+    });
+    expect(evalToolSpan).toMatchObject({
+      input: { id: "legacy" },
+      output: { ok: true },
+      span_attributes: { type: "tool" },
+    });
+  });
+});
+
+/**
+ * Builds a minimal stand-in for a Vitest test module: `children.allTests()`
+ * yields `tests`, and each test's `module` property is pointed back at it.
+ */
+function fakeModule(tests: any[]) {
+  const module = {
+    children: {
+      *allTests() {
+        yield* tests;
+      },
+    },
+    moduleId: "/repo/evals/refund.eval.ts",
+    relativeModuleId: "evals/refund.eval.ts",
+  };
+
+  tests.forEach((test) => {
+    test.module = module;
+  });
+
+  return module;
+}
+
+/**
+ * Builds a minimal stand-in for a Vitest test case.
+ *
+ * Only `meta` and `name` are required. `fullName` falls back to `name`, and
+ * the remaining fields default to a passed test with a 50 ms duration, a
+ * location of line 1 / column 1 and no tags.
+ */
+function fakeTest(options: {
+  diagnostic?: { duration: number; startTime: number };
+  fullName?: string;
+  location?: { line: number; column: number };
+  meta: Record<string, unknown>;
+  name: string;
+  result?: { state: string; errors?: unknown[] };
+  tags?: string[];
+}) {
+  const {
+    diagnostic = { duration: 50, startTime: 1_700_000_000_000 },
+    fullName,
+    location = { line: 1, column: 1 },
+    meta,
+    name,
+    result = { state: "passed" },
+    tags = [],
+  } = options;
+
+  return {
+    diagnostic: () => diagnostic,
+    fullName: fullName ?? name,
+    id: `test:${name}`,
+    location,
+    meta: () => meta,
+    name,
+    result: () => result,
+    tags,
+  };
+}
diff --git a/js/src/wrappers/vitest-evals/reporter.ts b/js/src/wrappers/vitest-evals/reporter.ts
new file mode 100644
index 000000000..e20eaea3d
--- /dev/null
+++ b/js/src/wrappers/vitest-evals/reporter.ts
@@ -0,0 +1,637 @@
+import type { Reporter, TestCase, TestModule, Vitest } from "vitest/node";
+import { configureNode } from "../../node/config";
+import {
+  initExperiment,
+  logError,
+  type Experiment,
+  type Span,
+} from "../../logger";
+import { SpanTypeAttribute, isObject } from "../../../util";
+import { summarizeAndFlush } from "../shared/flush";
+
+configureNode();
+
+/**
+ * Options accepted by the Braintrust vitest-evals reporter.
+ *
+ * At least one of `projectName` / `projectId` must be set once a run contains
+ * eval cases; otherwise the reporter throws when it creates the experiment.
+ */
+interface BraintrustVitestEvalsReporterOptions {
+  /** Passed to `initExperiment` as `project`; used only when `projectId` is unset. */
+  projectName?: string;
+  /** Passed to `initExperiment` as `projectId`; takes precedence over `projectName`. */
+  projectId?: string;
+  /** Experiment name; defaults to `vitest-evals-<ISO timestamp>`. */
+  experimentName?: string;
+  /** Forwarded to `summarizeAndFlush` as `displaySummary`. */
+  displaySummary?: boolean;
+  /** Forwarded to `initExperiment` as `metadata`. */
+  metadata?: Record<string, unknown>;
+  /** Forwarded to `initExperiment` as `tags`. */
+  tags?: string[];
+  /** Forwarded to `initExperiment` as `baseExperiment`. */
+  baseExperiment?: string;
+  /** Forwarded to `initExperiment` as `baseExperimentId`. */
+  baseExperimentId?: string;
+}
+
+/**
+ * One scorer result read from a test's `eval.scores`. Entries without a
+ * `name` are ignored when scores and score metadata are built.
+ */
+type EvalScore = {
+  name?: string;
+  score?: number | null;
+  metadata?: Record<string, unknown>;
+};
+
+/**
+ * The `eval` portion of a test's task meta as read by this reporter.
+ * `output` takes precedence over the harness run's output, and `toolCalls`
+ * is used for tool spans only when the session messages carry none.
+ */
+type EvalMeta = {
+  scores?: EvalScore[];
+  avgScore?: number | null;
+  output?: unknown;
+  thresholdFailed?: boolean;
+  toolCalls?: ToolCallRecord[];
+};
+
+/** The `harness` portion of a test's task meta: harness name and its run record. */
+type HarnessMeta = {
+  name?: string;
+  run?: HarnessRun;
+};
+
+/**
+ * Task meta relevant to this reporter. A test is treated as an eval when at
+ * least one of `eval` / `harness` is present on its meta.
+ */
+type EvalTaskMeta = {
+  eval?: EvalMeta;
+  harness?: HarnessMeta;
+};
+
+/**
+ * Record of a single harness run attached to a test's meta.
+ *
+ * NOTE: the run object is taken from test meta after only an "is object"
+ * check, so these fields describe the expected shape, not a validated one.
+ */
+type HarnessRun = {
+  // Fallback for the logged output when `eval.output` is undefined.
+  output?: unknown;
+  // Logged as `metadata.session`; the first "user" message's content becomes
+  // the logged input, and message `toolCalls` feed the fallback tool spans.
+  session?: {
+    messages?: Array<{
+      role?: string;
+      content?: unknown;
+      toolCalls?: ToolCallRecord[];
+      metadata?: Record<string, unknown>;
+    }>;
+    provider?: string;
+    model?: string;
+    metadata?: Record<string, unknown>;
+  };
+  // Numeric fields are copied into the root span's metrics (snake_case keys).
+  usage?: {
+    inputTokens?: number;
+    outputTokens?: number;
+    reasoningTokens?: number;
+    totalTokens?: number;
+    toolCalls?: number;
+    retries?: number;
+    provider?: string;
+    model?: string;
+    metadata?: Record<string, unknown>;
+  };
+  // `timings`, `artifacts` and `errors` are copied verbatim into metadata.
+  timings?: Record<string, unknown>;
+  artifacts?: Record<string, unknown>;
+  // When non-empty, traces are logged as child spans instead of tool calls.
+  traces?: NormalizedTrace[];
+  errors?: Array<Record<string, unknown>>;
+};
+
+/**
+ * One tool invocation. Logged as a tool-typed child span when `name` is set:
+ * `arguments` becomes the span input, `result` the output, `durationMs` the
+ * `duration_ms` metric, and `startedAt` / `finishedAt` are parsed as dates.
+ */
+type ToolCallRecord = {
+  id?: string;
+  name?: string;
+  arguments?: unknown;
+  result?: unknown;
+  error?: unknown;
+  startedAt?: string;
+  finishedAt?: string;
+  durationMs?: number;
+  metadata?: Record<string, unknown>;
+};
+
+/**
+ * A trace from a harness run. Its `spans` are logged as child spans; `name`
+ * and `metadata` are attached to each span's metadata, and `id` is the
+ * fallback `trace_id` for spans that do not set their own `traceId`.
+ */
+type NormalizedTrace = {
+  id?: string;
+  name?: string;
+  startedAt?: string;
+  finishedAt?: string;
+  durationMs?: number;
+  metadata?: Record<string, unknown>;
+  spans?: NormalizedSpan[];
+};
+
+/**
+ * One span inside a {@link NormalizedTrace}. `parentId` refers to another
+ * span's `id` within the same trace; spans without a `parentId` are logged
+ * directly under the test's root span. `kind` selects the Braintrust span type.
+ */
+type NormalizedSpan = {
+  id?: string;
+  traceId?: string;
+  parentId?: string;
+  name?: string;
+  kind?: string;
+  startedAt?: string;
+  finishedAt?: string;
+  durationMs?: number;
+  status?: string;
+  error?: unknown;
+  attributes?: Record<string, unknown>;
+  events?: unknown[];
+};
+
+/**
+ * The subset of Vitest's `TestCase` that the reporter reads, plus an optional
+ * `module` reference used only for the file paths written to metadata.
+ */
+type TestLike = Pick<
+  TestCase,
+  | "diagnostic"
+  | "fullName"
+  | "id"
+  | "location"
+  | "meta"
+  | "name"
+  | "result"
+  | "tags"
+> & {
+  module?: Pick<TestModule, "moduleId" | "relativeModuleId">;
+};
+
+/** A test paired with its parsed eval meta, which is undefined for non-eval tests. */
+type EvalTestCandidate = {
+  meta: EvalTaskMeta | undefined;
+  test: TestLike;
+};
+
+/** A candidate that passed `isRunnableEvalTest`: meta is present and the test actually ran. */
+type RunnableEvalTest = {
+  meta: EvalTaskMeta;
+  test: TestLike;
+};
+
+// Attribute keys owned by the reporter. User-supplied span attributes with
+// these keys are dropped before being merged into a logged span's attributes.
+const RESERVED_NORMALIZED_SPAN_ATTRIBUTE_KEYS = new Set([
+  "name",
+  "type",
+  "vitest_evals_kind",
+  "trace_id",
+  "external_span_id",
+  "external_parent_id",
+  "status",
+]);
+
+/**
+ * Vitest reporter that logs `vitest-evals` results to a Braintrust experiment
+ * when a test run ends.
+ *
+ * Tests are selected by their task meta (an `eval` and/or `harness` entry);
+ * skipped and pending tests are ignored. Runs without any eval test do not
+ * create an experiment and therefore need no project configuration.
+ */
+export default class BraintrustVitestEvalsReporter implements Reporter {
+  private experiment?: Experiment;
+
+  constructor(
+    private readonly options: BraintrustVitestEvalsReporterOptions = {},
+  ) {}
+
+  onInit(_vitest: Vitest): void {
+    // Vitest calls this before a run; keeping the hook declares reporter intent
+    // while all data we need is available from onTestRunEnd.
+  }
+
+  /**
+   * Logs every runnable eval test of the finished run as a span of a single
+   * experiment, then summarizes and flushes that experiment.
+   *
+   * @param testModules - Modules of the finished run.
+   * @throws Error if eval tests exist but neither `projectName` nor
+   *   `projectId` was configured.
+   */
+  async onTestRunEnd(testModules: ReadonlyArray<TestModule>): Promise<void> {
+    const evalTests: RunnableEvalTest[] = [];
+    for (const testModule of testModules) {
+      for (const test of testModule.children.allTests()) {
+        const candidate = { test, meta: readEvalTaskMeta(test.meta()) };
+        if (isRunnableEvalTest(candidate)) {
+          evalTests.push(candidate);
+        }
+      }
+    }
+
+    if (evalTests.length === 0) {
+      return;
+    }
+
+    const experiment = this.getOrCreateExperiment();
+
+    try {
+      for (const { test, meta } of evalTests) {
+        logEvalTest(experiment, test, meta);
+      }
+
+      await summarizeAndFlush(experiment, {
+        displaySummary: this.options.displaySummary,
+      });
+    } finally {
+      // Drop the cached experiment even when logging or flushing throws, so a
+      // later run (e.g. in watch mode) starts a fresh experiment instead of
+      // appending to one that was left half-written.
+      this.experiment = undefined;
+    }
+  }
+
+  /**
+   * Returns the experiment for the current run, creating it on first use.
+   *
+   * @throws Error if neither `projectName` nor `projectId` is configured.
+   */
+  private getOrCreateExperiment(): Experiment {
+    if (this.experiment) {
+      return this.experiment;
+    }
+
+    const { projectId, projectName } = this.options;
+    if (!projectId && !projectName) {
+      throw new Error(
+        "Braintrust vitest-evals reporter requires projectName or projectId when eval cases are reported.",
+      );
+    }
+
+    const experimentName =
+      this.options.experimentName ?? `vitest-evals-${new Date().toISOString()}`;
+
+    this.experiment = initExperiment({
+      // `projectId` wins when both identifiers are configured.
+      ...(projectId ? { projectId } : { project: projectName }),
+      experiment: experimentName,
+      metadata: this.options.metadata,
+      tags: this.options.tags,
+      baseExperiment: this.options.baseExperiment,
+      baseExperimentId: this.options.baseExperimentId,
+    });
+
+    return this.experiment;
+  }
+}
+
+/**
+ * Type guard selecting the tests to report: the candidate must carry eval or
+ * harness meta and its result state must be neither "skipped" nor "pending".
+ */
+function isRunnableEvalTest(
+  candidate: EvalTestCandidate,
+): candidate is RunnableEvalTest {
+  if (candidate.meta === undefined) {
+    return false;
+  }
+
+  const { state } = candidate.test.result();
+  return !(state === "skipped" || state === "pending");
+}
+
+/**
+ * Logs one eval test as a root span of `experiment`, including its scores,
+ * metrics, metadata and child spans.
+ *
+ * Child spans come from the harness run's normalized traces when there are
+ * any, otherwise from recorded tool calls. Errors of a failed test are logged
+ * on the root span.
+ *
+ * @param experiment - Experiment receiving the span.
+ * @param test - The finished test.
+ * @param meta - Parsed eval/harness meta of the test.
+ */
+function logEvalTest(
+  experiment: Experiment,
+  test: TestLike,
+  meta: EvalTaskMeta,
+): void {
+  const result = test.result();
+  const diagnostic = test.diagnostic();
+  const run = meta.harness?.run;
+  const output = meta.eval?.output ?? run?.output;
+  const scores = buildScores(result.state, meta.eval);
+  const metrics = buildMetrics(diagnostic?.duration, run);
+  const metadata = buildMetadata(test, meta, run);
+
+  const displayName = test.fullName || test.name;
+  // Computed once and reused for both the span start and the derived end time.
+  const startTime = startTimeSeconds(diagnostic);
+  const durationMs = diagnostic?.duration;
+  // NOTE(review): guarded in case the running Vitest version exposes no `tags`
+  // on test cases — confirm against the supported Vitest range.
+  const tags = test.tags ?? [];
+
+  const rootSpan = experiment.startSpan({
+    name: displayName,
+    spanAttributes: {
+      type: SpanTypeAttribute.EVAL,
+      framework: "vitest",
+      reporter: "vitest-evals",
+    },
+    startTime,
+    event: {
+      input: {
+        test: displayName,
+        input: firstUserMessageContent(run),
+      },
+      ...(output !== undefined ? { output } : {}),
+      scores,
+      metrics,
+      metadata,
+      ...(tags.length > 0 ? { tags } : {}),
+    },
+  });
+
+  if (result.state === "failed") {
+    for (const error of result.errors ?? []) {
+      logReporterError(rootSpan, error);
+    }
+  }
+
+  if (run?.traces?.length) {
+    logNormalizedTraces(rootSpan, run.traces);
+  } else {
+    logToolCallSpans(rootSpan, toolCallsFromMeta(meta.eval, run));
+  }
+
+  // The end time is only known when Vitest reported both start and duration.
+  rootSpan.end({
+    endTime:
+      startTime !== undefined && durationMs !== undefined
+        ? startTime + durationMs / 1000
+        : undefined,
+  });
+}
+
+/**
+ * Builds the score map for a test: `pass` (1 when the state is "passed",
+ * else 0), `avg_score` when the eval meta has a numeric or null average, and
+ * one entry per named scorer whose score is a number or null.
+ */
+function buildScores(
+  state: ReturnType<TestLike["result"]>["state"],
+  evalMeta: EvalMeta | undefined,
+): Record<string, number | null> {
+  const isScoreValue = (value: unknown): value is number | null =>
+    typeof value === "number" || value === null;
+
+  const scores: Record<string, number | null> = {
+    pass: state === "passed" ? 1 : 0,
+  };
+
+  const avgScore = evalMeta?.avgScore;
+  if (isScoreValue(avgScore)) {
+    scores.avg_score = avgScore;
+  }
+
+  for (const { name, score } of evalMeta?.scores ?? []) {
+    if (name && isScoreValue(score)) {
+      scores[name] = score;
+    }
+  }
+
+  return scores;
+}
+
+/**
+ * Builds the metrics of a root span: `duration_ms` when a duration is given,
+ * plus every numeric usage counter of the run under its snake_case name.
+ */
+function buildMetrics(
+  durationMs: number | undefined,
+  run: HarnessRun | undefined,
+): Record<string, unknown> {
+  const usage = run?.usage;
+  const metrics: Record<string, unknown> = {};
+
+  if (durationMs !== undefined) {
+    metrics.duration_ms = durationMs;
+  }
+
+  // Metric name paired with the usage value it is copied from.
+  const usageMetrics: Array<[string, unknown]> = [
+    ["input_tokens", usage?.inputTokens],
+    ["output_tokens", usage?.outputTokens],
+    ["reasoning_tokens", usage?.reasoningTokens],
+    ["total_tokens", usage?.totalTokens],
+    ["tool_calls", usage?.toolCalls],
+    ["retries", usage?.retries],
+  ];
+  for (const [metricName, value] of usageMetrics) {
+    if (typeof value === "number") {
+      metrics[metricName] = value;
+    }
+  }
+
+  return metrics;
+}
+
+/**
+ * Collects the metadata logged on a test's root span: file and test identity,
+ * result status and failure messages, harness details, and per-scorer
+ * metadata. Entries whose value is undefined are omitted.
+ */
+function buildMetadata(
+  test: TestLike,
+  meta: EvalTaskMeta,
+  run: HarnessRun | undefined,
+): Record<string, unknown> {
+  const result = test.result();
+
+  // Only scorers with both a name and metadata contribute an entry.
+  const scoreMetadata = Object.fromEntries(
+    (meta.eval?.scores ?? []).flatMap((score) =>
+      score.name && score.metadata
+        ? [[score.name, score.metadata] as const]
+        : [],
+    ),
+  );
+
+  const candidates: Array<[string, unknown]> = [
+    ["file", test.module?.moduleId],
+    ["relativeFile", test.module?.relativeModuleId],
+    ["fullName", test.fullName],
+    ["testId", test.id],
+    ["location", test.location],
+    ["status", result.state],
+    ["failureMessages", (result.errors ?? []).map(formatErrorMessage)],
+    ["harnessName", meta.harness?.name],
+    ["thresholdFailed", meta.eval?.thresholdFailed],
+    ["session", run?.session],
+    ["artifacts", run?.artifacts],
+    ["timings", run?.timings],
+    ["errors", run?.errors],
+    ["scoreMetadata", scoreMetadata],
+  ];
+
+  return Object.fromEntries(
+    candidates.filter(([, value]) => value !== undefined),
+  );
+}
+
+/**
+ * Logs the spans of each trace as descendants of `rootSpan`, rebuilding the
+ * parent/child hierarchy from `parentId`.
+ *
+ * Spans are scanned in declaration order, so a parent-first list resolves in
+ * a single pass and siblings are created in the order they were recorded.
+ * A span is attached directly to `rootSpan` when it has no `parentId` or when
+ * its `parentId` matches no span id in the trace; its own descendants still
+ * nest beneath it. Spans that can never resolve (parent cycles) are attached
+ * to `rootSpan` as a last resort so that nothing is dropped.
+ *
+ * @param rootSpan - Span of the test that owns the traces.
+ * @param traces - Traces recorded by the harness run.
+ */
+function logNormalizedTraces(rootSpan: Span, traces: NormalizedTrace[]): void {
+  for (const trace of traces) {
+    const spans = trace.spans ?? [];
+
+    // Ids present in this trace; a parentId outside this set can never resolve.
+    const knownIds = new Set<string>();
+    for (const normalized of spans) {
+      if (normalized.id) {
+        knownIds.add(normalized.id);
+      }
+    }
+
+    const spanMap = new Map<string, Span>();
+    const logUnder = (parent: Span, normalized: NormalizedSpan): void => {
+      const span = logNormalizedSpan(parent, normalized, trace);
+      if (normalized.id) {
+        spanMap.set(normalized.id, span);
+      }
+    };
+
+    let pending: NormalizedSpan[] = spans;
+    while (pending.length > 0) {
+      const unresolved: NormalizedSpan[] = [];
+
+      for (const normalized of pending) {
+        const { parentId } = normalized;
+        const parent =
+          parentId === undefined || !knownIds.has(parentId)
+            ? rootSpan
+            : spanMap.get(parentId);
+
+        if (parent) {
+          logUnder(parent, normalized);
+        } else {
+          // Parent exists in the trace but has not been logged yet.
+          unresolved.push(normalized);
+        }
+      }
+
+      if (unresolved.length === pending.length) {
+        // No progress in a full pass: the remaining spans form parent cycles.
+        for (const normalized of unresolved) {
+          logUnder(rootSpan, normalized);
+        }
+        break;
+      }
+
+      pending = unresolved;
+    }
+  }
+}
+
+/**
+ * Starts, populates and ends one child span of `parent` for a normalized
+ * span, and returns it so that later spans can be nested beneath it.
+ *
+ * User attributes are merged first with reserved keys removed; the reporter's
+ * own attributes are then applied on top. Trace name, trace metadata and span
+ * events are logged as metadata when at least one of them is defined.
+ */
+function logNormalizedSpan(
+  parent: Span,
+  normalized: NormalizedSpan,
+  trace: NormalizedTrace,
+): Span {
+  const { kind } = normalized;
+  const spanAttributes = {
+    ...filteredNormalizedSpanAttributes(normalized.attributes),
+    type: spanTypeForNormalizedKind(kind),
+    vitest_evals_kind: kind,
+    trace_id: normalized.traceId ?? trace.id,
+    external_span_id: normalized.id,
+    external_parent_id: normalized.parentId,
+    status: normalized.status,
+  };
+
+  const span = parent.startSpan({
+    name: normalized.name ?? kind ?? "harness span",
+    spanAttributes,
+    startTime: epochSeconds(normalized.startedAt),
+  });
+
+  const definedMetadata = Object.entries({
+    traceName: trace.name,
+    traceMetadata: trace.metadata,
+    events: normalized.events,
+  }).filter(([, value]) => value !== undefined);
+  if (definedMetadata.length > 0) {
+    span.log({ metadata: Object.fromEntries(definedMetadata) });
+  }
+
+  if (normalized.error !== undefined) {
+    logReporterError(span, normalized.error);
+  }
+
+  span.end({ endTime: epochSeconds(normalized.finishedAt) });
+  return span;
+}
+
+/**
+ * Logs each named tool call as a tool-typed child span of `rootSpan`.
+ * Calls without a name are skipped.
+ */
+function logToolCallSpans(rootSpan: Span, calls: ToolCallRecord[]): void {
+  const namedCalls = calls.filter((call) => Boolean(call.name));
+
+  for (const call of namedCalls) {
+    const { durationMs } = call;
+    const metrics =
+      durationMs !== undefined ? { duration_ms: durationMs } : undefined;
+
+    const toolSpan = rootSpan.startSpan({
+      name: call.name,
+      spanAttributes: {
+        type: SpanTypeAttribute.TOOL,
+        tool_call_id: call.id,
+      },
+      startTime: epochSeconds(call.startedAt),
+      event: {
+        input: call.arguments,
+        ...(call.result !== undefined ? { output: call.result } : {}),
+        metadata: call.metadata,
+        metrics,
+      },
+    });
+
+    if (call.error !== undefined) {
+      logReporterError(toolSpan, call.error);
+    }
+    toolSpan.end({ endTime: epochSeconds(call.finishedAt) });
+  }
+}
+
+/**
+ * Parses a test's raw meta into the eval/harness parts this reporter uses.
+ * Returns undefined when the input is not an object or holds neither part.
+ */
+function readEvalTaskMeta(input: unknown): EvalTaskMeta | undefined {
+  if (!isObject(input)) {
+    return undefined;
+  }
+
+  const taskMeta: EvalTaskMeta = {};
+
+  const evalMeta = readEvalMeta(input.eval);
+  if (evalMeta) {
+    taskMeta.eval = evalMeta;
+  }
+
+  const harnessMeta = readHarnessMeta(input.harness);
+  if (harnessMeta) {
+    taskMeta.harness = harnessMeta;
+  }
+
+  return evalMeta || harnessMeta ? taskMeta : undefined;
+}
+
+/**
+ * Parses the `eval` entry of a test's meta, keeping only well-formed fields:
+ * object entries of `scores` / `toolCalls`, a finite-or-null `avgScore`, any
+ * defined `output`, and a boolean `thresholdFailed`.
+ * Returns undefined when the input is not an object.
+ */
+function readEvalMeta(input: unknown): EvalMeta | undefined {
+  if (!isObject(input)) {
+    return undefined;
+  }
+
+  const parsed: EvalMeta = {};
+
+  if (Array.isArray(input.scores)) {
+    parsed.scores = input.scores.map(readEvalScore).filter(isDefined);
+  }
+
+  const avgScore = readFiniteOrNull(input.avgScore);
+  if (avgScore !== undefined) {
+    parsed.avgScore = avgScore;
+  }
+
+  if (input.output !== undefined) {
+    parsed.output = input.output;
+  }
+
+  if (typeof input.thresholdFailed === "boolean") {
+    parsed.thresholdFailed = input.thresholdFailed;
+  }
+
+  if (Array.isArray(input.toolCalls)) {
+    parsed.toolCalls = input.toolCalls.map(readToolCall).filter(isDefined);
+  }
+
+  return parsed;
+}
+
+/**
+ * Parses one entry of `eval.scores`, keeping a string `name`, a
+ * finite-or-null `score` and an object `metadata`.
+ * Returns undefined when the entry is not an object.
+ */
+function readEvalScore(input: unknown): EvalScore | undefined {
+  if (!isObject(input)) {
+    return undefined;
+  }
+
+  const parsed: EvalScore = {};
+
+  if (typeof input.name === "string") {
+    parsed.name = input.name;
+  }
+
+  const score = readFiniteOrNull(input.score);
+  if (score !== undefined) {
+    parsed.score = score;
+  }
+
+  const metadata = input.metadata;
+  if (isObject(metadata)) {
+    parsed.metadata = metadata;
+  }
+
+  return parsed;
+}
+
+/**
+ * Parses the `harness` entry of a test's meta, keeping a string `name` and an
+ * object `run`. The run object is passed through without validating its
+ * fields. Returns undefined when the input is not an object.
+ */
+function readHarnessMeta(input: unknown): HarnessMeta | undefined {
+  if (!isObject(input)) {
+    return undefined;
+  }
+
+  const parsed: HarnessMeta = {};
+
+  if (typeof input.name === "string") {
+    parsed.name = input.name;
+  }
+
+  const run = input.run;
+  if (isObject(run)) {
+    parsed.run = run;
+  }
+
+  return parsed;
+}
+
+/** Accepts a tool call entry as-is when it is an object; otherwise returns undefined. */
+function readToolCall(input: unknown): ToolCallRecord | undefined {
+  return isObject(input) ? input : undefined;
+}
+
+/**
+ * Returns a copy of `attributes` without the keys reserved by the reporter.
+ * A missing attribute object yields an empty object.
+ */
+function filteredNormalizedSpanAttributes(
+  attributes: Record<string, unknown> | undefined,
+): Record<string, unknown> {
+  if (!attributes) {
+    return {};
+  }
+
+  const kept: Array<[string, unknown]> = [];
+  for (const [key, value] of Object.entries(attributes)) {
+    if (!RESERVED_NORMALIZED_SPAN_ATTRIBUTE_KEYS.has(key)) {
+      kept.push([key, value]);
+    }
+  }
+  return Object.fromEntries(kept);
+}
+
+/**
+ * Normalizes a raw score value: null stays null, a finite number is returned
+ * unchanged, and anything else (NaN, infinities, other types) is undefined.
+ */
+function readFiniteOrNull(value: unknown): number | null | undefined {
+  switch (typeof value) {
+    case "number":
+      return Number.isFinite(value) ? value : undefined;
+    case "object":
+      return value === null ? null : undefined;
+    default:
+      return undefined;
+  }
+}
+
+/** Type guard that is false only for `undefined`; null and falsy values pass. */
+function isDefined<T>(value: T | undefined): value is T {
+  return typeof value !== "undefined";
+}
+
+/**
+ * Returns the content of the first session message whose role is "user", or
+ * undefined when there is none.
+ *
+ * The run object comes from test meta without field validation, so a
+ * non-array `messages` value and non-object message entries are tolerated
+ * instead of throwing.
+ */
+function firstUserMessageContent(run: HarnessRun | undefined): unknown {
+  const messages = run?.session?.messages;
+  if (!Array.isArray(messages)) {
+    return undefined;
+  }
+
+  return messages.find(
+    (message) => isObject(message) && message.role === "user",
+  )?.content;
+}
+
+/**
+ * Collects the tool calls recorded on the run's session messages, in message
+ * order.
+ *
+ * The run object comes from test meta without field validation, so a
+ * non-array `messages` value, non-object messages and non-object tool call
+ * entries are skipped rather than crashing here or when the calls are logged.
+ */
+function toolCallsFromRun(run: HarnessRun | undefined): ToolCallRecord[] {
+  const messages = run?.session?.messages;
+  if (!Array.isArray(messages)) {
+    return [];
+  }
+
+  const calls: ToolCallRecord[] = [];
+  for (const message of messages) {
+    if (isObject(message) && Array.isArray(message.toolCalls)) {
+      calls.push(...message.toolCalls.filter((call) => isObject(call)));
+    }
+  }
+  return calls;
+}
+
+/**
+ * Picks the tool calls to log for a test: those found on the run's session
+ * messages when there are any, otherwise the eval meta's `toolCalls` (or an
+ * empty list).
+ */
+function toolCallsFromMeta(
+  evalMeta: EvalMeta | undefined,
+  run: HarnessRun | undefined,
+): ToolCallRecord[] {
+  const sessionCalls = toolCallsFromRun(run);
+  if (sessionCalls.length > 0) {
+    return sessionCalls;
+  }
+  return evalMeta?.toolCalls ?? [];
+}
+
+/**
+ * Maps a normalized span kind to a Braintrust span type: "model" to LLM,
+ * "tool" to TOOL, "agent" and "run" to TASK, and anything else (including
+ * undefined) to FUNCTION.
+ */
+function spanTypeForNormalizedKind(
+  kind: string | undefined,
+): SpanTypeAttribute {
+  if (kind === "model") {
+    return SpanTypeAttribute.LLM;
+  }
+  if (kind === "tool") {
+    return SpanTypeAttribute.TOOL;
+  }
+  if (kind === "agent" || kind === "run") {
+    return SpanTypeAttribute.TASK;
+  }
+  return SpanTypeAttribute.FUNCTION;
+}
+
+/**
+ * Converts a diagnostic's `startTime` (divided by 1000, i.e. treated as
+ * milliseconds) into seconds; undefined when no start time is available.
+ */
+function startTimeSeconds(
+  diagnostic: ReturnType<TestLike["diagnostic"]> | undefined,
+): number | undefined {
+  const startTimeMs = diagnostic?.startTime;
+  if (startTimeMs === undefined) {
+    return undefined;
+  }
+  return startTimeMs / 1000;
+}
+
+/**
+ * Parses a date string with `Date.parse` and returns it as epoch seconds.
+ * Returns undefined for a missing value or an unparseable string.
+ */
+function epochSeconds(value: string | undefined): number | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+
+  const seconds = Date.parse(value) / 1000;
+  return Number.isFinite(seconds) ? seconds : undefined;
+}
+
+/**
+ * Logs an error of unknown shape on `span`.
+ *
+ * `Error` instances go to `logError`. Plain objects with a string `message`
+ * and/or `stack` are logged as text: the message, followed by the stack when
+ * there is one. Every other value is handed to `logError` unchanged.
+ */
+function logReporterError(span: Span, error: unknown): void {
+  if (!(error instanceof Error) && isObject(error)) {
+    const message =
+      typeof error.message === "string" ? error.message : undefined;
+    const stack = typeof error.stack === "string" ? error.stack : undefined;
+
+    const hasText = message !== undefined || stack !== undefined;
+    if (hasText) {
+      let text = message;
+      if (stack) {
+        text = `${message ?? "<error>"}\n\n${stack}`;
+      }
+      span.log({ error: text });
+      return;
+    }
+  }
+
+  logError(span, error);
+}
+
+/**
+ * Formats an error of unknown shape as a single string for metadata.
+ *
+ * Preference order: a string `message`, a string `stack`, the message of an
+ * `Error` instance, a JSON rendering for other objects and arrays, and
+ * finally `String(error)`.
+ *
+ * @param error - Value reported as a test error.
+ * @returns A human-readable description of the error.
+ */
+function formatErrorMessage(error: unknown): string {
+  if (isObject(error)) {
+    if (typeof error.message === "string") return error.message;
+    if (typeof error.stack === "string") return error.stack;
+  }
+  if (error instanceof Error) return error.message;
+
+  if (typeof error === "object" && error !== null) {
+    // `String()` on a plain object yields "[object Object]"; JSON keeps the
+    // error's fields readable.
+    try {
+      const serialized: unknown = JSON.stringify(error);
+      if (typeof serialized === "string") return serialized;
+    } catch {
+      // Not serializable (e.g. circular reference or BigInt); fall through.
+    }
+  }
+  return String(error);
+}
diff --git a/js/tsup.config.ts b/js/tsup.config.ts
index aec965614..9ee12a9d0 100644
--- a/js/tsup.config.ts
+++ b/js/tsup.config.ts
@@ -7,6 +7,7 @@ export default defineConfig([
       index: "src/node/index.ts",
       "apply-auto-instrumentation":
         "src/node/apply-auto-instrumentation-entry.ts",
+      "vitest-evals-reporter": "src/wrappers/vitest-evals/reporter.ts",
     },
     format: ["cjs", "esm"],
     outDir: "dist",