From 0e0006458028388570a2efc1142b9aff11a2474f Mon Sep 17 00:00:00 2001
From: Luca Forstner <luca.forstner@gmail.com>
Date: Fri, 19 Jun 2026 16:54:35 +0200
Subject: [PATCH 1/3] feat: Add scorer that exposes helpers to evaluate agents

---
 js/src/agent-assertions.test.ts | 138 ++++++++
 js/src/agent-assertions.ts      | 553 ++++++++++++++++++++++++++++++++
 js/src/exports.ts               |   2 +
 3 files changed, 693 insertions(+)
 create mode 100644 js/src/agent-assertions.test.ts
 create mode 100644 js/src/agent-assertions.ts

diff --git a/js/src/agent-assertions.test.ts b/js/src/agent-assertions.test.ts
new file mode 100644
index 000000000..3c4504d6d
--- /dev/null
+++ b/js/src/agent-assertions.test.ts
@@ -0,0 +1,138 @@
+import { beforeAll, expect, test, vi } from "vitest";
+import { z } from "zod";
+
+import { agentAssertionScorer, Eval } from "./exports";
+import { configureNode } from "./node/config";
+import type { AgentAssertionScoreMetadata } from "./agent-assertions";
+import type { Trace } from "./trace";
+import type { Score } from "../util";
+
+beforeAll(() => {
+  configureNode();
+});
+
+test("agentAssertionScorer emits one score with assertion metadata", async () => {
+  const scorer = agentAssertionScorer<
+    string,
+    { answer: string; count: number },
+    { answer: string }
+  >(
+    ({ output, expected, assert }) => [
+      assert.equals(output.answer, expected.answer, "answer matches"),
+      assert.equals(output.count, 3, "count is three"),
+      assert.contains(output.answer, /hi/i, "answer contains greeting"),
+      assert.matches(
+        output,
+        z.object({ answer: z.string(), count: z.number() }),
+        "output schema",
+      ),
+    ],
+    { name: "agent_contract" },
+  );
+  // count is 2 below, so "count is three" is the single failing assertion.
+  const score = (await scorer({
+    input: "hello",
+    expected: { answer: "hi" },
+    output: { answer: "hi", count: 2 },
+    metadata: {},
+  })) as Score;
+  // 3 of 4 assertions pass; only the failing one contributes a message.
+  expect(score.name).toBe("agent_contract");
+  expect(score.score).toBe(0.75);
+  expect(score.metadata).toEqual({
+    assertions: [
+      { name: "answer matches", passed: true },
+      { name: "count is three", passed: false },
+      { name: "answer contains greeting", passed: true },
+      { name: "output schema", passed: true },
+    ],
+    failed: ["count is three: expected 2 to equal 3"],
+  } satisfies AgentAssertionScoreMetadata);
+});
+
+test("agentAssertionScorer evaluates trace-backed tool assertions after collection", async () => {
+  const getSpans = vi.fn().mockResolvedValue([
+    {
+      input: { city: "Brooklyn" },
+      output: { forecast: "72F and sunny" },
+      span_attributes: { type: "tool", name: "tool: get_weather" },
+    },
+    {
+      input: { city: "Brooklyn" },
+      output: { source: "cache" },
+      span_attributes: { type: "tool", name: "lookup_cache" },
+    },
+  ]);
+  const trace: Trace = {
+    getConfiguration: () => ({
+      object_type: "experiment",
+      object_id: "experiment-id",
+      root_span_id: "root-span-id",
+    }),
+    getSpans,
+    getThread: vi.fn(),
+  };
+  const callbackOrder: string[] = [];
+  const scorer = agentAssertionScorer(({ assert }) => {
+    callbackOrder.push("callback");
+    return [
+      assert.calledTool("get_weather", {
+        input: { city: /Brook/ },
+        output: { forecast: /sunny/ },
+        times: 1,
+      }),
+      assert.calledTool("charge_card"),
+      assert.notCalledTool("refund_customer"),
+      assert.toolOrder(["get_weather", "lookup_cache"]),
+      assert.maxToolCalls(2),
+    ];
+  });
+  // Only charge_card is absent from the mocked spans, so 4 of 5 assertions pass.
+  const score = (await scorer({
+    input: "weather",
+    output: "done",
+    metadata: {},
+    trace,
+  })) as Score;
+  // The callback ran once and spans were requested with the tool-type filter.
+  expect(callbackOrder).toEqual(["callback"]);
+  expect(getSpans).toHaveBeenCalledWith({ spanType: ["tool"] });
+  expect(score.name).toBe("assertions");
+  expect(score.score).toBe(0.8);
+  expect(score.metadata).toEqual({
+    assertions: [
+      { name: "called tool get_weather", passed: true },
+      { name: "called tool charge_card", passed: false },
+      { name: "did not call tool refund_customer", passed: true },
+      { name: "tool order", passed: true },
+      { name: "at most 2 tool calls", passed: true },
+    ],
+    failed: [
+      'called tool charge_card: expected tool "charge_card" to be called; found 0 matching calls',
+    ],
+  } satisfies AgentAssertionScoreMetadata);
+});
+
+test("agentAssertionScorer works as an Eval scorer", async () => {
+  const result = await Eval(
+    "agent assertions",
+    {
+      data: [
+        { input: "hello", expected: "hello world" },
+        { input: "bye", expected: "bye world" },
+      ] as const,
+      task: (input) => `${input} world` as const,
+      scores: [
+        agentAssertionScorer(({ output, expected, assert }) => [
+          assert.equals(output, expected, "output matches expected"),
+          assert.contains(output, "world", "output contains world"),
+        ]),
+      ],
+    },
+    { noSendLogs: true },
+  );
+  // Both rows satisfy both assertions, so each row and the summary score 1.
+  expect(result.results[0].scores.assertions).toBe(1);
+  expect(result.results[1].scores.assertions).toBe(1);
+  expect(result.summary.scores.assertions.score).toBe(1);
+});
diff --git a/js/src/agent-assertions.ts b/js/src/agent-assertions.ts
new file mode 100644
index 000000000..b4edc597d
--- /dev/null
+++ b/js/src/agent-assertions.ts
@@ -0,0 +1,553 @@
+import type { EvalScorer, EvalScorerArgs } from "./framework";
+import type { SpanData } from "./trace";
+import type { BaseMetadata, DefaultMetadataType } from "./logger";
+import type { Score } from "../util";
+
+type MaybePromise<T> = T | Promise<T>;
+
+type AssertionMatcher =
+  | RegExp
+  | ((value: unknown) => boolean)
+  | readonly AssertionMatcher[]
+  | { [key: string]: AssertionMatcher }
+  | string
+  | number
+  | boolean
+  | null
+  | undefined;
+
+interface ToolCallAssertionOptions {
+  /**
+   * Match against the tool call input. Objects are matched partially, regular
+   * expressions are matched against stringified values, and functions are
+   * treated as predicates.
+   */
+  input?: AssertionMatcher;
+  /**
+   * Match against the tool call output. Objects are matched partially, regular
+   * expressions are matched against stringified values, and functions are
+   * treated as predicates.
+   */
+  output?: AssertionMatcher;
+  /** If set, require the matching tool call to have, or not have, an error. */
+  isError?: boolean;
+  /** If set, require exactly this many matching calls. */
+  times?: number;
+}
+
+interface AgentAssertion {
+  name: string;
+  evaluate: (resources: AgentAssertionResources) => MaybePromise<{
+    passed: boolean;
+    failure?: string;
+  }>;
+  requiresTrace?: boolean;
+}
+
+interface AgentAssertionHelpers {
+  /**
+   * Assert that two values are deeply equal.
+   *
+   * @param actual - The value produced by the task or derived by the scorer.
+   * @param expected - The value to compare against.
+   * @param name - Optional assertion name. Defaults to `"equals"`.
+   */
+  equals: (actual: unknown, expected: unknown, name?: string) => AgentAssertion;
+  /**
+   * Assert that two values are not deeply equal.
+   *
+   * @param actual - The value produced by the task or derived by the scorer.
+   * @param expected - The value that should not be returned.
+   * @param name - Optional assertion name. Defaults to `"not equals"`.
+   */
+  notEquals: (
+    actual: unknown,
+    expected: unknown,
+    name?: string,
+  ) => AgentAssertion;
+  /**
+   * Assert that a stringified value contains a substring or matches a regular
+   * expression.
+   *
+   * @param value - The value to inspect.
+   * @param expected - The substring or regular expression to find.
+   * @param name - Optional assertion name. Defaults to `"contains"`.
+   */
+  contains: (
+    value: unknown,
+    expected: string | RegExp,
+    name?: string,
+  ) => AgentAssertion;
+  /**
+   * Assert that a value matches a schema.
+   *
+   * Supports schemas with `safeParse` or `parse`, such as Zod, and Standard
+   * Schema `~standard.validate`, used by libraries such as Valibot and ArkType.
+   *
+   * @param value - The value to validate.
+   * @param schema - The schema to validate against.
+   * @param name - Optional assertion name. Defaults to `"matches schema"`.
+   */
+  matches: (
+    value: unknown,
+    schema: SchemaLike,
+    name?: string,
+  ) => AgentAssertion;
+  /**
+   * Assert that a tool was called, optionally constrained by input, output,
+   * error state, or exact call count.
+   *
+   * @param toolName - The tool name to find in trace spans.
+   * @param options - Optional constraints for matching calls.
+   * @param name - Optional assertion name. Defaults to `"called tool ${toolName}"`.
+   */
+  calledTool: (
+    toolName: string,
+    options?: ToolCallAssertionOptions,
+    name?: string,
+  ) => AgentAssertion;
+  /**
+   * Assert that a tool was not called.
+   *
+   * @param toolName - The tool name to reject in trace spans.
+   * @param name - Optional assertion name. Defaults to `"did not call tool ${toolName}"`.
+   */
+  notCalledTool: (toolName: string, name?: string) => AgentAssertion;
+  /**
+   * Assert that tools were called in the given relative order.
+   *
+   * The tools do not need to be adjacent. Extra tool calls between them are
+   * allowed.
+   *
+   * @param toolNames - The ordered list of tool names to find.
+   * @param name - Optional assertion name. Defaults to `"tool order"`.
+   */
+  toolOrder: (toolNames: string[], name?: string) => AgentAssertion;
+  /**
+   * Assert that the task made no tool calls.
+   *
+   * @param name - Optional assertion name. Defaults to `"used no tools"`.
+   */
+  usedNoTools: (name?: string) => AgentAssertion;
+  /**
+   * Assert that the task made no more than `max` tool calls.
+   *
+   * @param max - The maximum number of allowed tool calls.
+   * @param name - Optional assertion name. Defaults to `"at most ${max} tool calls"`.
+   */
+  maxToolCalls: (max: number, name?: string) => AgentAssertion;
+}
+
+type AgentAssertionScorerCallback<
+  Input,
+  Output,
+  Expected,
+  Metadata extends BaseMetadata = DefaultMetadataType,
+> = (
+  args: AgentAssertionScorerCallbackArgs<Input, Output, Expected, Metadata>,
+) => MaybePromise<AgentAssertion[]>;
+
+type AgentAssertionScorerCallbackArgs<
+  Input,
+  Output,
+  Expected,
+  Metadata extends BaseMetadata = DefaultMetadataType,
+> = {
+  input: Input;
+  output: Output;
+  /** Helpers for building assertions from Eval inputs, outputs, and traces. */
+  assert: AgentAssertionHelpers;
+} & (Expected extends void
+  ? { expected?: undefined }
+  : { expected: Expected }) &
+  (Metadata extends void ? { metadata?: undefined } : { metadata: Metadata });
+
+interface AgentAssertionResources {
+  spans?: SpanData[];
+}
+
+type SchemaLike =
+  | {
+      safeParse: (value: unknown) => {
+        success: boolean;
+        error?: unknown;
+      };
+    }
+  | {
+      parse: (value: unknown) => unknown;
+    }
+  | {
+      "~standard": {
+        validate: (value: unknown) => MaybePromise<unknown>;
+      };
+    };
+
+/**
+ * Create an Eval scorer that will evaluate an agent based on assertions on the
+ * generated trace.
+ *
+ * The callback receives `input`, `output`, `expected`, `metadata` plus an `assert`
+ * helper object. It should return the assertions to evaluate the agent against.
+ *
+ * **Important**: Tool-call assertions require Braintrust tracing to be set up
+ * during the Eval so the scorer can read tool spans from the trace.
+ *
+ * The score emitted by this scorer is the fraction of assertions that passed. If there are no
+ * assertions, the score is `1`. The score metadata includes every assertion's name and
+ * pass/fail state, plus human-readable failure messages.
+ *
+ * @example
+ * ```ts
+ * import { Eval, agentAssertionScorer } from "braintrust";
+ *
+ * await Eval("agent-eval", {
+ *   data: () => [{ input: "What is the capital of Estonia?" }],
+ *   task: () => ({ answer: `Tallinn is the capital of Estonia. ${input}` }),
+ *   scores: [
+ *     agentAssertionScorer(({ output, assert }) => [
+ *       assert.contains(output.answer, /Tallinn/i, "mentions Tallinn"),
+ *       assert.calledTool("web_search", { times: 1 }, "searched once"),
+ *       assert.maxToolCalls(3, "bounded tool use"),
+ *     ]),
+ *   ],
+ * });
+ * ```
+ */
+export function agentAssertionScorer<
+  Input,
+  Output,
+  Expected = void,
+  Metadata extends BaseMetadata = DefaultMetadataType,
+>(
+  callback: AgentAssertionScorerCallback<Input, Output, Expected, Metadata>,
+  options: {
+    /** The score name to emit. Defaults to `"assertions"`. */
+    name?: string;
+  } = {},
+): EvalScorer<Input, Output, Expected, Metadata> {
+  return async (args) => {
+    const callbackArgs = {
+      input: args.input,
+      output: args.output,
+      ...("expected" in args ? { expected: args.expected } : {}),
+      ...("metadata" in args ? { metadata: args.metadata } : {}),
+      assert: agentAssertionHelpers,
+    } as AgentAssertionScorerCallbackArgs<Input, Output, Expected, Metadata>;
+    const assertions = await callback(callbackArgs);
+    const resources: AgentAssertionResources = {};
+    if (assertions.some((assertion) => assertion.requiresTrace)) {
+      resources.spans = await args.trace?.getSpans({ spanType: ["tool"] });
+    }
+    // NOTE(review): if args.trace is missing, tool helpers run against zero spans.
+    const results = await Promise.all(
+      assertions.map(async (assertion) => {
+        const result = await assertion.evaluate(resources);
+        return {
+          name: assertion.name,
+          passed: result.passed,
+          failure: result.failure,
+        };
+      }),
+    );
+    // Score is the passed fraction; failure strings are kept for the metadata.
+    const passed = results.filter((result) => result.passed).length;
+    const total = results.length;
+    const failed = results
+      .filter((result) => !result.passed)
+      .map(
+        (result) =>
+          `${result.name}: ${result.failure ?? "assertion did not pass"}`,
+      );
+    // An empty assertion list scores 1 rather than dividing by zero.
+    return {
+      name: options.name ?? "assertions",
+      score: total === 0 ? 1 : passed / total,
+      metadata: {
+        assertions: results.map(({ name, passed }) => ({ name, passed })),
+        failed,
+      },
+    } satisfies Score;
+  };
+}
+
+/** Assertion builders handed to scorer callbacks as `assert`. */
+const agentAssertionHelpers: AgentAssertionHelpers = {
+  equals: (actual, expected, name = "equals") => ({
+    name,
+    evaluate: () => {
+      const passed = deepEqual(actual, expected);
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected ${formatValue(actual)} to equal ${formatValue(expected)}`,
+      };
+    },
+  }),
+  notEquals: (actual, expected, name = "not equals") => ({
+    name,
+    evaluate: () => {
+      const passed = !deepEqual(actual, expected);
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected ${formatValue(actual)} not to equal ${formatValue(expected)}`,
+      };
+    },
+  }),
+  contains: (value, expected, name = "contains") => ({
+    name,
+    evaluate: () => {
+      const text = String(value);
+      if (expected instanceof RegExp) expected.lastIndex = 0; // g/y regexes keep state
+      const passed =
+        expected instanceof RegExp
+          ? expected.test(text)
+          : text.includes(expected);
+      const failure = passed
+        ? undefined
+        : `expected ${formatValue(value)} to contain ${formatValue(expected)}`;
+      return { passed, failure };
+    },
+  }),
+  matches: (value, schema, name = "matches schema") => ({
+    name,
+    evaluate: async () => {
+      const result = await validateSchema(schema, value);
+      return {
+        passed: result.passed,
+        failure: result.passed
+          ? undefined
+          : `expected value to match schema: ${result.message}`,
+      };
+    },
+  }),
+  calledTool: (toolName, options = {}, name = `called tool ${toolName}`) => ({
+    name,
+    requiresTrace: true,
+    evaluate: ({ spans }) => {
+      const calls = matchingToolCalls(spans ?? [], toolName, options);
+      const passed =
+        options.times === undefined
+          ? calls.length > 0
+          : calls.length === options.times;
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : options.times === undefined
+            ? `expected tool "${toolName}" to be called; found ${calls.length} matching call${calls.length === 1 ? "" : "s"}`
+            : `expected tool "${toolName}" to be called ${options.times} time${options.times === 1 ? "" : "s"}; found ${calls.length} matching call${calls.length === 1 ? "" : "s"}`,
+      };
+    },
+  }),
+  notCalledTool: (toolName, name = `did not call tool ${toolName}`) => ({
+    name,
+    requiresTrace: true,
+    evaluate: ({ spans }) => {
+      const calls = toolCalls(spans ?? []).filter(
+        (span) => getToolName(span) === toolName,
+      );
+      const passed = calls.length === 0;
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected tool "${toolName}" not to be called; found ${calls.length} call${calls.length === 1 ? "" : "s"}`,
+      };
+    },
+  }),
+  toolOrder: (toolNames, name = "tool order") => ({
+    name,
+    requiresTrace: true,
+    evaluate: ({ spans }) => {
+      const observed = toolCalls(spans ?? [])
+        .map(getToolName)
+        .filter((toolName) => toolName !== undefined);
+      let fromIndex = 0;
+      const passed = toolNames.every((toolName) => {
+        const index = observed.indexOf(toolName, fromIndex);
+        if (index === -1) return false;
+        fromIndex = index + 1;
+        return true;
+      });
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected tool order ${toolNames.join(" -> ")}; observed ${observed.join(" -> ") || "no tools"}`,
+      };
+    },
+  }),
+  usedNoTools: (name = "used no tools") => ({
+    name,
+    requiresTrace: true,
+    evaluate: ({ spans }) => {
+      const calls = toolCalls(spans ?? []);
+      const passed = calls.length === 0;
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected no tool calls; found ${calls.length}`,
+      };
+    },
+  }),
+  maxToolCalls: (max, name = `at most ${max} tool calls`) => ({
+    name,
+    requiresTrace: true,
+    evaluate: ({ spans }) => {
+      const calls = toolCalls(spans ?? []);
+      const passed = calls.length <= max;
+      return {
+        passed,
+        failure: passed
+          ? undefined
+          : `expected at most ${max} tool call${max === 1 ? "" : "s"}; found ${calls.length}`,
+      };
+    },
+  }),
+};
+
+/**
+ * Validate via `safeParse`, else `parse`, else Standard Schema
+ * `~standard.validate`; thrown errors become failures, never rethrown.
+ */
+async function validateSchema(
+  schema: SchemaLike,
+  value: unknown,
+): Promise<{ passed: boolean; message: string }> {
+  const ok = { passed: true, message: "" };
+  const fail = (message: string) => ({ passed: false, message });
+  try {
+    if ("safeParse" in schema) {
+      const parsed = schema.safeParse(value);
+      return parsed.success ? ok : fail(formatSchemaError(parsed.error));
+    }
+    if ("parse" in schema) {
+      schema.parse(value);
+      return ok;
+    }
+    const outcome = await schema["~standard"].validate(value);
+    const issues =
+      typeof outcome === "object" && outcome !== null && "issues" in outcome
+        ? outcome.issues
+        : undefined;
+    const hasIssues = Array.isArray(issues) && issues.length > 0;
+    return hasIssues ? fail(formatValue(issues)) : ok;
+  } catch (e) {
+    return fail(formatSchemaError(e));
+  }
+}
+
+function toolCalls(spans: SpanData[]) {
+  return spans.filter(({ span_attributes }) => span_attributes?.type === "tool");
+}
+
+/**
+ * Select the tool spans named `toolName` that satisfy every constraint set in
+ * `options`. Constraints left `undefined` are ignored.
+ *
+ * `options.times` is not applied here; callers compare it to the result
+ * length.
+ *
+ * @param spans - Candidate spans; non-tool spans are dropped first.
+ * @param toolName - Name compared against the normalized span name.
+ * @param options - Optional `input`, `output`, and `isError` constraints.
+ * @returns The spans that pass the name check and all set constraints.
+ */
+function matchingToolCalls(
+  spans: SpanData[],
+  toolName: string,
+  options: ToolCallAssertionOptions,
+) {
+  const { input, output, isError } = options;
+  return toolCalls(spans).filter(
+    (span) =>
+      // Name first, so matchers and predicates only run on the named tool.
+      getToolName(span) === toolName &&
+      (input === undefined || matchesValue(span.input, input)) &&
+      (output === undefined || matchesValue(span.output, output)) &&
+      // `error` is coerced, so any truthy value counts as an error.
+      (isError === undefined || Boolean(span.error) === isError),
+  );
+}
+
+/** Resolve a span's tool name, dropping a leading `"tool:"` label if present. */
+function getToolName(span: SpanData) {
+  const labelPrefix = "tool:";
+  // Prefer span_attributes.name; fall back to the span's own name.
+  const attrName = span.span_attributes?.name;
+  let rawName: string | undefined;
+  if (typeof attrName === "string") rawName = attrName;
+  else if (typeof span.name === "string") rawName = span.name;
+  if (!rawName) return undefined;
+  if (!rawName.startsWith(labelPrefix)) return rawName;
+  return rawName.slice(labelPrefix.length).trim();
+}
+
+/** Match `actual` against a regex, predicate, partial object, or exact value. */
+function matchesValue(actual: unknown, matcher: AssertionMatcher): boolean {
+  if (matcher instanceof RegExp) {
+    matcher.lastIndex = 0; // global/sticky regexes carry state between calls
+    return matcher.test(String(actual));
+  }
+  if (typeof matcher === "function") return matcher(actual);
+  if (isPlainObject(matcher) && isPlainObject(actual)) {
+    return Object.entries(matcher).every(([key, expected]) =>
+      matchesValue(actual[key], expected),
+    );
+  }
+  return deepEqual(actual, matcher);
+}
+
+/** Structural equality for primitives, arrays, and plain objects. */
+function deepEqual(left: unknown, right: unknown): boolean {
+  if (Object.is(left, right)) return true;
+  if (Array.isArray(left) && Array.isArray(right)) {
+    return (
+      left.length === right.length &&
+      left.every((item, index) => deepEqual(item, right[index]))
+    );
+  }
+  if (!isPlainObject(left) || !isPlainObject(right)) return false;
+  const leftKeys = Object.keys(left);
+  if (leftKeys.length !== Object.keys(right).length) return false;
+  // Keys must exist on both sides: { a: undefined } is not { b: undefined }.
+  const hasOwn = Object.prototype.hasOwnProperty;
+  return leftKeys.every(
+    (key) => hasOwn.call(right, key) && deepEqual(left[key], right[key]),
+  );
+}
+
+/** True for non-array objects whose prototype is exactly `Object.prototype`. */
+function isPlainObject(value: unknown): value is Record<string, unknown> {
+  if (typeof value !== "object" || value === null || Array.isArray(value)) {
+    return false;
+  }
+  // Class instances, Maps, Dates, and null-prototype objects are rejected.
+  return Object.getPrototypeOf(value) === Object.prototype;
+}
+
+/**
+ * Render a schema failure: an Error's message, or the formatted raw value.
+ */
+function formatSchemaError(error: unknown) {
+  return error instanceof Error ? error.message : formatValue(error);
+}
+
+/**
+ * Render a value for failure messages: regexes in literal form, strings
+ * JSON-quoted, everything else as JSON with `String()` as the fallback.
+ */
+function formatValue(value: unknown) {
+  if (value instanceof RegExp) return value.toString();
+  if (typeof value === "string") return JSON.stringify(value);
+  try {
+    // JSON.stringify yields undefined for undefined, functions, and symbols.
+    return JSON.stringify(value) ?? String(value);
+  } catch {
+    return String(value);
+  }
+}
diff --git a/js/src/exports.ts b/js/src/exports.ts
index 54cd03e2a..dfc759134 100644
--- a/js/src/exports.ts
+++ b/js/src/exports.ts
@@ -230,6 +230,8 @@ export {
   defaultErrorScoreHandler,
 } from "./framework";
 
+export { agentAssertionScorer } from "./agent-assertions";
+
 export { DatasetPipeline } from "./dataset-pipeline";
 
 export type {

From 1e9dc26b849b2d2a4cc06066a0fa56adf3bf0b95 Mon Sep 17 00:00:00 2001
From: Luca Forstner <luca.forstner@gmail.com>
Date: Fri, 19 Jun 2026 17:07:05 +0200
Subject: [PATCH 2/3] chore: Add changeset for agent assertion scorer

---
 .changeset/sweet-pears-pay.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/sweet-pears-pay.md

diff --git a/.changeset/sweet-pears-pay.md b/.changeset/sweet-pears-pay.md
new file mode 100644
index 000000000..8e07ed991
--- /dev/null
+++ b/.changeset/sweet-pears-pay.md
@@ -0,0 +1,5 @@
+---
+"braintrust": minor
+---
+
+feat: Add scorer that exposes helpers to evaluate agents

From 64362eb7dbaebf0f8e8d34a86ad736f6ccd456a4 Mon Sep 17 00:00:00 2001
From: Luca Forstner <luca.forstner@gmail.com>
Date: Fri, 19 Jun 2026 17:29:42 +0200
Subject: [PATCH 3/3] fix: Derive scorer callback args from EvalScorerArgs and fix doc example

---
 js/src/agent-assertions.ts | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/js/src/agent-assertions.ts b/js/src/agent-assertions.ts
index b4edc597d..635bf351e 100644
--- a/js/src/agent-assertions.ts
+++ b/js/src/agent-assertions.ts
@@ -144,24 +144,12 @@ type AgentAssertionScorerCallback<
   Expected,
   Metadata extends BaseMetadata = DefaultMetadataType,
 > = (
-  args: AgentAssertionScorerCallbackArgs<Input, Output, Expected, Metadata>,
+  args: Omit<EvalScorerArgs<Input, Output, Expected, Metadata>, "trace"> & {
+    /** Helpers for building assertions from Eval inputs, outputs, and traces. */
+    assert: AgentAssertionHelpers;
+  },
 ) => MaybePromise<AgentAssertion[]>;
 
-type AgentAssertionScorerCallbackArgs<
-  Input,
-  Output,
-  Expected,
-  Metadata extends BaseMetadata = DefaultMetadataType,
-> = {
-  input: Input;
-  output: Output;
-  /** Helpers for building assertions from Eval inputs, outputs, and traces. */
-  assert: AgentAssertionHelpers;
-} & (Expected extends void
-  ? { expected?: undefined }
-  : { expected: Expected }) &
-  (Metadata extends void ? { metadata?: undefined } : { metadata: Metadata });
-
 interface AgentAssertionResources {
   spans?: SpanData[];
 }
@@ -202,7 +190,7 @@ type SchemaLike =
  *
  * await Eval("agent-eval", {
  *   data: () => [{ input: "What is the capital of Estonia?" }],
- *   task: () => ({ answer: `Tallinn is the capital of Estonia. ${input}` }),
+ *   task: async () => ({ answer: "Tallinn is the capital of Estonia." }),
  *   scores: [
  *     agentAssertionScorer(({ output, assert }) => [
  *       assert.contains(output.answer, /Tallinn/i, "mentions Tallinn"),
@@ -226,14 +214,11 @@ export function agentAssertionScorer<
   } = {},
 ): EvalScorer<Input, Output, Expected, Metadata> {
   return async (args) => {
-    const callbackArgs = {
-      input: args.input,
-      output: args.output,
-      ...("expected" in args ? { expected: args.expected } : {}),
-      ...("metadata" in args ? { metadata: args.metadata } : {}),
+    const { trace: _trace, ...callbackArgs } = args;
+    const assertions = await callback({
+      ...callbackArgs,
       assert: agentAssertionHelpers,
-    } as AgentAssertionScorerCallbackArgs<Input, Output, Expected, Metadata>;
-    const assertions = await callback(callbackArgs);
+    });
     const resources: AgentAssertionResources = {};
     if (assertions.some((assertion) => assertion.requiresTrace)) {
       resources.spans = await args.trace?.getSpans({ spanType: ["tool"] });