From 0e0006458028388570a2efc1142b9aff11a2474f Mon Sep 17 00:00:00 2001 From: Luca Forstner Date: Fri, 19 Jun 2026 16:54:35 +0200 Subject: [PATCH 1/3] feat: Add scorer that exposes helpers to evaluate agents --- js/src/agent-assertions.test.ts | 138 ++++++++ js/src/agent-assertions.ts | 553 ++++++++++++++++++++++++++++++++ js/src/exports.ts | 2 + 3 files changed, 693 insertions(+) create mode 100644 js/src/agent-assertions.test.ts create mode 100644 js/src/agent-assertions.ts diff --git a/js/src/agent-assertions.test.ts b/js/src/agent-assertions.test.ts new file mode 100644 index 000000000..3c4504d6d --- /dev/null +++ b/js/src/agent-assertions.test.ts @@ -0,0 +1,138 @@ +import { beforeAll, expect, test, vi } from "vitest"; +import { z } from "zod"; + +import { agentAssertionScorer, Eval } from "./exports"; +import { configureNode } from "./node/config"; +import type { AgentAssertionScoreMetadata } from "./agent-assertions"; +import type { Trace } from "./trace"; +import type { Score } from "../util"; + +beforeAll(() => { + configureNode(); +}); + +test("agentAssertionScorer emits one score with assertion metadata", async () => { + const scorer = agentAssertionScorer< + string, + { answer: string; count: number }, + { answer: string } + >( + ({ output, expected, assert }) => [ + assert.equals(output.answer, expected.answer, "answer matches"), + assert.equals(output.count, 3, "count is three"), + assert.contains(output.answer, /hi/i, "answer contains greeting"), + assert.matches( + output, + z.object({ answer: z.string(), count: z.number() }), + "output schema", + ), + ], + { name: "agent_contract" }, + ); + + const score = (await scorer({ + input: "hello", + expected: { answer: "hi" }, + output: { answer: "hi", count: 2 }, + metadata: {}, + })) as Score; + + expect(score.name).toBe("agent_contract"); + expect(score.score).toBe(0.75); + expect(score.metadata).toEqual({ + assertions: [ + { name: "answer matches", passed: true }, + { name: "count is three", passed: false }, + { name: "answer contains greeting", passed: true }, + { name: "output schema", passed: true }, + ], + failed: ["count is three: expected 2 to equal 3"], + } satisfies AgentAssertionScoreMetadata); +}); + +test("agentAssertionScorer evaluates trace-backed tool assertions after collection", async () => { + const getSpans = vi.fn().mockResolvedValue([ + { + input: { city: "Brooklyn" }, + output: { forecast: "72F and sunny" }, + span_attributes: { type: "tool", name: "tool: get_weather" }, + }, + { + input: { city: "Brooklyn" }, + output: { source: "cache" }, + span_attributes: { type: "tool", name: "lookup_cache" }, + }, + ]); + const trace: Trace = { + getConfiguration: () => ({ + object_type: "experiment", + object_id: "experiment-id", + root_span_id: "root-span-id", + }), + getSpans, + getThread: vi.fn(), + }; + const callbackOrder: string[] = []; + const scorer = agentAssertionScorer(({ assert }) => { + callbackOrder.push("callback"); + return [ + assert.calledTool("get_weather", { + input: { city: /Brook/ }, + output: { forecast: /sunny/ }, + times: 1, + }), + assert.calledTool("charge_card"), + assert.notCalledTool("refund_customer"), + assert.toolOrder(["get_weather", "lookup_cache"]), + assert.maxToolCalls(2), + ]; + }); + + const score = (await scorer({ + input: "weather", + output: "done", + metadata: {}, + trace, + })) as Score; + + expect(callbackOrder).toEqual(["callback"]); + expect(getSpans).toHaveBeenCalledWith({ spanType: ["tool"] }); + expect(score.name).toBe("assertions"); + expect(score.score).toBe(0.8); + expect(score.metadata).toEqual({ + assertions: [ + { name: "called tool get_weather", passed: true }, + { name: "called tool charge_card", passed: false }, + { name: "did not call tool refund_customer", passed: true }, + { name: "tool order", passed: true }, + { name: "at most 2 tool calls", passed: true }, + ], + failed: [ + 'called tool charge_card: expected tool "charge_card" to be called; found 0 matching calls', + ], + } satisfies AgentAssertionScoreMetadata); +}); + +test("agentAssertionScorer works as an Eval scorer", async () => { + const result = await Eval( + "agent assertions", + { + data: [ + { input: "hello", expected: "hello world" }, + { input: "bye", expected: "bye world" }, + ] as const, + task: (input) => `${input} world` as const, + scores: [ + agentAssertionScorer(({ output, expected, assert }) => [ + assert.equals(output, expected, "output matches expected"), + assert.contains(output, "world", "output contains world"), + ]), + ], + }, + { noSendLogs: true }, + ); + + expect(result.results[0].scores.assertions).toBe(1); + expect(result.results[1].scores.assertions).toBe(1); + expect(result.summary.scores.assertions.score).toBe(1); +}); diff --git a/js/src/agent-assertions.ts b/js/src/agent-assertions.ts new file mode 100644 index 000000000..b4edc597d --- /dev/null +++ b/js/src/agent-assertions.ts @@ -0,0 +1,553 @@ +import type { EvalScorer, EvalScorerArgs } from "./framework"; +import type { SpanData } from "./trace"; +import type { BaseMetadata, DefaultMetadataType } from "./logger"; +import type { Score } from "../util"; + +type MaybePromise = T | Promise; + +type AssertionMatcher = + | RegExp + | ((value: unknown) => boolean) + | readonly AssertionMatcher[] + | { [key: string]: AssertionMatcher } + | string + | number + | boolean + | null + | undefined; + +interface ToolCallAssertionOptions { + /** + * Match against the tool call input. Objects are matched partially, regular + * expressions are matched against stringified values, and functions are + * treated as predicates. + */ + input?: AssertionMatcher; + /** + * Match against the tool call output. Objects are matched partially, regular + * expressions are matched against stringified values, and functions are + * treated as predicates. + */ + output?: AssertionMatcher; + /** If set, require the matching tool call to have, or not have, an error. */ + isError?: boolean; + /** If set, require exactly this many matching calls. */ + times?: number; +} + +interface AgentAssertion { + name: string; + evaluate: (resources: AgentAssertionResources) => MaybePromise<{ + passed: boolean; + failure?: string; + }>; + requiresTrace?: boolean; +} + +interface AgentAssertionHelpers { + /** + * Assert that two values are deeply equal. + * + * @param actual - The value produced by the task or derived by the scorer. + * @param expected - The value to compare against. + * @param name - Optional assertion name. Defaults to `"equals"`. + */ + equals: (actual: unknown, expected: unknown, name?: string) => AgentAssertion; + /** + * Assert that two values are not deeply equal. + * + * @param actual - The value produced by the task or derived by the scorer. + * @param expected - The value that should not be returned. + * @param name - Optional assertion name. Defaults to `"not equals"`. + */ + notEquals: ( + actual: unknown, + expected: unknown, + name?: string, + ) => AgentAssertion; + /** + * Assert that a stringified value contains a substring or matches a regular + * expression. + * + * @param value - The value to inspect. + * @param expected - The substring or regular expression to find. + * @param name - Optional assertion name. Defaults to `"contains"`. + */ + contains: ( + value: unknown, + expected: string | RegExp, + name?: string, + ) => AgentAssertion; + /** + * Assert that a value matches a schema. + * + * Supports schemas with `safeParse` or `parse`, such as Zod, and Standard + * Schema `~standard.validate`, used by libraries such as Valibot and ArkType. + * + * @param value - The value to validate. + * @param schema - The schema to validate against. + * @param name - Optional assertion name. Defaults to `"matches schema"`. + */ + matches: ( + value: unknown, + schema: SchemaLike, + name?: string, + ) => AgentAssertion; + /** + * Assert that a tool was called, optionally constrained by input, output, + * error state, or exact call count. + * + * @param toolName - The tool name to find in trace spans. + * @param options - Optional constraints for matching calls. + * @param name - Optional assertion name. Defaults to `"called tool ${toolName}"`. + */ + calledTool: ( + toolName: string, + options?: ToolCallAssertionOptions, + name?: string, + ) => AgentAssertion; + /** + * Assert that a tool was not called. + * + * @param toolName - The tool name to reject in trace spans. + * @param name - Optional assertion name. Defaults to `"did not call tool ${toolName}"`. + */ + notCalledTool: (toolName: string, name?: string) => AgentAssertion; + /** + * Assert that tools were called in the given relative order. + * + * The tools do not need to be adjacent. Extra tool calls between them are + * allowed. + * + * @param toolNames - The ordered list of tool names to find. + * @param name - Optional assertion name. Defaults to `"tool order"`. + */ + toolOrder: (toolNames: string[], name?: string) => AgentAssertion; + /** + * Assert that the task made no tool calls. + * + * @param name - Optional assertion name. Defaults to `"used no tools"`. + */ + usedNoTools: (name?: string) => AgentAssertion; + /** + * Assert that the task made no more than `max` tool calls. + * + * @param max - The maximum number of allowed tool calls. + * @param name - Optional assertion name. Defaults to `"at most ${max} tool calls"`. + */ + maxToolCalls: (max: number, name?: string) => AgentAssertion; +} + +type AgentAssertionScorerCallback< + Input, + Output, + Expected, + Metadata extends BaseMetadata = DefaultMetadataType, +> = ( + args: AgentAssertionScorerCallbackArgs, +) => MaybePromise; + +type AgentAssertionScorerCallbackArgs< + Input, + Output, + Expected, + Metadata extends BaseMetadata = DefaultMetadataType, +> = { + input: Input; + output: Output; + /** Helpers for building assertions from Eval inputs, outputs, and traces. */ + assert: AgentAssertionHelpers; +} & (Expected extends void + ? { expected?: undefined } + : { expected: Expected }) & + (Metadata extends void ? { metadata?: undefined } : { metadata: Metadata }); + +interface AgentAssertionResources { + spans?: SpanData[]; +} + +type SchemaLike = + | { + safeParse: (value: unknown) => { + success: boolean; + error?: unknown; + }; + } + | { + parse: (value: unknown) => unknown; + } + | { + "~standard": { + validate: (value: unknown) => MaybePromise; + }; + }; + +/** + * Create an Eval scorer that will evaluate an agent based on assertions on the + * generated trace. + * + * The callback receives `input`, `output`, `expected`, `metadata` plus an `assert` + * helper object. It should return the assertions to evaluate the agent against. + * + * **Important**: Tool-call assertions require Braintrust tracing to be set up + * during the Eval so the scorer can read tool spans from the trace. + * + * The score emitted by this scorer is the fraction of assertions that passed. If there are no + * assertions, the score is `1`. The score metadata includes every assertion's name and + * pass/fail state, plus human-readable failure messages. + * + * @example + * ```ts + * import { Eval, agentAssertionScorer } from "braintrust"; + * + * await Eval("agent-eval", { + * data: () => [{ input: "What is the capital of Estonia?" }], + * task: () => ({ answer: `Tallinn is the capital of Estonia. ${input}` }), + * scores: [ + * agentAssertionScorer(({ output, assert }) => [ + * assert.contains(output.answer, /Tallinn/i, "mentions Tallinn"), + * assert.calledTool("web_search", { times: 1 }, "searched once"), + * assert.maxToolCalls(3, "bounded tool use"), + * ]), + * ], + * }); + * ``` + */ +export function agentAssertionScorer< + Input, + Output, + Expected = void, + Metadata extends BaseMetadata = DefaultMetadataType, +>( + callback: AgentAssertionScorerCallback, + options: { + /** The score name to emit. Defaults to `"assertions"`. */ + name?: string; + } = {}, +): EvalScorer { + return async (args) => { + const callbackArgs = { + input: args.input, + output: args.output, + ...("expected" in args ? { expected: args.expected } : {}), + ...("metadata" in args ? { metadata: args.metadata } : {}), + assert: agentAssertionHelpers, + } as AgentAssertionScorerCallbackArgs; + const assertions = await callback(callbackArgs); + const resources: AgentAssertionResources = {}; + if (assertions.some((assertion) => assertion.requiresTrace)) { + resources.spans = await args.trace?.getSpans({ spanType: ["tool"] }); + } + + const results = await Promise.all( + assertions.map(async (assertion) => { + const result = await assertion.evaluate(resources); + return { + name: assertion.name, + passed: result.passed, + failure: result.failure, + }; + }), + ); + + const passed = results.filter((result) => result.passed).length; + const total = results.length; + const failed = results + .filter((result) => !result.passed) + .map( + (result) => + `${result.name}: ${result.failure ?? "assertion did not pass"}`, + ); + + return { + name: options.name ?? "assertions", + score: total === 0 ? 1 : passed / total, + metadata: { + assertions: results.map(({ name, passed }) => ({ name, passed })), + failed, + }, + } satisfies Score; + }; +} + +const agentAssertionHelpers: AgentAssertionHelpers = { + equals: (actual, expected, name = "equals") => ({ + name, + evaluate: () => { + const passed = deepEqual(actual, expected); + return { + passed, + failure: passed + ? undefined + : `expected ${formatValue(actual)} to equal ${formatValue(expected)}`, + }; + }, + }), + notEquals: (actual, expected, name = "not equals") => ({ + name, + evaluate: () => { + const passed = !deepEqual(actual, expected); + return { + passed, + failure: passed + ? undefined + : `expected ${formatValue(actual)} not to equal ${formatValue(expected)}`, + }; + }, + }), + contains: (value, expected, name = "contains") => ({ + name, + evaluate: () => { + const stringValue = String(value); + const passed = + expected instanceof RegExp + ? expected.test(stringValue) + : stringValue.includes(expected); + return { + passed, + failure: passed + ? undefined + : `expected ${formatValue(value)} to contain ${formatValue(expected)}`, + }; + }, + }), + matches: (value, schema, name = "matches schema") => ({ + name, + evaluate: async () => { + const result = await validateSchema(schema, value); + return { + passed: result.passed, + failure: result.passed + ? undefined + : `expected value to match schema: ${result.message}`, + }; + }, + }), + calledTool: (toolName, options = {}, name = `called tool ${toolName}`) => ({ + name, + requiresTrace: true, + evaluate: ({ spans }) => { + const calls = matchingToolCalls(spans ?? [], toolName, options); + const passed = + options.times === undefined + ? calls.length > 0 + : calls.length === options.times; + return { + passed, + failure: passed + ? undefined + : options.times === undefined + ? `expected tool "${toolName}" to be called; found ${calls.length} matching call${calls.length === 1 ? "" : "s"}` + : `expected tool "${toolName}" to be called ${options.times} time${options.times === 1 ? "" : "s"}; found ${calls.length} matching call${calls.length === 1 ? "" : "s"}`, + }; + }, + }), + notCalledTool: (toolName, name = `did not call tool ${toolName}`) => ({ + name, + requiresTrace: true, + evaluate: ({ spans }) => { + const calls = toolCalls(spans ?? []).filter( + (span) => getToolName(span) === toolName, + ); + const passed = calls.length === 0; + return { + passed, + failure: passed + ? undefined + : `expected tool "${toolName}" not to be called; found ${calls.length} call${calls.length === 1 ? "" : "s"}`, + }; + }, + }), + toolOrder: (toolNames, name = "tool order") => ({ + name, + requiresTrace: true, + evaluate: ({ spans }) => { + const observed = toolCalls(spans ?? []) + .map(getToolName) + .filter((toolName) => toolName !== undefined); + let fromIndex = 0; + const passed = toolNames.every((toolName) => { + const index = observed.indexOf(toolName, fromIndex); + if (index === -1) return false; + fromIndex = index + 1; + return true; + }); + return { + passed, + failure: passed + ? undefined + : `expected tool order ${toolNames.join(" -> ")}; observed ${observed.join(" -> ") || "no tools"}`, + }; + }, + }), + usedNoTools: (name = "used no tools") => ({ + name, + requiresTrace: true, + evaluate: ({ spans }) => { + const calls = toolCalls(spans ?? []); + const passed = calls.length === 0; + return { + passed, + failure: passed + ? undefined + : `expected no tool calls; found ${calls.length}`, + }; + }, + }), + maxToolCalls: (max, name = `at most ${max} tool calls`) => ({ + name, + requiresTrace: true, + evaluate: ({ spans }) => { + const calls = toolCalls(spans ?? []); + const passed = calls.length <= max; + return { + passed, + failure: passed + ? undefined + : `expected at most ${max} tool call${max === 1 ? "" : "s"}; found ${calls.length}`, + }; + }, + }), +}; + +async function validateSchema( + schema: SchemaLike, + value: unknown, +): Promise<{ passed: boolean; message: string }> { + try { + if ("safeParse" in schema) { + const result = schema.safeParse(value); + return result.success + ? { passed: true, message: "" } + : { passed: false, message: formatSchemaError(result.error) }; + } + if ("parse" in schema) { + schema.parse(value); + return { passed: true, message: "" }; + } + const result = await schema["~standard"].validate(value); + if ( + typeof result === "object" && + result !== null && + "issues" in result && + Array.isArray(result.issues) && + result.issues.length > 0 + ) { + return { passed: false, message: formatValue(result.issues) }; + } + return { passed: true, message: "" }; + } catch (e) { + return { passed: false, message: formatSchemaError(e) }; + } +} + +function toolCalls(spans: SpanData[]) { + return spans.filter((span) => span.span_attributes?.type === "tool"); +} + +function matchingToolCalls( + spans: SpanData[], + toolName: string, + options: ToolCallAssertionOptions, +) { + return toolCalls(spans).filter((span) => { + if (getToolName(span) !== toolName) return false; + if ( + options.input !== undefined && + !matchesValue(span.input, options.input) + ) { + return false; + } + if ( + options.output !== undefined && + !matchesValue(span.output, options.output) + ) { + return false; + } + if ( + options.isError !== undefined && + Boolean(span.error) !== options.isError + ) { + return false; + } + return true; + }); +} + +function getToolName(span: SpanData) { + const rawName = + typeof span.span_attributes?.name === "string" + ? span.span_attributes.name + : typeof span.name === "string" + ? span.name + : undefined; + if (!rawName) return undefined; + return rawName.startsWith("tool:") + ? rawName.slice("tool:".length).trim() + : rawName; +} + +function matchesValue(actual: unknown, matcher: AssertionMatcher): boolean { + if (matcher instanceof RegExp) { + return matcher.test(String(actual)); + } + if (typeof matcher === "function") { + return matcher(actual); + } + if (isPlainObject(matcher) && isPlainObject(actual)) { + return Object.entries(matcher).every(([key, value]) => + matchesValue(actual[key], value), + ); + } + return deepEqual(actual, matcher); +} + +function deepEqual(left: unknown, right: unknown): boolean { + if (Object.is(left, right)) return true; + if (Array.isArray(left) && Array.isArray(right)) { + return ( + left.length === right.length && + left.every((item, index) => deepEqual(item, right[index])) + ); + } + if (isPlainObject(left) && isPlainObject(right)) { + const leftKeys = Object.keys(left); + const rightKeys = Object.keys(right); + return ( + leftKeys.length === rightKeys.length && + leftKeys.every((key) => deepEqual(left[key], right[key])) + ); + } + return false; +} + +function isPlainObject(value: unknown): value is Record { + return ( + typeof value === "object" && + value !== null && + !Array.isArray(value) && + Object.getPrototypeOf(value) === Object.prototype + ); +} + +function formatSchemaError(error: unknown) { + if (error instanceof Error) { + return error.message; + } + return formatValue(error); +} + +function formatValue(value: unknown) { + if (value instanceof RegExp) { + return value.toString(); + } + if (typeof value === "string") { + return JSON.stringify(value); + } + try { + const serialized = JSON.stringify(value); + return serialized === undefined ? String(value) : serialized; + } catch { + return String(value); + } +} diff --git a/js/src/exports.ts b/js/src/exports.ts index 54cd03e2a..dfc759134 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -230,6 +230,8 @@ export { defaultErrorScoreHandler, } from "./framework"; +export { agentAssertionScorer } from "./agent-assertions"; + export { DatasetPipeline } from "./dataset-pipeline"; export type { From 1e9dc26b849b2d2a4cc06066a0fa56adf3bf0b95 Mon Sep 17 00:00:00 2001 From: Luca Forstner Date: Fri, 19 Jun 2026 17:07:05 +0200 Subject: [PATCH 2/3] cs --- .changeset/sweet-pears-pay.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/sweet-pears-pay.md diff --git a/.changeset/sweet-pears-pay.md b/.changeset/sweet-pears-pay.md new file mode 100644 index 000000000..8e07ed991 --- /dev/null +++ b/.changeset/sweet-pears-pay.md @@ -0,0 +1,5 @@ +--- +"braintrust": minor +--- + +feat: Add scorer that exposes helpers to evaluate agents From 64362eb7dbaebf0f8e8d34a86ad736f6ccd456a4 Mon Sep 17 00:00:00 2001 From: Luca Forstner Date: Fri, 19 Jun 2026 17:29:42 +0200 Subject: [PATCH 3/3] fixes --- js/src/agent-assertions.ts | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/js/src/agent-assertions.ts b/js/src/agent-assertions.ts index b4edc597d..635bf351e 100644 --- a/js/src/agent-assertions.ts +++ b/js/src/agent-assertions.ts @@ -144,24 +144,12 @@ type AgentAssertionScorerCallback< Expected, Metadata extends BaseMetadata = DefaultMetadataType, > = ( - args: AgentAssertionScorerCallbackArgs, + args: Omit, "trace"> & { + /** Helpers for building assertions from Eval inputs, outputs, and traces. */ + assert: AgentAssertionHelpers; + }, ) => MaybePromise; -type AgentAssertionScorerCallbackArgs< - Input, - Output, - Expected, - Metadata extends BaseMetadata = DefaultMetadataType, -> = { - input: Input; - output: Output; - /** Helpers for building assertions from Eval inputs, outputs, and traces. */ - assert: AgentAssertionHelpers; -} & (Expected extends void - ? { expected?: undefined } - : { expected: Expected }) & - (Metadata extends void ? { metadata?: undefined } : { metadata: Metadata }); - interface AgentAssertionResources { spans?: SpanData[]; } @@ -202,7 +190,7 @@ type SchemaLike = * * await Eval("agent-eval", { * data: () => [{ input: "What is the capital of Estonia?" }], - * task: () => ({ answer: `Tallinn is the capital of Estonia. ${input}` }), + * task: async () => ({ answer: "Tallinn is the capital of Estonia." }), * scores: [ * agentAssertionScorer(({ output, assert }) => [ * assert.contains(output.answer, /Tallinn/i, "mentions Tallinn"), @@ -226,14 +214,11 @@ export function agentAssertionScorer< } = {}, ): EvalScorer { return async (args) => { - const callbackArgs = { - input: args.input, - output: args.output, - ...("expected" in args ? { expected: args.expected } : {}), - ...("metadata" in args ? { metadata: args.metadata } : {}), + const { trace: _trace, ...callbackArgs } = args; + const assertions = await callback({ + ...callbackArgs, assert: agentAssertionHelpers, - } as AgentAssertionScorerCallbackArgs; - const assertions = await callback(callbackArgs); + }); const resources: AgentAssertionResources = {}; if (assertions.some((assertion) => assertion.requiresTrace)) { resources.spans = await args.trace?.getSpans({ spanType: ["tool"] });