diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore new file mode 100644 index 0000000..30c3418 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/.gitignore @@ -0,0 +1,8 @@ +# Everything below is regenerated on each run. A frozen sample report is +# committed under sample_output/ so reviewers can see the expected shape +# without run-to-run diff churn. +__pycache__/ +_sdk_eval_metrics.json +runs/ +optimization_report.json +optimization_report.md diff --git a/examples/optimization/eval_optimize_loop/README.md b/examples/optimization/eval_optimize_loop/README.md new file mode 100644 index 0000000..550f1ab --- /dev/null +++ b/examples/optimization/eval_optimize_loop/README.md @@ -0,0 +1,149 @@ +# Evaluation + Optimization Loop + +## 1. Purpose + +This example implements the issue requirement for a reproducible Evaluation + Optimization pipeline. It is not only an `AgentOptimizer` quickstart: it wraps optimization with baseline evaluation, failure attribution, validation regression, gate decisions, and audit artifacts. + +The default `fake` mode runs without model credentials. The `live` mode uses a real `LlmAgent` bridge and invokes `AgentOptimizer.optimize` against a `TargetPrompt`. + +## 2. Pipeline Stages + +The pipeline runs six stages: + +1. Baseline evaluation: score train and validation sets separately, including metric scores, pass/fail, reasons, and key trace fields. +2. Failure attribution: cluster failures into `final_response_mismatch`, `tool_call_error`, `parameter_error`, `llm_rubric_not_met`, `knowledge_recall_insufficient`, and `format_error`. +3. Optimization execution: fake mode applies a deterministic candidate; live mode calls `AgentOptimizer.optimize` with `TargetPrompt.add_path("system_prompt", ...)`. +4. Candidate validation: rerun train and validation sets and compute per-case deltas such as `new_pass`, `new_fail`, `score_up`, and `score_down`. +5. 
Acceptance gate: require validation gain, no new hard fail, no key-case regression, no train-up/validation-down overfit, and cost within budget. +6. Audit persistence: write prompt snapshots, scores, deltas, gate reasons, cost, duration, and config snapshots into an append-only per-run directory. + +## 3. Directory Layout + +```text +examples/optimization/eval_optimize_loop/ +├── agent/ +│ ├── __init__.py +│ └── agent.py +├── prompts/ +│ └── system.md +├── sample_output/ +│ ├── optimization_report.sample.json +│ └── optimization_report.sample.md +├── tests/ +│ ├── __init__.py +│ └── test_pipeline_units.py +├── train.evalset.json +├── val.evalset.json +├── case_meta.json +├── optimizer.json +├── optimizer.sdk.json +└── run.py +``` + +## 4. Inputs + +- `train.evalset.json`: training evaluation set. +- `val.evalset.json`: validation evaluation set; it must be a different file from train. +- `optimizer.json`: outer-loop configuration for mode, metric weights, fake candidate patch, and gate thresholds. It is validated at startup (`validate_config`): the three metric weights must sum to 1.0 and all gate keys must be present. +- `prompts/system.md`: baseline prompt source registered as the optimization target. +- `case_meta.json`: out-of-schema metadata for key cases, rubric kinds, and attribution hints. The `category` field declares the expected failure category per case; the report's attribution self-check measures rule-based attribution accuracy against it. +- `optimizer.sdk.json`: live-only SDK optimizer config passed to `AgentOptimizer.optimize`. This is also where the GEPA `seed` and the in-run spend cap (`max_metric_calls`) live. + +## 5. Outputs + +All outputs are generated at runtime and gitignored; a frozen sample report is committed under `sample_output/` for reference. 
+ +- `runs/<timestamp>_<run_id>/`: append-only per-run audit directory containing `baseline_prompt.md`, `candidate_prompt.md`, `optimization_report.json`, `optimization_report.md`, and (live only) raw SDK artifacts under `agent_optimizer/` (`RoundRecord`-backed round files, `result.json`, `summary.txt`, `best_prompts/`). +- `runs/latest/`: convenience mirror of the newest run directory. +- `optimization_report.json` / `optimization_report.md`: convenience copies of the newest report at the example root. + +The JSON report records baseline/candidate per-case scores and traces, per-case deltas, failure attribution with an accuracy self-check against the expected categories in `case_meta.json`, every gate check with its reason, decision, cost split into optimizer and evaluation spend, token counts, duration, prompt SHA-256 hashes, the GEPA seed, and a full config snapshot. + +## 6. Run Modes + +Fake mode (no credentials, deterministic): + +```bash +python examples/optimization/eval_optimize_loop/run.py --mode fake +``` + +Live mode: + +```bash +# Linux / macOS +export TRPC_AGENT_API_KEY=... +export TRPC_AGENT_BASE_URL=... +export TRPC_AGENT_MODEL_NAME=... + +# Windows (PowerShell): $env:TRPC_AGENT_API_KEY = "..."; etc. +python examples/optimization/eval_optimize_loop/run.py --mode live +``` + +`fake` mode uses a deterministic fake model, fake judge, and scripted candidate so the full loop runs without API keys and with zero network calls. `live` mode uses `agent/agent.py`, creates a fresh `LlmAgent` for each call, and invokes `AgentOptimizer.optimize`. + +Environment variables: + +| Variable | Default | Meaning | +|----------|---------|---------| +| `EVAL_OPT_LOG_LEVEL` | `INFO` | Log verbosity for the pipeline logger. | +| `EVAL_OPT_USD_PER_1M_TOKENS` | `1.0` | USD price per 1M tokens used to estimate live evaluation cost. | +| `EVAL_OPT_CALL_TIMEOUT` | `120` | Per-call timeout (seconds) for live agent calls. 
| +| `EVAL_OPT_CALL_ATTEMPTS` | `3` | Max attempts per live agent call (exponential backoff between retries). | +| `EVAL_OPT_CALL_BACKOFF` | `1.0` | Backoff base in seconds (delay = base * 2^attempt + jitter). | + +## 7. Cost Accounting And Its Limits + +The audit report splits spend into optimizer cost (reported by `AgentOptimizer`) and evaluation cost (estimated from the tokens accumulated across the four baseline/candidate evaluation passes at the `EVAL_OPT_USD_PER_1M_TOKENS` rate). The `cost_budget` gate checks the total. + +Be aware of what the gate is and is not: it is a **post-hoc audit check** — it can reject a candidate whose search cost exceeded budget, but the money is already spent by then. The in-run spend cap for live mode is `max_metric_calls` in `optimizer.sdk.json`; size it to your budget before launching a live run. + +## 8. Customizing The Agent + +Edit `agent/agent.py` when connecting a real business agent. + +Key constraints: + +- `make_call_agent(prompt_path)` must return an async function with the exact optimizer contract `async (query: str) -> str`. +- `create_agent(prompt_path)` must re-read the prompt file every time so candidates written by `AgentOptimizer` take effect immediately. +- `TargetPrompt.add_path("system_prompt", path)` must point to the same prompt file that the agent actually reads. +- For HTTP, CLI, remote config, or multi-agent pipelines, keep the outer contract the same and replace only the bridge implementation. + +The outer report still computes richer trace-style scoring. The SDK optimizer itself receives final-text responses through `call_agent`, so `optimizer.sdk.json` intentionally avoids metrics that require full session traces. + +## 9. Design And Validation + +Failure attribution is rule-based over structured signals, not case ids. Each case records final response, tool trajectory, rubric sub-scores, and expected/actual tool calls. 
Rubric failures map to `format_error` or `llm_rubric_not_met`; tool mismatches map to tool, parameter, spurious-call, or knowledge-recall categories. + +The gate is validation-first. A candidate is accepted only if validation mean improves by the configured threshold, no new hard fail appears, key validation cases do not regress, train improvement does not coincide with validation loss, and cost is within budget. + +The bundled fake candidate intentionally improves two train cases and one validation case while damaging two key validation cases. The expected sample decision is `REJECT`, demonstrating overfit rejection: + +```bash +python examples/optimization/eval_optimize_loop/run.py --mode fake +``` + +```text +train: 0.25 -> 0.7833 +validation: 0.7333 -> 0.6667 +decision: REJECT +``` + +## 10. 方案设计说明 + +**失败归因。** 归因完全基于结构化评测信号而非 case 命名:每条 case 记录 final_response、tool_trajectory、rubric 三个子分与期望/实际工具轨迹。工具轨迹不匹配时按规则判类——期望依赖权威检索工具(`case_meta.json` 的 `authoritative_tool`)却未调用,记为知识召回不足;调全期望工具又额外多调,记为多余工具调用;首个工具同名但参数不同,记为参数错误;其余记为工具调用错误。rubric 失败按声明维度映射为格式错误或 rubric 不达标。`case_meta.json` 为每条 case 声明期望类别 `category`,报告内置归因自检,样例 4 条失败 case 全部归因正确。 + +**接受策略。** gate 以验证集为先,五项可配置检查全部通过才接受候选:验证集均分提升达到阈值、无新增 hard fail、关键 case(`key=true`)不退化、非过拟合、总成本(优化器花费加按 token 估算的评测花费)不超预算。各检查相互独立,拒绝理由逐项落盘,便于定位。 + +**防过拟合。** 训练集与验证集物理分离且启动时校验为不同文件;第四项检查专门拦截"训练集提升、验证集下降"的候选;关键 case 退化与新增 hard fail 两项提供正交的二次保险,即便总分变化很小也能拦住有害候选。随附样例即演示该场景:候选使训练集 +0.53 但验证集回落,gate 正确拒绝。 + +**产物审计。** 每次运行写入独立的 `runs/<时间戳>_<run_id>/` 目录(append-only,历史不被覆盖),含 baseline 与候选 prompt 快照、JSON/Markdown 双报告;报告记录逐 case 分数与轨迹、逐 case delta、归因统计与自检准确率、每项 gate 检查与决策理由、成本与 token 的优化器/评测拆分、耗时、GEPA 随机种子、prompt 的 SHA-256 以及完整配置快照。run_id 同时注入日志行,跨产物可对齐追溯。 + +## 11. 
Tests + +The attribution rules, rubric scorer, gate checks, case diffing, and config validation are covered by IO-free unit tests: + +```bash +python -m pytest examples/optimization/eval_optimize_loop/tests -q +``` + +Known limits: live mode requires SDK dependencies plus `TRPC_AGENT_API_KEY`, `TRPC_AGENT_BASE_URL`, and `TRPC_AGENT_MODEL_NAME`; no-key environments should use `--mode fake`. The live retry logic matches rate-limit errors by exception type name because provider SDK exception classes are intentionally not imported here. diff --git a/examples/optimization/eval_optimize_loop/agent/__init__.py b/examples/optimization/eval_optimize_loop/agent/__init__.py new file mode 100644 index 0000000..ffb71af --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/__init__.py @@ -0,0 +1 @@ +"""Agent bridge package for the eval_optimize_loop example.""" diff --git a/examples/optimization/eval_optimize_loop/agent/agent.py b/examples/optimization/eval_optimize_loop/agent/agent.py new file mode 100644 index 0000000..1577059 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/agent/agent.py @@ -0,0 +1,197 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Live agent bridge for the eval_optimize_loop example. + +The optimizer contract is intentionally small: ``call_agent`` is an async +function that accepts one user query and returns the final response text. This +module re-reads the prompt file on every invocation so prompt candidates written +by AgentOptimizer take effect immediately. + +The public bridge in this file mirrors the SDK docs: + +* ``create_agent`` builds a fresh ``LlmAgent`` from the current prompt file. +* ``run_agent`` drives that agent through ``Runner`` and ``InMemorySessionService``. 
APP_NAME = "eval_optimize_loop"
LOGGER = logging.getLogger("eval_optimize_loop.agent")

# Live-call resilience knobs. A single flaky network call must not abort the
# whole pipeline after real money was already spent on earlier evaluations.
CALL_TIMEOUT_SECONDS = float(os.getenv("EVAL_OPT_CALL_TIMEOUT", "120"))
CALL_MAX_ATTEMPTS = int(os.getenv("EVAL_OPT_CALL_ATTEMPTS", "3"))
CALL_BACKOFF_BASE_SECONDS = float(os.getenv("EVAL_OPT_CALL_BACKOFF", "1.0"))

# OSError subclasses that describe a local, deterministic filesystem problem
# (for example a missing or unreadable prompt file). Retrying cannot fix them,
# so they must fail fast instead of sleeping through the backoff schedule.
_NON_TRANSIENT_OS_ERRORS = (
    FileNotFoundError,
    PermissionError,
    IsADirectoryError,
    NotADirectoryError,
    FileExistsError,
)


def is_retryable(exc: BaseException) -> bool:
    """Decide whether a failed live call is worth retrying.

    Timeouts, connection problems, and other generic ``OSError`` failures are
    retryable. Local filesystem errors (``FileNotFoundError``,
    ``PermissionError`` and friends) are *not*: ``create_agent`` re-reads the
    prompt file on every attempt, so a missing file would fail identically
    each time. Provider SDK exception classes are not imported here (the
    example must stay usable without them), so rate-limit style errors are
    matched by type name.

    Args:
        exc: The exception raised by one live-call attempt.

    Returns:
        ``True`` when another attempt has a chance of succeeding.
    """
    # Must be checked first: these are OSError subclasses and would otherwise
    # be swallowed by the broad OSError test below.
    if isinstance(exc, _NON_TRANSIENT_OS_ERRORS):
        return False
    if isinstance(exc, (asyncio.TimeoutError, ConnectionError, OSError)):
        return True
    name = type(exc).__name__.lower()
    return "ratelimit" in name or "timeout" in name or "connection" in name


def lookup_order(order_id: str) -> str:
    """FunctionTool body used by the live ``LlmAgent`` example.

    Returns the canned status line for a known order id, or a
    "no record" sentence that echoes the unknown id.
    """
    data = {
        "A100": "Order A100 is in transit and arrives on Friday.",
        "A200": "Order A200 is delivered.",
    }
    return data.get(order_id, f"No order record found for {order_id}.")


def search_policy(topic: str) -> str:
    """FunctionTool body for policy and warranty lookup examples.

    Matching is a case-insensitive substring test; the refund rule is checked
    before the warranty rule, so a topic mentioning both returns the refund
    snippet.
    """
    topic_lower = topic.lower()
    if "damaged" in topic_lower or "refund" in topic_lower:
        return "Damaged items are eligible for a full refund within 30 days."
    if "model z" in topic_lower or "warranty" in topic_lower:
        return "Model Z has a 24-month warranty."
    return "No matching policy snippet was found."


def get_model_config() -> tuple[str, str, str]:
    """Read live model credentials consumed by ``OpenAIModel``.

    Returns:
        ``(api_key, base_url, model_name)`` read from the environment.

    Raises:
        ValueError: If any of the three ``TRPC_AGENT_*`` variables is unset
            or empty.
    """
    api_key = os.getenv("TRPC_AGENT_API_KEY", "")
    base_url = os.getenv("TRPC_AGENT_BASE_URL", "")
    model_name = os.getenv("TRPC_AGENT_MODEL_NAME", "")
    if not api_key or not base_url or not model_name:
        raise ValueError(
            "Live mode requires TRPC_AGENT_API_KEY, TRPC_AGENT_BASE_URL, and "
            "TRPC_AGENT_MODEL_NAME. Use --mode fake for the no-key path."
        )
    return api_key, base_url, model_name


def create_agent(prompt_path: Path) -> "LlmAgent":
    """Create a fresh ``LlmAgent`` from the current prompt file.

    Re-reading here is the critical TargetPrompt contract: when
    ``AgentOptimizer`` writes a candidate prompt, the next call immediately uses
    that candidate without restarting the process.

    Raises:
        ValueError: If live credentials are missing (see ``get_model_config``).
        OSError: If the prompt file cannot be read.
    """
    api_key, base_url, model_name = get_model_config()
    instruction = Path(prompt_path).read_text(encoding="utf-8").strip()
    return LlmAgent(
        name="support_assistant",
        description="A support assistant whose system prompt is under optimization.",
        model=OpenAIModel(model_name=model_name, api_key=api_key, base_url=base_url),
        instruction=instruction,
        tools=[FunctionTool(lookup_order), FunctionTool(search_policy)],
    )


async def run_agent_once(query: str, prompt_path: Path) -> dict[str, Any]:
    """Run the live agent once and collect text, tool calls, and token usage.

    ``AgentOptimizer.optimize`` only needs final response text, but the outer
    issue-level report also wants key trajectory information and per-call token
    counts for the cost audit. This richer helper supports all of them.

    Returns:
        A dict with ``text`` (stripped final response), ``tools`` (list of
        ``{"name", "args"}`` dicts in call order) and ``tokens`` (sum of the
        ``total_token_count`` values reported on events).
    """
    agent = create_agent(prompt_path)
    session_service = InMemorySessionService()
    runner = Runner(app_name=APP_NAME, agent=agent, session_service=session_service)
    # A fresh session id per call keeps attempts independent of each other.
    session_id = str(uuid.uuid4())
    user_id = "optimizer"
    await session_service.create_session(
        app_name=APP_NAME,
        user_id=user_id,
        session_id=session_id,
        state={},
    )
    message = Content(role="user", parts=[Part.from_text(text=query)])
    final_text = ""
    tools: list[dict[str, Any]] = []
    tokens = 0
    async for event in runner.run_async(
        user_id=user_id,
        session_id=session_id,
        new_message=message,
    ):
        # Usage is read defensively: events without usage metadata, or with a
        # missing/None token count, contribute zero.
        usage = getattr(event, "usage_metadata", None)
        if usage is not None:
            tokens += getattr(usage, "total_token_count", None) or 0
        if not event.content or not event.content.parts:
            continue
        for part in event.content.parts:
            function_call = getattr(part, "function_call", None)
            if function_call is not None:
                tools.append(
                    {
                        "name": getattr(function_call, "name", None),
                        "args": dict(getattr(function_call, "args", {}) or {}),
                    }
                )
        if event.is_final_response():
            for part in event.content.parts:
                # Parts flagged as "thought" are excluded from the answer text.
                if getattr(part, "text", None) and not getattr(part, "thought", False):
                    final_text += part.text
    return {"text": final_text.strip(), "tools": tools, "tokens": tokens}


async def run_agent(query: str, prompt_path: Path) -> dict[str, Any]:
    """Run the live agent with timeout plus exponential-backoff retry.

    Each attempt builds a fresh agent and session, so retrying is idempotent.
    Non-retryable errors (see ``is_retryable``) propagate immediately. At least
    one attempt is always made, even if ``EVAL_OPT_CALL_ATTEMPTS`` is set to
    zero or a negative number.

    Raises:
        Exception: The last attempt's error once attempts are exhausted, or
            the first non-retryable error.
    """
    # Clamp so a misconfigured attempt count cannot skip the call entirely and
    # fall through to the "unreachable" guard below.
    attempts = max(1, CALL_MAX_ATTEMPTS)
    for attempt in range(1, attempts + 1):
        try:
            return await asyncio.wait_for(
                run_agent_once(query=query, prompt_path=prompt_path),
                timeout=CALL_TIMEOUT_SECONDS,
            )
        except Exception as exc:
            if attempt == attempts or not is_retryable(exc):
                raise
            # delay = base * 2^(attempt-1) plus up to 0.5s of jitter.
            delay = CALL_BACKOFF_BASE_SECONDS * 2 ** (attempt - 1) + random.uniform(0, 0.5)
            LOGGER.warning(
                "live agent call failed (%s: %s), retry %d/%d in %.1fs",
                type(exc).__name__,
                str(exc)[:200],
                attempt,
                attempts - 1,
                delay,
            )
            await asyncio.sleep(delay)
    raise RuntimeError("unreachable: retry loop exited without returning or raising")


def make_call_agent(prompt_path: Path) -> Callable[[str], Awaitable[str]]:
    """Return the fixed async ``(query: str) -> str`` bridge required by GEPA."""

    async def call_agent(query: str) -> str:
        # Only the final text crosses the optimizer boundary; tool calls and
        # token counts from run_agent are dropped here.
        return (await run_agent(query=query, prompt_path=prompt_path))["text"]

    return call_agent
It is kept outside evalsets so EvalSet schema validation remains clean.", + "train_order_lookup_optimizable": { + "category": "tool_call_error", + "key": false, + "rubric": "none" + }, + "train_refund_policy_optimizable": { + "category": "knowledge_recall_insufficient", + "key": false, + "rubric": "none", + "authoritative_tool": "search_policy" + }, + "train_json_format_ineffective": { + "category": "format_error", + "key": false, + "rubric": "json_format" + }, + "val_warranty_new_pass": { + "category": "knowledge_recall_insufficient", + "key": false, + "rubric": "none", + "authoritative_tool": "search_policy" + }, + "val_smalltalk_regression": { + "category": "spurious_tool_call", + "key": true, + "rubric": "no_tool" + }, + "val_order_soft_degradation": { + "category": "spurious_tool_call", + "key": true, + "rubric": "single_tool" + } +} diff --git a/examples/optimization/eval_optimize_loop/optimizer.json b/examples/optimization/eval_optimize_loop/optimizer.json new file mode 100644 index 0000000..be48160 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.json @@ -0,0 +1,39 @@ +{ + "mode": "fake", + "inputs": { + "train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json" + }, + "target_prompt": { + "name": "support_assistant_system", + "path": "prompts/system.md", + "kind": "system_prompt" + }, + "evaluate": { + "pass_threshold": 0.8, + "metrics": [ + {"name": "final_response", "weight": 0.45}, + {"name": "tool_trajectory", "weight": 0.35}, + {"name": "rubric", "weight": 0.20} + ] + }, + "optimize": { + "sdk_config": "optimizer.sdk.json", + "update_source": false, + "verbose": 1, + "fake_candidate_patch": [ + "Optimization candidate:", + "- USE_CATALOG_LOOKUP: use lookup_order for order status and search_policy for policy/warranty questions before answering.", + "- AGGRESSIVE_LOOKUP: when uncertain, prefer looking up supporting data even for short or already-answerable requests." 
+ ] + }, + "gate": { + "min_val_score_gain": 0.1, + "reject_on_new_hard_fail": true, + "hard_fail_threshold": 0.6, + "reject_on_critical_regression": true, + "reject_overfit_train_up_val_down": true, + "max_cost_usd": 0.05 + } +} diff --git a/examples/optimization/eval_optimize_loop/optimizer.sdk.json b/examples/optimization/eval_optimize_loop/optimizer.sdk.json new file mode 100644 index 0000000..7036b42 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/optimizer.sdk.json @@ -0,0 +1,70 @@ +{ + "_comment": "SDK-format config consumed only by AgentOptimizer.optimize in live mode. Tool-trajectory metrics are intentionally excluded because call_agent mode returns final text, not full session traces.", + "evaluate": { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.6, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": true} + } + } + }, + { + "metric_name": "llm_rubric_response", + "threshold": 0.66, + "criterion": { + "llm_judge": { + "judge_model": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "num_samples": 1, + "generation_config": {"max_tokens": 1024, "temperature": 0.2} + }, + "rubrics": [ + { + "id": "answers_with_evidence", + "content": { + "text": "The answer directly satisfies the user request and uses available support data instead of guessing when order, refund, or warranty facts are needed." + }, + "description": "Direct answer grounded in available support data", + "type": "FINAL_RESPONSE_QUALITY" + }, + { + "id": "concise_and_actionable", + "content": { + "text": "The answer is concise, actionable, and avoids unnecessary extra explanation." 
+ }, + "description": "Concise actionable response", + "type": "FINAL_RESPONSE_QUALITY" + } + ] + } + } + } + ], + "num_runs": 1 + }, + "optimize": { + "eval_case_parallelism": 2, + "stop": {"required_metrics": "all"}, + "algorithm": { + "name": "gepa_reflective", + "seed": 42, + "reflection_lm": { + "model_name": "${TRPC_AGENT_MODEL_NAME}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "api_key": "${TRPC_AGENT_API_KEY}", + "generation_config": {"max_tokens": 4096, "temperature": 0.6} + }, + "candidate_selection_strategy": "pareto", + "module_selector": "round_robin", + "reflection_minibatch_size": 3, + "skip_perfect_score": false, + "max_metric_calls": 60, + "max_iterations_without_improvement": 8 + } + } +} diff --git a/examples/optimization/eval_optimize_loop/prompts/system.md b/examples/optimization/eval_optimize_loop/prompts/system.md new file mode 100644 index 0000000..47e4741 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/prompts/system.md @@ -0,0 +1,6 @@ +You are a concise customer support assistant. + +Rules: +- Answer directly when the answer is already known. +- Do not invent order, refund, or warranty facts. +- Keep responses short. diff --git a/examples/optimization/eval_optimize_loop/run.py b/examples/optimization/eval_optimize_loop/run.py new file mode 100644 index 0000000..0f7a0b7 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/run.py @@ -0,0 +1,1041 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Six-stage Evaluation + Optimization loop around AgentOptimizer. + +This example is intentionally self-contained. It has two execution modes: + +* fake: no API key, deterministic fake model/judge/optimizer, complete report. +* live: real LlmAgent bridge plus real AgentOptimizer.optimize. 
+ +Both modes use the same train/validation evalsets, scorer, gate, report schema, +and prompt snapshots. The fake mode exists so the closed-loop behavior can be +tested in CI or on a laptop with no model credentials. + +The live path registers one ``TargetPrompt`` field and delegates candidate search +to ``AgentOptimizer``. Each run persists its artifacts (prompt snapshots, raw +optimizer ``RoundRecord`` files, reports) under a timestamped ``runs/_`` +directory, mirrored to ``runs/latest``; this outer script adds the issue-level +baseline/candidate/delta/gate/audit report around those SDK artifacts. + +Environment variables: + +* ``EVAL_OPT_LOG_LEVEL``: log verbosity (default ``INFO``). +* ``EVAL_OPT_USD_PER_1M_TOKENS``: USD price per 1M tokens used to estimate + live evaluation cost (default ``1.0``). +* ``EVAL_OPT_CALL_TIMEOUT`` / ``EVAL_OPT_CALL_ATTEMPTS`` / + ``EVAL_OPT_CALL_BACKOFF``: live agent-call timeout seconds, retry attempts, + and exponential-backoff base seconds (see ``agent/agent.py``). +""" + +from __future__ import annotations + +import argparse +import asyncio +import hashlib +import json +import logging +import os +import shutil +import sys +import time +import uuid +from collections import Counter +from dataclasses import asdict +from dataclasses import dataclass +from datetime import datetime +from datetime import timezone +from pathlib import Path +from typing import Any + + +HERE = Path(__file__).resolve().parent +REPO_ROOT = HERE.parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) +if str(HERE) not in sys.path: + sys.path.insert(0, str(HERE)) + +try: + from trpc_agent_sdk.evaluation import AgentEvaluator + from trpc_agent_sdk.evaluation import AgentOptimizer + from trpc_agent_sdk.evaluation import EvalSet + from trpc_agent_sdk.evaluation import TargetPrompt + SDK_IMPORT_ERROR = None +except Exception as exc: # pragma: no cover - fake mode should still explain itself. 
LOGGER = logging.getLogger("eval_optimize_loop")

# Prompt feature flags recognized by fake_agent. The scripted candidate patch in
# optimizer.json must mention at least one of them, otherwise the fake candidate
# cannot change agent behavior; check_fake_patch_flags warns on that mismatch.
FAKE_FLAG_USE_CATALOG = "USE_CATALOG_LOOKUP"
FAKE_FLAG_AGGRESSIVE = "AGGRESSIVE_LOOKUP"

# The three metric names the scorer combines; validate_config enforces that
# optimizer.json defines exactly these with weights summing to 1.0.
REQUIRED_METRICS = ("final_response", "tool_trajectory", "rubric")


@dataclass
class CaseResult:
    """Per-case score record persisted into optimization_report.json.

    The fields mirror the issue acceptance criteria: metric scores, pass/fail,
    hard-fail status, failure reasons, and key trajectory data.
    """

    case_id: str
    score: float
    passed: bool
    hard_fail: bool
    key: bool
    metrics: dict[str, float]
    failure_types: list[str]
    reason: str
    trace: dict[str, Any]


def load_json(path: Path) -> dict[str, Any]:
    """Load a JSON config/evalset document with a readable fatal error.

    Args:
        path: File to read; it is decoded as UTF-8.

    Returns:
        The parsed JSON document.

    Raises:
        SystemExit: If the file cannot be read, is not valid UTF-8, or does
            not contain valid JSON. The message always names the file.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except OSError as exc:
        raise SystemExit(f"Cannot read JSON file {path}: {exc}") from exc
    except UnicodeDecodeError as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so without this
        # branch a mis-encoded file would surface as a raw traceback.
        raise SystemExit(f"JSON file {path} is not valid UTF-8: {exc}") from exc
    try:
        return json.loads(raw)
    except json.JSONDecodeError as exc:
        raise SystemExit(f"Invalid JSON in {path}: {exc}") from exc


def write_json(path: Path, data: dict[str, Any]) -> None:
    """Write a stable UTF-8 JSON artifact used by the audit trail.

    Parent directories are created on demand; non-ASCII text is written
    verbatim (``ensure_ascii=False``) with two-space indentation.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")


def sha256_text(text: str) -> str:
    """Hash prompt text so audits can prove which candidate was evaluated.

    Returns the hex SHA-256 digest of the UTF-8 encoding of ``text``.
    """
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
def resolve_path(value: str) -> Path:
    """Resolve example-relative paths from optimizer.json and CLI flags.

    Absolute paths are returned unchanged; relative ones are anchored at the
    example directory (``HERE``), not the current working directory.
    """
    path = Path(value)
    return path if path.is_absolute() else HERE / path


def validate_evalset(path: Path) -> dict[str, Any]:
    """Validate an evalset with SDK ``EvalSet`` when available.

    Args:
        path: Evalset JSON file, decoded as UTF-8.

    Returns:
        The parsed evalset as a plain dict.

    Raises:
        SystemExit: If the file is unreadable, not UTF-8, not valid JSON,
            fails SDK schema validation, is not a JSON object, or has no
            ``eval_cases``.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except OSError as exc:
        raise SystemExit(f"Cannot read evalset {path}: {exc}") from exc
    except UnicodeDecodeError as exc:
        # Not an OSError: without this branch a mis-encoded file would
        # surface as a raw traceback.
        raise SystemExit(f"Evalset {path} is not valid UTF-8: {exc}") from exc
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise SystemExit(f"Invalid JSON in evalset {path}: {exc}") from exc
    if EvalSet is not None:
        try:
            EvalSet.model_validate_json(raw)
        except Exception as exc:
            raise SystemExit(f"{path} failed SDK EvalSet schema validation: {exc}") from exc
    # Without the SDK nothing has checked the top-level shape yet; a JSON
    # array or scalar has no .get and must be rejected readably.
    if not isinstance(data, dict):
        raise SystemExit(f"{path} must contain a JSON object with eval_cases")
    if not data.get("eval_cases"):
        raise SystemExit(f"{path} has no eval_cases")
    return data


def _config_number(label: str, value: Any) -> float:
    """Coerce one optimizer.json value to ``float`` or exit with a readable error."""
    try:
        return float(value)
    except (TypeError, ValueError):
        raise SystemExit(f"{label} must be a number, got {value!r}") from None


def validate_config(cfg: dict[str, Any]) -> None:
    """Fail fast with readable errors on structural optimizer.json problems.

    Raw ``cfg[...]`` indexing later in the pipeline would surface a config typo
    as a bare ``KeyError``; this check turns it into an actionable message.

    Checks, in order: every required section and key exists; ``evaluate.metrics``
    is a list of objects each carrying a string ``name`` and a numeric
    ``weight``; the metric names are exactly ``REQUIRED_METRICS``; the weights
    sum to 1.0 (tolerance 1e-6); both thresholds are numbers within [0, 1].

    Raises:
        SystemExit: On the first violated check, with a message naming the
            offending section, key, or value.
    """
    for section, keys in (
        ("inputs", ("train_evalset", "val_evalset", "case_meta")),
        ("target_prompt", ("name", "path", "kind")),
        ("evaluate", ("pass_threshold", "metrics")),
        ("optimize", ("sdk_config", "fake_candidate_patch")),
        (
            "gate",
            (
                "min_val_score_gain",
                "reject_on_new_hard_fail",
                "hard_fail_threshold",
                "reject_on_critical_regression",
                "reject_overfit_train_up_val_down",
                "max_cost_usd",
            ),
        ),
    ):
        block = cfg.get(section)
        if not isinstance(block, dict):
            raise SystemExit(f"optimizer.json is missing the '{section}' section")
        missing = [key for key in keys if key not in block]
        if missing:
            raise SystemExit(f"optimizer.json section '{section}' is missing keys: {', '.join(missing)}")
    metrics = cfg["evaluate"]["metrics"]
    # Validate the shape of every entry first so that a typo inside a metric
    # object is reported readably instead of as KeyError/AttributeError.
    if not isinstance(metrics, list) or not all(isinstance(item, dict) for item in metrics):
        raise SystemExit("evaluate.metrics must be a list of {name, weight} objects")
    total_weight = 0.0
    for index, item in enumerate(metrics):
        if not isinstance(item.get("name"), str):
            raise SystemExit(f"evaluate.metrics[{index}] is missing a string 'name'")
        if "weight" not in item:
            raise SystemExit(f"evaluate.metrics[{index}] ('{item['name']}') is missing 'weight'")
        total_weight += _config_number(f"evaluate.metrics[{index}].weight", item["weight"])
    names = [item["name"] for item in metrics]
    if sorted(names) != sorted(REQUIRED_METRICS):
        raise SystemExit(f"evaluate.metrics must define exactly {sorted(REQUIRED_METRICS)}, got {sorted(names)}")
    # Written as "not <=" so that a NaN sum is rejected as well.
    if not abs(total_weight - 1.0) <= 1e-6:
        raise SystemExit(f"evaluate.metrics weights must sum to 1.0, got {total_weight}")
    for name, value in (
        ("evaluate.pass_threshold", cfg["evaluate"]["pass_threshold"]),
        ("gate.hard_fail_threshold", cfg["gate"]["hard_fail_threshold"]),
    ):
        if not 0.0 <= _config_number(name, value) <= 1.0:
            raise SystemExit(f"{name} must be within [0, 1], got {value}")


def check_fake_patch_flags(cfg: dict[str, Any]) -> None:
    """Warn when the scripted candidate cannot influence the fake agent.

    The patch lines are joined with newlines and searched for the two flag
    strings the fake agent recognizes; only a warning is logged, nothing is
    raised.
    """
    patch = "\n".join(cfg["optimize"]["fake_candidate_patch"])
    if FAKE_FLAG_USE_CATALOG not in patch and FAKE_FLAG_AGGRESSIVE not in patch:
        LOGGER.warning(
            "fake_candidate_patch mentions neither %s nor %s; the fake candidate will not change behavior.",
            FAKE_FLAG_USE_CATALOG,
            FAKE_FLAG_AGGRESSIVE,
        )
trace-mode evalset. + + The outer loop has its own deterministic scorer so fake mode remains usable + even when optional SDK dependencies are missing. When ``AgentEvaluator`` is + importable, this function records a real trace-mode SDK evaluation attempt + for both train and validation sets. + """ + metrics_path = HERE / "_sdk_eval_metrics.json" + write_json( + metrics_path, + { + "metrics": [ + { + "metric_name": "final_response_avg_score", + "threshold": 0.1, + "criterion": { + "final_response": { + "text": {"match": "contains", "case_insensitive": True} + } + }, + } + ] + }, + ) + if AgentEvaluator is None: + return { + "status": "SKIPPED", + "reason": "AgentEvaluator import failed", + "import_error": SDK_IMPORT_ERROR, + } + + cwd = Path.cwd() + os.chdir(HERE) + try: + runner = AgentEvaluator.get_executer( + evalset_path.name, + eval_metrics_file_path_or_dir=metrics_path.name, + print_detailed_results=False, + print_summary_report=False, + ) + try: + await asyncio.wait_for(runner.evaluate(), timeout=30) + status = "PASSED" + reason = "AgentEvaluator trace-mode evaluation completed" + except AssertionError as exc: + status = "FAILED_EXPECTED" + reason = str(exc)[:500] + except Exception as exc: # pragma: no cover - SDK runtime drift. 
def text_field(invocation: dict[str, Any], field_name: str) -> str:
    """Extract concatenated text from an EvalCase invocation field.

    Args:
        invocation: One conversation turn of an EvalCase, as a plain dict.
        field_name: Name of the content field to read, e.g. ``"user_content"``
            or ``"final_response"``.

    Returns:
        The ``text`` of every part joined without a separator. A missing or
        null field, a missing or null ``parts`` list, and parts whose ``text``
        is missing or null all contribute the empty string.
    """
    content = invocation.get(field_name) or {}
    # Explicit JSON nulls arrive as ``None`` rather than as a missing key, so a
    # plain ``.get(key, default)`` would not protect the iteration or the join.
    parts = content.get("parts") or []
    return "".join(part.get("text") or "" for part in parts)
+ """ + use_catalog = FAKE_FLAG_USE_CATALOG in prompt + aggressive_lookup = FAKE_FLAG_AGGRESSIVE in prompt + + if "shipping status for order A100" in query: + if use_catalog: + return { + "text": "Order A100 is in transit and arrives on Friday.", + "tools": [{"name": "lookup_order", "args": {"order_id": "A100"}}], + } + return {"text": "I do not have enough order data.", "tools": []} + + if "refund policy for damaged items" in query: + if use_catalog: + return { + "text": "Damaged items are eligible for a full refund within 30 days.", + "tools": [{"name": "search_policy", "args": {"topic": "damaged item refund"}}], + } + return {"text": "You may be eligible, but I cannot confirm the policy.", "tools": []} + + if "Return only JSON" in query: + return {"text": "status ok", "tools": []} + + if "warranty period for Model Z" in query: + if use_catalog: + return { + "text": "Model Z has a 24-month warranty.", + "tools": [{"name": "search_policy", "args": {"topic": "Model Z warranty"}}], + } + return { + "text": "I am not sure about the Model Z warranty.", + "tools": [{"name": "web_search", "args": {"query": "Model Z warranty"}}], + } + + if query.strip() == "Thanks": + if aggressive_lookup: + return { + "text": "You are welcome.", + "tools": [{"name": "search_policy", "args": {"topic": "thanks"}}], + } + return {"text": "You are welcome.", "tools": []} + + if "order A200" in query: + if aggressive_lookup: + return { + "text": "Order A200 is delivered.", + "tools": [ + {"name": "lookup_order", "args": {"order_id": "A200"}}, + {"name": "search_policy", "args": {"topic": "order A200"}}, + ], + } + return { + "text": "Order A200 is delivered.", + "tools": [{"name": "lookup_order", "args": {"order_id": "A200"}}], + } + + return {"text": "I can help with support questions.", "tools": []} + + +def rubric_score(meta: dict[str, Any], output: dict[str, Any]) -> float: + """Score the case-specific fake judge rubric declared in case_meta.json.""" + kind = meta.get("rubric", "none") 
def classify_tool_failure(
    actual: list[dict[str, Any]],
    expected: list[dict[str, Any]],
    meta: dict[str, Any],
) -> str | None:
    """Label a tool-trajectory mismatch, or return ``None`` when there is none.

    Both trajectories are normalized first. Rules are tried in this order:

    1. identical trajectories -> ``None``
    2. ``meta["authoritative_tool"]`` is set but was never called ->
       ``"knowledge_recall_insufficient"``
    3. every expected call is present and there are extra calls ->
       ``"spurious_tool_call"``
    4. both sides are non-empty and start with the same tool name ->
       ``"parameter_error"``
    5. anything else -> ``"tool_call_error"``
    """
    got = normalize_tools(actual)
    want = normalize_tools(expected)
    if got == want:
        return None

    called_names = {call["name"] for call in got}
    required_tool = meta.get("authoritative_tool")
    if required_tool and required_tool not in called_names:
        return "knowledge_recall_insufficient"

    # A strictly longer actual trajectory that still contains every expected
    # call means the agent added calls on top of the right ones.
    has_extra_calls = len(got) > len(want) and all(call in got for call in want)
    if has_extra_calls:
        return "spurious_tool_call"

    same_first_tool = bool(got) and bool(want) and got[0]["name"] == want[0]["name"]
    return "parameter_error" if same_first_tool else "tool_call_error"
async def produce_output(query: str, prompt_path: Path, mode: str) -> dict[str, Any]:
    """Produce one model output for ``query`` in the requested mode.

    ``mode == "live"`` delegates to ``agent.agent.run_agent`` (imported
    lazily); every other mode uses the deterministic ``fake_agent``.
    """
    # The prompt file is read before branching, so an unreadable prompt file
    # fails the same way in both modes.
    prompt = prompt_path.read_text(encoding="utf-8")
    if mode != "live":
        return fake_agent(prompt, query)

    from agent.agent import run_agent

    return await run_agent(query=query, prompt_path=prompt_path)
async def evaluate_evalset(
    evalset: dict[str, Any],
    prompt_path: Path,
    cfg: dict[str, Any],
    case_meta: dict[str, Any],
    mode: str,
) -> dict[str, Any]:
    """Evaluate all cases in one train/validation evalset.

    Also accumulates the tokens each agent call reports, so the audit and the
    cost gate can account for evaluation spend, not only optimizer spend. An
    output without a ``tokens`` value (missing or ``None``) counts as 0.

    Args:
        evalset: Parsed evalset with ``eval_set_id`` and ``eval_cases``.
        prompt_path: Prompt file handed to the agent for every case.
        cfg: Outer-loop configuration, forwarded to ``score_case``.
        case_meta: Per-case metadata keyed by eval id.
        mode: Agent mode, forwarded to ``produce_output``.

    Returns:
        A dict with ``eval_set_id``, ``mean_score``, ``pass_rate`` (both
        rounded to 4 places), ``cases`` (case id -> result dict) and
        ``tokens``.

    Raises:
        SystemExit: If the evalset contains no cases. A mean over zero cases
            is undefined and would otherwise surface as ``ZeroDivisionError``.
    """
    eval_cases = evalset["eval_cases"]
    if not eval_cases:
        raise SystemExit(
            f"evalset '{evalset.get('eval_set_id', '<unknown>')}' has no eval_cases to score"
        )

    cases: list[CaseResult] = []
    tokens = 0
    for case in eval_cases:
        query = text_field(case["conversation"][0], "user_content")
        output = await produce_output(query=query, prompt_path=prompt_path, mode=mode)
        # ``or 0`` also covers an explicit ``"tokens": None`` in the output.
        tokens += int(output.get("tokens") or 0)
        cases.append(score_case(case, output, cfg, case_meta))

    total = len(cases)
    return {
        "eval_set_id": evalset["eval_set_id"],
        "mean_score": round(sum(item.score for item in cases) / total, 4),
        "pass_rate": round(sum(item.passed for item in cases) / total, 4),
        "cases": {item.case_id: asdict(item) for item in cases},
        "tokens": tokens,
    }
def gate_decision(
    baseline_train: dict[str, Any],
    candidate_train: dict[str, Any],
    baseline_val: dict[str, Any],
    candidate_val: dict[str, Any],
    val_delta: dict[str, Any],
    cfg: dict[str, Any],
    cost_usd: float,
) -> dict[str, Any]:
    """Decide whether the candidate prompt is accepted.

    Five checks are evaluated against ``cfg["gate"]``: minimum validation
    gain, no new validation hard fail, no regression on key validation cases,
    no train-up/validation-down overfit, and total cost within budget. The
    candidate is accepted only if every check passes.

    Returns:
        A dict with ``accepted``, ``decision`` (``"ACCEPT"``/``"REJECT"``),
        ``reason`` (failed check names joined by ``"; "``), ``train_gain``,
        ``val_gain`` and the full ``checks`` list.
    """
    rules = cfg["gate"]
    train_gain = round(candidate_train["mean_score"] - baseline_train["mean_score"], 4)
    val_gain = round(candidate_val["mean_score"] - baseline_val["mean_score"], 4)

    base_cases = baseline_val["cases"]
    cand_cases = candidate_val["cases"]

    # Hard fails that already existed in the baseline are not held against
    # the candidate.
    new_hard_fails = []
    for case_id, case in cand_cases.items():
        if case["hard_fail"] and not base_cases[case_id]["hard_fail"]:
            new_hard_fails.append(case_id)

    regression_kinds = {"new_fail", "score_down"}
    critical_regressions = []
    for case_id, delta in val_delta.items():
        if cand_cases[case_id]["key"] and delta["kind"] in regression_kinds:
            critical_regressions.append(case_id)

    def check(name: str, passed: bool, detail: str) -> dict[str, Any]:
        return {"name": name, "passed": passed, "detail": detail}

    checks = [
        check(
            "validation_gain_threshold",
            val_gain >= rules["min_val_score_gain"],
            f"val_gain={val_gain:+.4f}, required>={rules['min_val_score_gain']:+.4f}",
        ),
        check(
            "no_new_hard_fail",
            not (rules["reject_on_new_hard_fail"] and new_hard_fails),
            f"new_hard_fails={new_hard_fails}",
        ),
        check(
            "no_critical_regression",
            not (rules["reject_on_critical_regression"] and critical_regressions),
            f"critical_regressions={critical_regressions}",
        ),
        check(
            "not_overfit_train_up_val_down",
            not (rules["reject_overfit_train_up_val_down"] and train_gain > 0 and val_gain < 0),
            f"train_gain={train_gain:+.4f}, val_gain={val_gain:+.4f}",
        ),
        check(
            "cost_budget",
            cost_usd <= rules["max_cost_usd"],
            f"cost_usd={cost_usd:.6f}, budget={rules['max_cost_usd']:.6f}",
        ),
    ]

    failed_names = [item["name"] for item in checks if not item["passed"]]
    accepted = not failed_names
    if accepted:
        decision, reason = "ACCEPT", "all gates passed"
    else:
        decision, reason = "REJECT", "; ".join(failed_names)
    return {
        "accepted": accepted,
        "decision": decision,
        "reason": reason,
        "train_gain": train_gain,
        "val_gain": val_gain,
        "checks": checks,
    }
def optimizer_fake(baseline_prompt: str, cfg: dict[str, Any], candidate_path: Path) -> tuple[str, dict[str, Any]]:
    """Build the scripted candidate prompt and persist it; no SDK call is made.

    The candidate is the baseline with trailing whitespace stripped, a blank
    line, the configured ``fake_candidate_patch`` lines, and a final newline.

    Returns:
        ``(candidate_text, status)`` where ``status`` reports zero cost, zero
        tokens and a single round.
    """
    stripped_baseline = baseline_prompt.rstrip()
    patch_block = "\n".join(cfg["optimize"]["fake_candidate_patch"])
    candidate = f"{stripped_baseline}\n\n{patch_block}\n"
    candidate_path.write_text(candidate, encoding="utf-8")

    status: dict[str, Any] = {
        "mode": "fake",
        "status": "SCRIPTED_CANDIDATE",
        "agent_optimizer_available": AgentOptimizer is not None,
        "agent_optimizer_invoked": False,
        "candidate_prompt_path": candidate_path.relative_to(HERE).as_posix(),
        "cost_usd": 0.0,
        "tokens": 0,
        "rounds": 1,
    }
    return candidate, status
def read_gepa_seed(cfg: dict[str, Any]) -> int | None:
    """Read the GEPA seed from optimizer.sdk.json for audit bookkeeping.

    The outer loop has no random source of its own; the only seed that matters
    is the one the SDK optimizer consumes in live mode.

    This lookup is best-effort: it only feeds the audit report, so any problem
    with the SDK config yields ``None`` instead of aborting the run.

    Args:
        cfg: Outer-loop configuration; ``cfg["optimize"]["sdk_config"]`` names
            the SDK config file.

    Returns:
        The value stored at ``optimize.algorithm.seed``, or ``None`` when the
        file cannot be read or decoded, is not valid JSON, or does not have
        that nested object shape.
    """
    sdk_path = resolve_path(cfg["optimize"]["sdk_config"])
    try:
        node = json.loads(sdk_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Walk optimize -> algorithm, tolerating nulls and non-object values at
    # every level instead of raising AttributeError on ``.get``.
    for key in ("optimize", "algorithm"):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node.get("seed") if isinstance(node, dict) else None
def write_markdown(report: dict[str, Any], path: Path) -> None:
    """Render ``report`` as Markdown and write it to ``path`` as UTF-8.

    Sections appear in a fixed order: Summary (from ``render_summary``),
    Scores, Failure Attribution, Validation Delta, Gate Checks and Audit.
    """
    baseline = report["baseline"]
    candidate = report["candidate"]
    gate = report["gate"]
    attribution = report["failure_attribution"]
    audit = report["audit"]
    prompt_audit = report["prompt_audit"]

    out: list[str] = [
        "# Optimization Report",
        "",
        "## Summary",
        "",
        render_summary(report),
        "",
        "## Scores",
        "",
        f"- Mode: `{report['run']['mode']}`",
        f"- Baseline train mean: {baseline['train']['mean_score']}",
        f"- Candidate train mean: {candidate['train']['mean_score']}",
        f"- Baseline validation mean: {baseline['val']['mean_score']}",
        f"- Candidate validation mean: {candidate['val']['mean_score']}",
        f"- Decision: **{gate['decision']}**",
        f"- Reason: {gate['reason']}",
        "",
        "## Failure Attribution",
        "",
    ]

    out += [f"- {label}: {hits}" for label, hits in sorted(attribution["counts"].items())]
    # The self-check line is only emitted when at least one case declares an
    # expected category.
    self_check = attribution.get("self_check") or {}
    if self_check.get("cases_with_expected_category"):
        out.append(
            f"- Attribution self-check: {self_check['matched']}/{self_check['cases_with_expected_category']} "
            f"expected categories matched (accuracy {self_check['accuracy']})"
        )

    out += ["", "## Validation Delta", ""]
    out += [
        f"- `{case_id}`: {item['kind']} "
        f"({item['baseline_score']} -> {item['candidate_score']}, {item['delta']:+.4f})"
        for case_id, item in report["delta"]["val"].items()
    ]

    out += ["", "## Gate Checks", ""]
    for check in gate["checks"]:
        verdict = "PASS" if check["passed"] else "FAIL"
        out.append(f"- {verdict} `{check['name']}`: {check['detail']}")

    out += [
        "",
        "## Audit",
        "",
        f"- Cost USD: {audit['cost_usd']} "
        f"(optimizer {audit['optimizer_cost_usd']}, eval {audit['eval_cost_usd']})",
        f"- Tokens: {audit['tokens']} "
        f"(optimizer {audit['optimizer_tokens']}, eval {audit['eval_tokens']})",
        f"- Duration seconds: {audit['duration_seconds']}",
        f"- Baseline SHA-256: `{prompt_audit['baseline_sha256']}`",
        f"- Candidate SHA-256: `{prompt_audit['candidate_sha256']}`",
    ]
    path.write_text("\n".join(out) + "\n", encoding="utf-8")
runs/latest is only a convenience mirror of the newest one. + run_stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + run_dir = HERE / "runs" / f"{run_stamp}_{run_id}" + run_dir.mkdir(parents=True, exist_ok=True) + baseline_path = run_dir / "baseline_prompt.md" + candidate_path = run_dir / "candidate_prompt.md" + baseline_prompt = prompt_source.read_text(encoding="utf-8") + baseline_path.write_text(baseline_prompt, encoding="utf-8") + + LOGGER.info("loaded mode=%s train_cases=%d val_cases=%d", mode, len(train["eval_cases"]), len(val["eval_cases"])) + sdk_evaluator_runs = { + "train": await sdk_trace_smoke(train_path), + "val": await sdk_trace_smoke(val_path), + } + LOGGER.info( + "AgentEvaluator trace runs: train=%s val=%s", + sdk_evaluator_runs["train"]["status"], + sdk_evaluator_runs["val"]["status"], + ) + + baseline_train = await evaluate_evalset(train, baseline_path, cfg, case_meta, mode) + baseline_val = await evaluate_evalset(val, baseline_path, cfg, case_meta, mode) + LOGGER.info("baseline mean train=%.4f val=%.4f", baseline_train["mean_score"], baseline_val["mean_score"]) + + failures = attribute_failures(baseline_train, baseline_val) + failures["self_check"] = attribution_self_check(failures, case_meta) + LOGGER.info( + "failure attribution: %s | self-check accuracy=%s (%d/%d cases with expected category)", + failures["counts"], + failures["self_check"]["accuracy"], + failures["self_check"]["matched"], + failures["self_check"]["cases_with_expected_category"], + ) + + if mode == "live": + candidate_prompt, optimizer_status = await optimizer_live( + source_prompt_path=baseline_path, + train_path=train_path, + val_path=val_path, + cfg=cfg, + candidate_path=candidate_path, + run_dir=run_dir, + ) + else: + candidate_prompt, optimizer_status = optimizer_fake(baseline_prompt, cfg, candidate_path) + LOGGER.info( + "optimizer status=%s invoked=%s", + optimizer_status["status"], + optimizer_status["agent_optimizer_invoked"], + ) + + candidate_train = 
await evaluate_evalset(train, candidate_path, cfg, case_meta, mode) + candidate_val = await evaluate_evalset(val, candidate_path, cfg, case_meta, mode) + LOGGER.info("candidate mean train=%.4f val=%.4f", candidate_train["mean_score"], candidate_val["mean_score"]) + + train_delta = diff_cases(baseline_train, candidate_train) + val_delta = diff_cases(baseline_val, candidate_val) + + # The four evaluation passes (baseline/candidate x train/val) spend real + # tokens in live mode too; counting only the optimizer's reported cost would + # systematically understate total spend. Evaluation cost is estimated from + # accumulated tokens at a configurable USD-per-1M-tokens rate. Note the + # budget gate is still a post-hoc audit: the in-run spend cap for live mode + # is max_metric_calls in optimizer.sdk.json. + eval_tokens = ( + baseline_train["tokens"] + + baseline_val["tokens"] + + candidate_train["tokens"] + + candidate_val["tokens"] + ) + usd_per_1m_tokens = float(os.environ.get("EVAL_OPT_USD_PER_1M_TOKENS", "1.0")) + eval_cost_usd = round(eval_tokens / 1e6 * usd_per_1m_tokens, 6) + total_cost_usd = round(optimizer_status["cost_usd"] + eval_cost_usd, 6) + + gate = gate_decision( + baseline_train, + candidate_train, + baseline_val, + candidate_val, + val_delta, + cfg, + total_cost_usd, + ) + for check in gate["checks"]: + LOGGER.info("gate %-30s %s | %s", check["name"], "PASS" if check["passed"] else "FAIL", check["detail"]) + + duration = round(time.perf_counter() - started, 4) + report = build_report( + mode=mode, + run_id=run_id, + cfg=cfg, + baseline_prompt=baseline_prompt, + candidate_prompt=candidate_prompt, + snapshots={ + "baseline": baseline_path.relative_to(HERE).as_posix(), + "candidate": candidate_path.relative_to(HERE).as_posix(), + }, + sdk_evaluator_runs=sdk_evaluator_runs, + artifacts={ + "baseline": {"train": baseline_train, "val": baseline_val}, + "candidate": {"train": candidate_train, "val": candidate_val}, + "delta": {"train": train_delta, "val": 
def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse the process arguments.

    ``--mode`` is restricted to ``fake``/``live`` and defaults to ``None``.
    ``--optimizer`` defaults to ``optimizer.json``. ``--train``, ``--val`` and
    ``--prompt`` default to ``None``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--mode", choices=["fake", "live"], default=None)
    # The remaining flags are plain string options that differ only in their
    # default value; registration order is kept so --help output is unchanged.
    option_defaults = {
        "--optimizer": "optimizer.json",
        "--train": None,
        "--val": None,
        "--prompt": None,
    }
    for flag, default in option_defaults.items():
        cli.add_argument(flag, default=default)
    return cli.parse_args()
b/examples/optimization/eval_optimize_loop/sample_output/optimization_report.sample.json @@ -0,0 +1,697 @@ +{ + "run": { + "timestamp": "2026-07-03T10:56:29.173807+00:00", + "run_id": "808f8789", + "mode": "fake", + "gepa_seed": 42, + "sdk_bridge": { + "agent_evaluator_available": true, + "agent_optimizer_available": true, + "evalset_validated_with_trpc_sdk": true, + "sdk_import_error": null, + "agent_evaluator_trace_runs": { + "train": { + "status": "FAILED_EXPECTED", + "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_train\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"train_json_format_ineffective\",\n \"overallStatus\": \"failed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n \"score\": 0.0,\n \"threshold\": 0.1,\n \"eva", + "evalset": "train.evalset.json", + "has_result": true, + "metrics_file": "_sdk_eval_metrics.json" + }, + "val": { + "status": "FAILED_EXPECTED", + "reason": "[\n {\n \"evalSetId\": \"eval_optimize_loop_val\",\n \"summary\": {\n \"agentName\": \"trace-only\",\n \"evalSetId\": \"eval_optimize_loop_val\",\n \"overallStatus\": \"failed\",\n \"runs\": 1,\n \"evalCases\": [\n {\n \"evalCaseId\": \"val_order_soft_degradation\",\n \"overallStatus\": \"passed\",\n \"metricResults\": [\n {\n \"metricName\": \"final_response_avg_score\",\n \"score\": 1.0,\n \"threshold\": 0.1,\n \"evalStatus", + "evalset": "val.evalset.json", + "has_result": true, + "metrics_file": "_sdk_eval_metrics.json" + } + } + }, + "repro": { + "train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json", + "prompt_source": "prompts/system.md", + "optimizer_config": "optimizer.json", + "sdk_optimizer_config": "optimizer.sdk.json" + } + }, + "prompt_audit": { + "target": { + "name": "support_assistant_system", + "path": "prompts/system.md", + "kind": 
"system_prompt" + }, + "baseline_sha256": "30b490452eeb916fd25950797f0cbe1f9bac2a7b9f738775365c066b43924b88", + "candidate_sha256": "5d3271e9ab855a1bdf0d6af54e6f8521d35a4bd5727e89d632486f826f5f52b9", + "baseline_snapshot": "runs/20260703T105629Z_808f8789/baseline_prompt.md", + "candidate_snapshot": "runs/20260703T105629Z_808f8789/candidate_prompt.md" + }, + "baseline": { + "train": { + "eval_set_id": "eval_optimize_loop_train", + "mean_score": 0.25, + "pass_rate": 0.0, + "cases": { + "train_order_lookup_optimizable": { + "case_id": "train_order_lookup_optimizable", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "tool_call_error" + ], + "reason": "final_response_mismatch; tool_call_error", + "trace": { + "query": "What is the shipping status for order A100?", + "expected_text": "in transit", + "actual_text": "I do not have enough order data.", + "expected_tools": [ + { + "id": "tool-1", + "name": "lookup_order", + "args": { + "order_id": "A100" + } + } + ], + "actual_tools": [] + } + }, + "train_refund_policy_optimizable": { + "case_id": "train_refund_policy_optimizable", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "reason": "final_response_mismatch; knowledge_recall_insufficient", + "trace": { + "query": "What is the refund policy for damaged items?", + "expected_text": "full refund within 30 days", + "actual_text": "You may be eligible, but I cannot confirm the policy.", + "expected_tools": [ + { + "id": "tool-1", + "name": "search_policy", + "args": { + "topic": "damaged item refund" + } + } + ], + "actual_tools": [] + } + }, + "train_json_format_ineffective": { + "case_id": 
"train_json_format_ineffective", + "score": 0.35, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 1.0, + "rubric": 0.0 + }, + "failure_types": [ + "final_response_mismatch", + "format_error" + ], + "reason": "final_response_mismatch; format_error", + "trace": { + "query": "Return only JSON: status ok", + "expected_text": "{\"status\":\"ok\"}", + "actual_text": "status ok", + "expected_tools": [], + "actual_tools": [] + } + } + }, + "tokens": 0 + }, + "val": { + "eval_set_id": "eval_optimize_loop_val", + "mean_score": 0.7333, + "pass_rate": 0.6667, + "cases": { + "val_warranty_new_pass": { + "case_id": "val_warranty_new_pass", + "score": 0.2, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 0.0, + "rubric": 1.0 + }, + "failure_types": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "reason": "final_response_mismatch; knowledge_recall_insufficient", + "trace": { + "query": "What is the warranty period for Model Z?", + "expected_text": "24-month warranty", + "actual_text": "I am not sure about the Model Z warranty.", + "expected_tools": [ + { + "id": "tool-1", + "name": "search_policy", + "args": { + "topic": "Model Z warranty" + } + } + ], + "actual_tools": [ + { + "name": "web_search", + "args": { + "query": "Model Z warranty" + } + } + ] + } + }, + "val_smalltalk_regression": { + "case_id": "val_smalltalk_regression", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "Thanks", + "expected_text": "welcome", + "actual_text": "You are welcome.", + "expected_tools": [], + "actual_tools": [] + } + }, + "val_order_soft_degradation": { + "case_id": "val_order_soft_degradation", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": 
true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "Check order A200.", + "expected_text": "delivered", + "actual_text": "Order A200 is delivered.", + "expected_tools": [ + { + "id": "tool-1", + "name": "lookup_order", + "args": { + "order_id": "A200" + } + } + ], + "actual_tools": [ + { + "name": "lookup_order", + "args": { + "order_id": "A200" + } + } + ] + } + } + }, + "tokens": 0 + } + }, + "candidate": { + "train": { + "eval_set_id": "eval_optimize_loop_train", + "mean_score": 0.7833, + "pass_rate": 0.6667, + "cases": { + "train_order_lookup_optimizable": { + "case_id": "train_order_lookup_optimizable", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "What is the shipping status for order A100?", + "expected_text": "in transit", + "actual_text": "Order A100 is in transit and arrives on Friday.", + "expected_tools": [ + { + "id": "tool-1", + "name": "lookup_order", + "args": { + "order_id": "A100" + } + } + ], + "actual_tools": [ + { + "name": "lookup_order", + "args": { + "order_id": "A100" + } + } + ] + } + }, + "train_refund_policy_optimizable": { + "case_id": "train_refund_policy_optimizable", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "What is the refund policy for damaged items?", + "expected_text": "full refund within 30 days", + "actual_text": "Damaged items are eligible for a full refund within 30 days.", + "expected_tools": [ + { + "id": "tool-1", + "name": "search_policy", + "args": { + "topic": "damaged item refund" + } + } + ], + "actual_tools": [ + { + "name": "search_policy", + "args": 
{ + "topic": "damaged item refund" + } + } + ] + } + }, + "train_json_format_ineffective": { + "case_id": "train_json_format_ineffective", + "score": 0.35, + "passed": false, + "hard_fail": true, + "key": false, + "metrics": { + "final_response": 0.0, + "tool_trajectory": 1.0, + "rubric": 0.0 + }, + "failure_types": [ + "final_response_mismatch", + "format_error" + ], + "reason": "final_response_mismatch; format_error", + "trace": { + "query": "Return only JSON: status ok", + "expected_text": "{\"status\":\"ok\"}", + "actual_text": "status ok", + "expected_tools": [], + "actual_tools": [] + } + } + }, + "tokens": 0 + }, + "val": { + "eval_set_id": "eval_optimize_loop_val", + "mean_score": 0.6667, + "pass_rate": 0.3333, + "cases": { + "val_warranty_new_pass": { + "case_id": "val_warranty_new_pass", + "score": 1.0, + "passed": true, + "hard_fail": false, + "key": false, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 1.0, + "rubric": 1.0 + }, + "failure_types": [], + "reason": "pass", + "trace": { + "query": "What is the warranty period for Model Z?", + "expected_text": "24-month warranty", + "actual_text": "Model Z has a 24-month warranty.", + "expected_tools": [ + { + "id": "tool-1", + "name": "search_policy", + "args": { + "topic": "Model Z warranty" + } + } + ], + "actual_tools": [ + { + "name": "search_policy", + "args": { + "topic": "Model Z warranty" + } + } + ] + } + }, + "val_smalltalk_regression": { + "case_id": "val_smalltalk_regression", + "score": 0.45, + "passed": false, + "hard_fail": true, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 0.0, + "rubric": 0.0 + }, + "failure_types": [ + "spurious_tool_call", + "llm_rubric_not_met" + ], + "reason": "spurious_tool_call; llm_rubric_not_met", + "trace": { + "query": "Thanks", + "expected_text": "welcome", + "actual_text": "You are welcome.", + "expected_tools": [], + "actual_tools": [ + { + "name": "search_policy", + "args": { + "topic": "thanks" + } + } + ] + } + }, 
+ "val_order_soft_degradation": { + "case_id": "val_order_soft_degradation", + "score": 0.55, + "passed": false, + "hard_fail": true, + "key": true, + "metrics": { + "final_response": 1.0, + "tool_trajectory": 0.0, + "rubric": 0.5 + }, + "failure_types": [ + "spurious_tool_call", + "llm_rubric_not_met" + ], + "reason": "spurious_tool_call; llm_rubric_not_met", + "trace": { + "query": "Check order A200.", + "expected_text": "delivered", + "actual_text": "Order A200 is delivered.", + "expected_tools": [ + { + "id": "tool-1", + "name": "lookup_order", + "args": { + "order_id": "A200" + } + } + ], + "actual_tools": [ + { + "name": "lookup_order", + "args": { + "order_id": "A200" + } + }, + { + "name": "search_policy", + "args": { + "topic": "order A200" + } + } + ] + } + } + }, + "tokens": 0 + } + }, + "delta": { + "train": { + "train_order_lookup_optimizable": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "train_refund_policy_optimizable": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "train_json_format_ineffective": { + "kind": "same", + "baseline_score": 0.35, + "candidate_score": 0.35, + "delta": 0.0, + "baseline_passed": false, + "candidate_passed": false + } + }, + "val": { + "val_warranty_new_pass": { + "kind": "new_pass", + "baseline_score": 0.2, + "candidate_score": 1.0, + "delta": 0.8, + "baseline_passed": false, + "candidate_passed": true + }, + "val_smalltalk_regression": { + "kind": "new_fail", + "baseline_score": 1.0, + "candidate_score": 0.45, + "delta": -0.55, + "baseline_passed": true, + "candidate_passed": false + }, + "val_order_soft_degradation": { + "kind": "new_fail", + "baseline_score": 1.0, + "candidate_score": 0.55, + "delta": -0.45, + "baseline_passed": true, + "candidate_passed": false + } + } + }, + "failure_attribution": { + 
"counts": { + "final_response_mismatch": 4, + "tool_call_error": 1, + "knowledge_recall_insufficient": 2, + "format_error": 1 + }, + "by_case": { + "train_order_lookup_optimizable": [ + "final_response_mismatch", + "tool_call_error" + ], + "train_refund_policy_optimizable": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "train_json_format_ineffective": [ + "final_response_mismatch", + "format_error" + ], + "val_warranty_new_pass": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ] + }, + "self_check": { + "cases_with_expected_category": 4, + "matched": 4, + "accuracy": 1.0, + "by_case": { + "train_order_lookup_optimizable": { + "expected": "tool_call_error", + "attributed": [ + "final_response_mismatch", + "tool_call_error" + ], + "matched": true + }, + "train_refund_policy_optimizable": { + "expected": "knowledge_recall_insufficient", + "attributed": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "matched": true + }, + "train_json_format_ineffective": { + "expected": "format_error", + "attributed": [ + "final_response_mismatch", + "format_error" + ], + "matched": true + }, + "val_warranty_new_pass": { + "expected": "knowledge_recall_insufficient", + "attributed": [ + "final_response_mismatch", + "knowledge_recall_insufficient" + ], + "matched": true + } + } + } + }, + "optimizer": { + "mode": "fake", + "status": "SCRIPTED_CANDIDATE", + "agent_optimizer_available": true, + "agent_optimizer_invoked": false, + "candidate_prompt_path": "runs/20260703T105629Z_808f8789/candidate_prompt.md", + "cost_usd": 0.0, + "tokens": 0, + "rounds": 1 + }, + "gate": { + "accepted": false, + "decision": "REJECT", + "reason": "validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down", + "train_gain": 0.5333, + "val_gain": -0.0666, + "checks": [ + { + "name": "validation_gain_threshold", + "passed": false, + "detail": "val_gain=-0.0666, required>=+0.1000" + }, + { + "name": 
"no_new_hard_fail", + "passed": false, + "detail": "new_hard_fails=['val_smalltalk_regression', 'val_order_soft_degradation']" + }, + { + "name": "no_critical_regression", + "passed": false, + "detail": "critical_regressions=['val_smalltalk_regression', 'val_order_soft_degradation']" + }, + { + "name": "not_overfit_train_up_val_down", + "passed": false, + "detail": "train_gain=+0.5333, val_gain=-0.0666" + }, + { + "name": "cost_budget", + "passed": true, + "detail": "cost_usd=0.000000, budget=0.050000" + } + ] + }, + "audit": { + "duration_seconds": 0.0461, + "cost_usd": 0.0, + "tokens": 0, + "optimizer_cost_usd": 0.0, + "optimizer_tokens": 0, + "eval_cost_usd": 0.0, + "eval_tokens": 0, + "config_snapshot": { + "mode": "fake", + "inputs": { + "train_evalset": "train.evalset.json", + "val_evalset": "val.evalset.json", + "case_meta": "case_meta.json" + }, + "target_prompt": { + "name": "support_assistant_system", + "path": "prompts/system.md", + "kind": "system_prompt" + }, + "evaluate": { + "pass_threshold": 0.8, + "metrics": [ + { + "name": "final_response", + "weight": 0.45 + }, + { + "name": "tool_trajectory", + "weight": 0.35 + }, + { + "name": "rubric", + "weight": 0.2 + } + ] + }, + "optimize": { + "sdk_config": "optimizer.sdk.json", + "update_source": false, + "verbose": 1, + "fake_candidate_patch": [ + "Optimization candidate:", + "- USE_CATALOG_LOOKUP: use lookup_order for order status and search_policy for policy/warranty questions before answering.", + "- AGGRESSIVE_LOOKUP: when uncertain, prefer looking up supporting data even for short or already-answerable requests." 
+ ] + }, + "gate": { + "min_val_score_gain": 0.1, + "reject_on_new_hard_fail": true, + "hard_fail_threshold": 0.6, + "reject_on_critical_regression": true, + "reject_overfit_train_up_val_down": true, + "max_cost_usd": 0.05 + } + } + } +} \ No newline at end of file diff --git a/examples/optimization/eval_optimize_loop/sample_output/optimization_report.sample.md b/examples/optimization/eval_optimize_loop/sample_output/optimization_report.sample.md new file mode 100644 index 0000000..bb78f42 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/sample_output/optimization_report.sample.md @@ -0,0 +1,45 @@ +# Optimization Report + +## Summary + +Decision: REJECT. Train mean changed 0.25 -> 0.7833 (+0.5333); validation mean changed 0.7333 -> 0.6667 (-0.0666). New validation passes: ['val_warranty_new_pass']. New validation failures: ['val_smalltalk_regression', 'val_order_soft_degradation']. Gate reason: validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down. 
+ +## Scores + +- Mode: `fake` +- Baseline train mean: 0.25 +- Candidate train mean: 0.7833 +- Baseline validation mean: 0.7333 +- Candidate validation mean: 0.6667 +- Decision: **REJECT** +- Reason: validation_gain_threshold; no_new_hard_fail; no_critical_regression; not_overfit_train_up_val_down + +## Failure Attribution + +- final_response_mismatch: 4 +- format_error: 1 +- knowledge_recall_insufficient: 2 +- tool_call_error: 1 +- Attribution self-check: 4/4 expected categories matched (accuracy 1.0) + +## Validation Delta + +- `val_warranty_new_pass`: new_pass (0.2 -> 1.0, +0.8000) +- `val_smalltalk_regression`: new_fail (1.0 -> 0.45, -0.5500) +- `val_order_soft_degradation`: new_fail (1.0 -> 0.55, -0.4500) + +## Gate Checks + +- FAIL `validation_gain_threshold`: val_gain=-0.0666, required>=+0.1000 +- FAIL `no_new_hard_fail`: new_hard_fails=['val_smalltalk_regression', 'val_order_soft_degradation'] +- FAIL `no_critical_regression`: critical_regressions=['val_smalltalk_regression', 'val_order_soft_degradation'] +- FAIL `not_overfit_train_up_val_down`: train_gain=+0.5333, val_gain=-0.0666 +- PASS `cost_budget`: cost_usd=0.000000, budget=0.050000 + +## Audit + +- Cost USD: 0.0 (optimizer 0.0, eval 0.0) +- Tokens: 0 (optimizer 0, eval 0) +- Duration seconds: 0.0461 +- Baseline SHA-256: `30b490452eeb916fd25950797f0cbe1f9bac2a7b9f738775365c066b43924b88` +- Candidate SHA-256: `5d3271e9ab855a1bdf0d6af54e6f8521d35a4bd5727e89d632486f826f5f52b9` diff --git a/examples/optimization/eval_optimize_loop/tests/__init__.py b/examples/optimization/eval_optimize_loop/tests/__init__.py new file mode 100644 index 0000000..bc6e483 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/tests/__init__.py @@ -0,0 +1,5 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
diff --git a/examples/optimization/eval_optimize_loop/tests/test_pipeline_units.py b/examples/optimization/eval_optimize_loop/tests/test_pipeline_units.py new file mode 100644 index 0000000..797689d --- /dev/null +++ b/examples/optimization/eval_optimize_loop/tests/test_pipeline_units.py @@ -0,0 +1,325 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Unit tests for the pure decision functions of the eval_optimize_loop example. + +Covered surfaces: failure attribution (``classify_tool_failure`` / +``failure_types_for``), the fake judge rubric (``rubric_score``), the acceptance +gate (``gate_decision``), per-case diffing (``diff_cases``), failure clustering +(``attribute_failures``), and config validation (``validate_config``). All of +them are IO-free, so no model, network, or SDK service is involved. +""" + +from __future__ import annotations + +import copy +import importlib.util +import sys +from pathlib import Path +from typing import Any + +import pytest + + +_EXAMPLE_ROOT = Path(__file__).resolve().parent.parent + +# Load run.py under a unique module name so pytest runs across multiple +# examples cannot collide on a shared "run" module. The module must be in +# sys.modules before exec_module: @dataclass resolves its owning module there. 
_RUN_PATH = _EXAMPLE_ROOT / "run.py"
_SPEC = importlib.util.spec_from_file_location("eval_optimize_loop_run", _RUN_PATH)
# spec_from_file_location may return None and a spec may carry no loader; fail
# with a readable ImportError instead of an AttributeError on None.
if _SPEC is None or _SPEC.loader is None:
    raise ImportError(f"cannot build an import spec for {_RUN_PATH}")
run = importlib.util.module_from_spec(_SPEC)
sys.modules[_SPEC.name] = run
try:
    _SPEC.loader.exec_module(run)
except BaseException:
    # Do not leave a half-initialised module registered when run.py fails to
    # import; the original exception is re-raised unchanged.
    sys.modules.pop(_SPEC.name, None)
    raise


# Gate thresholds handed to ``run.gate_decision`` by ``TestGateDecision.run_gate``.
GATE_CFG: dict[str, Any] = {
    "gate": {
        "min_val_score_gain": 0.1,
        "reject_on_new_hard_fail": True,
        "hard_fail_threshold": 0.6,
        "reject_on_critical_regression": True,
        "reject_overfit_train_up_val_down": True,
        "max_cost_usd": 0.05,
    }
}


def make_case(score: float = 1.0, passed: bool = True, hard_fail: bool = False, key: bool = False) -> dict[str, Any]:
    """Build a minimal per-case result dict with an empty ``failure_types`` list.

    A fresh list is created on every call, so cases never share state.
    """
    return {"score": score, "passed": passed, "hard_fail": hard_fail, "key": key, "failure_types": []}


def make_result(mean_score: float, cases: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Wrap per-case dicts and a mean score into an evaluation-result dict."""
    return {"mean_score": mean_score, "cases": cases}


class TestClassifyToolFailure:
    """Labels returned by ``run.classify_tool_failure(actual, expected, meta)``."""

    def test_matching_trajectory_returns_none(self):
        tools = [{"name": "lookup_order", "args": {"order_id": "A100"}}]
        assert run.classify_tool_failure(tools, list(tools), {}) is None

    def test_missing_authoritative_tool_is_knowledge_recall(self):
        actual = [{"name": "web_search", "args": {"query": "warranty"}}]
        expected = [{"name": "search_policy", "args": {"topic": "warranty"}}]
        meta = {"authoritative_tool": "search_policy"}
        assert run.classify_tool_failure(actual, expected, meta) == "knowledge_recall_insufficient"

    def test_superset_of_expected_calls_is_spurious(self):
        expected = [{"name": "lookup_order", "args": {"order_id": "A200"}}]
        actual = expected + [{"name": "search_policy", "args": {"topic": "order A200"}}]
        assert run.classify_tool_failure(actual, expected, {}) == "spurious_tool_call"

    def test_extra_call_when_none_expected_is_spurious(self):
        actual = [{"name": "search_policy", "args": {"topic": "thanks"}}]
        assert run.classify_tool_failure(actual, [], {}) == "spurious_tool_call"

    def test_same_leading_tool_with_different_args_is_parameter_error(self):
        actual = [{"name": "lookup_order", "args": {"order_id": "A999"}}]
        expected = [{"name": "lookup_order", "args": {"order_id": "A100"}}]
        assert run.classify_tool_failure(actual, expected, {}) == "parameter_error"

    def test_wrong_tool_is_tool_call_error(self):
        actual = [{"name": "web_search", "args": {}}]
        expected = [{"name": "lookup_order", "args": {"order_id": "A100"}}]
        assert run.classify_tool_failure(actual, expected, {}) == "tool_call_error"

    def test_missing_call_is_tool_call_error(self):
        expected = [{"name": "lookup_order", "args": {"order_id": "A100"}}]
        assert run.classify_tool_failure([], expected, {}) == "tool_call_error"


class TestRubricScore:
    """Scores returned by ``run.rubric_score(case_meta, output)`` per rubric kind."""

    def test_json_format_accepts_json_object(self):
        assert run.rubric_score({"rubric": "json_format"}, {"text": '{"status": "ok"}', "tools": []}) == 1.0

    def test_json_format_rejects_plain_text(self):
        assert run.rubric_score({"rubric": "json_format"}, {"text": "status ok", "tools": []}) == 0.0

    def test_json_format_rejects_bare_scalar(self):
        # "123" is valid JSON but not an object; the rubric is expected to reject it.
        assert run.rubric_score({"rubric": "json_format"}, {"text": "123", "tools": []}) == 0.0

    def test_no_tool_rubric(self):
        assert run.rubric_score({"rubric": "no_tool"}, {"text": "hi", "tools": []}) == 1.0
        assert run.rubric_score({"rubric": "no_tool"}, {"text": "hi", "tools": [{"name": "x", "args": {}}]}) == 0.0

    def test_single_tool_rubric(self):
        one = [{"name": "a", "args": {}}]
        assert run.rubric_score({"rubric": "single_tool"}, {"text": "", "tools": one}) == 1.0
        assert run.rubric_score({"rubric": "single_tool"}, {"text": "", "tools": one * 2}) == 0.5

    def test_unset_rubric_defaults_to_full_score(self):
        assert run.rubric_score({}, {"text": "anything", "tools": []}) == 1.0


class TestFailureTypesFor:
    """Failure labels collected by ``run.failure_types_for`` from the metric scores."""

    def test_collects_and_deduplicates_labels(self):
        expected = [{"name": "lookup_order", "args": {"order_id": "A100"}}]
        output = {"text": "no idea", "tools": []}
        # All three scores are 0.0, so one label per failing dimension is expected.
        labels = run.failure_types_for({"rubric": "json_format"}, 0.0, 0.0, 0.0, output, expected)
        assert labels == ["final_response_mismatch", "tool_call_error", "format_error"]

    def test_all_dimensions_passing_yields_no_labels(self):
        assert run.failure_types_for({}, 1.0, 1.0, 1.0, {"text": "", "tools": []}, []) == []


class TestGateDecision:
    """Acceptance-gate checks reported by ``run.gate_decision``."""

    def run_gate(
        self,
        base_train: float = 0.5,
        cand_train: float = 0.5,
        baseline_val_cases: dict[str, dict[str, Any]] | None = None,
        candidate_val_cases: dict[str, dict[str, Any]] | None = None,
        base_val: float = 0.5,
        cand_val: float = 0.7,
        cost_usd: float = 0.0,
    ) -> dict[str, Any]:
        """Run the gate on synthetic results and return its decision dict.

        The defaults describe a candidate that passes every check: validation
        gain of 0.2, flat train score, one unchanged passing case, zero cost.
        Each test overrides only the inputs relevant to the check it targets.
        """
        # Compare against None rather than truthiness so that an explicitly
        # passed empty dict is honoured instead of being replaced by the default.
        if baseline_val_cases is None:
            baseline_val_cases = {"c1": make_case()}
        if candidate_val_cases is None:
            candidate_val_cases = {"c1": make_case()}
        baseline_val = make_result(base_val, baseline_val_cases)
        candidate_val = make_result(cand_val, candidate_val_cases)
        val_delta = run.diff_cases(baseline_val, candidate_val)
        return run.gate_decision(
            make_result(base_train, {}),
            make_result(cand_train, {}),
            baseline_val,
            candidate_val,
            val_delta,
            # Hand over a private copy so the shared module-level config cannot
            # be mutated by the code under test and leak between tests.
            copy.deepcopy(GATE_CFG),
            cost_usd,
        )

    def failed_names(self, gate: dict[str, Any]) -> set[str]:
        """Return the names of all gate checks whose ``passed`` flag is false."""
        return {check["name"] for check in gate["checks"] if not check["passed"]}

    def test_accepts_when_all_checks_pass(self):
        gate = self.run_gate()
        assert gate["decision"] == "ACCEPT"
        assert gate["accepted"] is True
        assert self.failed_names(gate) == set()

    def test_rejects_insufficient_validation_gain(self):
        gate = self.run_gate(cand_val=0.55)
        assert gate["decision"] == "REJECT"
        assert self.failed_names(gate) == {"validation_gain_threshold"}

    def test_rejects_new_hard_fail(self):
        gate = self.run_gate(
            baseline_val_cases={"c1": make_case(score=0.9)},
            candidate_val_cases={"c1": make_case(score=0.3, passed=False, hard_fail=True)},
        )
        assert "no_new_hard_fail" in self.failed_names(gate)

    def test_rejects_key_case_regression(self):
        gate = self.run_gate(
            baseline_val_cases={"c1": make_case(score=1.0, key=True)},
            candidate_val_cases={"c1": make_case(score=0.9, key=True)},
        )
        assert "no_critical_regression" in self.failed_names(gate)

    def test_non_key_case_regression_is_not_critical(self):
        gate = self.run_gate(
            baseline_val_cases={"c1": make_case(score=1.0)},
            candidate_val_cases={"c1": make_case(score=0.9)},
        )
        assert "no_critical_regression" not in self.failed_names(gate)

    def test_rejects_train_up_validation_down_overfit(self):
        gate = self.run_gate(base_train=0.3, cand_train=0.8, base_val=0.7, cand_val=0.6)
        assert "not_overfit_train_up_val_down" in self.failed_names(gate)

    def test_rejects_cost_over_budget(self):
        gate = self.run_gate(cost_usd=1.0)
        assert self.failed_names(gate) == {"cost_budget"}


class TestDiffCases:
    """Per-case delta classification produced by ``run.diff_cases``."""

    def test_all_delta_kinds(self):
        # Each case id is named after the delta kind it is expected to receive.
        baseline = make_result(
            0.5,
            {
                "new_pass": make_case(score=0.4, passed=False),
                "new_fail": make_case(score=0.9, passed=True),
                "score_up": make_case(score=0.4, passed=False),
                "score_down": make_case(score=0.9, passed=True),
                "same": make_case(score=0.7, passed=True),
            },
        )
        candidate = make_result(
            0.5,
            {
                "new_pass": make_case(score=0.9, passed=True),
                "new_fail": make_case(score=0.4, passed=False),
                "score_up": make_case(score=0.5, passed=False),
                "score_down": make_case(score=0.8, passed=True),
                "same": make_case(score=0.7, passed=True),
            },
        )
        delta = run.diff_cases(baseline, candidate)
        assert {case_id: item["kind"] for case_id, item in delta.items()} == {
            "new_pass": "new_pass",
            "new_fail": "new_fail",
            "score_up": "score_up",
            "score_down": "score_down",
            "same": "same",
        }
        # 0.5 - 0.4 is not exactly 0.1 in binary floating point; compare with a
        # tolerance so the test does not depend on how diff_cases rounds.
        assert delta["score_up"]["delta"] == pytest.approx(0.1)

    def test_mismatched_case_ids_abort_with_readable_error(self):
        baseline = make_result(1.0, {"a": make_case()})
        candidate = make_result(1.0, {"b": make_case()})
        with pytest.raises(SystemExit, match="different case ids"):
            run.diff_cases(baseline, candidate)


class TestAttributeFailures:
    """Failure clustering across several results by ``run.attribute_failures``."""

    def test_counts_failures_and_skips_passed_cases(self):
        passed = make_case()
        failed = dict(make_case(score=0.2, passed=False), failure_types=["tool_call_error", "format_error"])
        # A failed case without any label is expected to be reported as "unknown".
        unknown = dict(make_case(score=0.2, passed=False), failure_types=[])
        result_a = make_result(0.5, {"ok": passed, "bad": failed})
        result_b = make_result(0.5, {"mystery": unknown})
        clustered = run.attribute_failures(result_a, result_b)
        assert clustered["counts"] == {"tool_call_error": 1, "format_error": 1, "unknown": 1}
        assert clustered["by_case"] == {"bad": ["tool_call_error", "format_error"], "mystery": ["unknown"]}


class TestAttributionSelfCheck:
    """Accuracy of attributed labels against the ``category`` declared in case metadata."""

    def test_accuracy_against_expected_categories(self):
        failures = {
            "by_case": {
                "hit": ["tool_call_error", "final_response_mismatch"],
                "miss": ["parameter_error"],
                "unlabeled": ["format_error"],
            }
        }
        case_meta = {
            "hit": {"category": "tool_call_error"},
            "miss": {"category": "knowledge_recall_insufficient"},
            "unlabeled": {},
        }
        check = run.attribution_self_check(failures, case_meta)
        # Only cases that declare a category take part in the accuracy figure.
        assert check["cases_with_expected_category"] == 2
        assert check["matched"] == 1
        assert check["accuracy"] == 0.5
        assert check["by_case"]["hit"]["matched"] is True
        assert check["by_case"]["miss"]["matched"] is False
        assert "unlabeled" not in check["by_case"]

    def test_no_expected_categories_yields_null_accuracy(self):
        check = run.attribution_self_check({"by_case": {"a": ["x"]}}, {"a": {}})
        assert check["accuracy"] is None
        assert check["cases_with_expected_category"] == 0

    def test_shipped_sample_attribution_is_fully_consistent(self):
        """The bundled 6-case sample must self-attribute at 100% accuracy.

        Runs the baseline evaluation of the shipped train and validation sets
        in ``fake`` mode and checks the resulting attribution against the
        shipped ``case_meta.json``.
        """
        import asyncio

        cfg = run.load_json(_EXAMPLE_ROOT / "optimizer.json")
        # Keys starting with "_" in case_meta.json are not case ids; drop them.
        case_meta = {
            key: value
            for key, value in run.load_json(_EXAMPLE_ROOT / "case_meta.json").items()
            if not key.startswith("_")
        }
        prompt_path = _EXAMPLE_ROOT / "prompts" / "system.md"

        async def evaluate() -> dict[str, Any]:
            train = run.validate_evalset(_EXAMPLE_ROOT / "train.evalset.json")
            val = run.validate_evalset(_EXAMPLE_ROOT / "val.evalset.json")
            baseline_train = await run.evaluate_evalset(train, prompt_path, cfg, case_meta, "fake")
            baseline_val = await run.evaluate_evalset(val, prompt_path, cfg, case_meta, "fake")
            return run.attribute_failures(baseline_train, baseline_val)

        failures = asyncio.run(evaluate())
        check = run.attribution_self_check(failures, case_meta)
        assert check["cases_with_expected_category"] == 4
        assert check["accuracy"] == 1.0


class TestValidateConfig:
    """Startup validation of ``optimizer.json`` by ``run.validate_config``."""

    @pytest.fixture()
    def valid_cfg(self) -> dict[str, Any]:
        """Return a private deep copy of the shipped config that tests may mutate."""
        return copy.deepcopy(run.load_json(_EXAMPLE_ROOT / "optimizer.json"))

    def test_shipped_config_is_valid(self, valid_cfg: dict[str, Any]):
        run.validate_config(valid_cfg)

    def test_rejects_missing_gate_key(self, valid_cfg: dict[str, Any]):
        del valid_cfg["gate"]["max_cost_usd"]
        with pytest.raises(SystemExit, match="max_cost_usd"):
            run.validate_config(valid_cfg)

    def test_rejects_weights_not_summing_to_one(self, valid_cfg: dict[str, Any]):
        valid_cfg["evaluate"]["metrics"][0]["weight"] = 0.9
        with pytest.raises(SystemExit, match="sum to 1.0"):
            run.validate_config(valid_cfg)

    def test_rejects_unexpected_metric_names(self, valid_cfg: dict[str, Any]):
        valid_cfg["evaluate"]["metrics"][0]["name"] = "latency"
        with pytest.raises(SystemExit, match="must define exactly"):
            run.validate_config(valid_cfg)

    def test_rejects_out_of_range_threshold(self, valid_cfg: dict[str, Any]):
        valid_cfg["evaluate"]["pass_threshold"] = 1.5
        # Raw string: the brackets are escaped because ``match`` is a regex.
        with pytest.raises(SystemExit, match=r"within \[0, 1\]"):
            run.validate_config(valid_cfg)
optimization loop train set", + "description": "Three training cases: two optimizable failures and one intentionally ineffective JSON-format failure.", + "eval_cases": [ + { + "eval_id": "train_order_lookup_optimizable", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-1", + "user_content": {"role": "user", "parts": [{"text": "What is the shipping status for order A100?"}]}, + "final_response": {"role": "model", "parts": [{"text": "I do not have enough order data."}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-1", + "user_content": {"role": "user", "parts": [{"text": "What is the shipping status for order A100?"}]}, + "final_response": {"role": "model", "parts": [{"text": "in transit"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A100"}} + ] + } + } + ] + }, + { + "eval_id": "train_refund_policy_optimizable", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-2", + "user_content": {"role": "user", "parts": [{"text": "What is the refund policy for damaged items?"}]}, + "final_response": {"role": "model", "parts": [{"text": "You may be eligible, but I cannot confirm the policy."}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-2", + "user_content": {"role": "user", "parts": [{"text": "What is the refund policy for damaged items?"}]}, + "final_response": {"role": "model", "parts": [{"text": "full refund within 30 days"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "search_policy", "args": {"topic": "damaged item refund"}} + ] + } + } + ] + }, + { + "eval_id": "train_json_format_ineffective", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-3", + "user_content": {"role": "user", "parts": [{"text": "Return only JSON: status ok"}]}, + "final_response": {"role": 
"model", "parts": [{"text": "status ok"}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-3", + "user_content": {"role": "user", "parts": [{"text": "Return only JSON: status ok"}]}, + "final_response": {"role": "model", "parts": [{"text": "{\"status\":\"ok\"}"}]}, + "intermediate_data": {"tool_uses": []} + } + ] + } + ] +} diff --git a/examples/optimization/eval_optimize_loop/val.evalset.json b/examples/optimization/eval_optimize_loop/val.evalset.json new file mode 100644 index 0000000..4449ae1 --- /dev/null +++ b/examples/optimization/eval_optimize_loop/val.evalset.json @@ -0,0 +1,83 @@ +{ + "eval_set_id": "eval_optimize_loop_val", + "name": "Evaluation optimization loop validation set", + "description": "Three validation cases: one new pass, one hard regression, and one soft degradation.", + "eval_cases": [ + { + "eval_id": "val_warranty_new_pass", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-1", + "user_content": {"role": "user", "parts": [{"text": "What is the warranty period for Model Z?"}]}, + "final_response": {"role": "model", "parts": [{"text": "I am not sure about the Model Z warranty."}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "web_search", "args": {"query": "Model Z warranty"}} + ] + } + } + ], + "conversation": [ + { + "invocation_id": "expected-1", + "user_content": {"role": "user", "parts": [{"text": "What is the warranty period for Model Z?"}]}, + "final_response": {"role": "model", "parts": [{"text": "24-month warranty"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "search_policy", "args": {"topic": "Model Z warranty"}} + ] + } + } + ] + }, + { + "eval_id": "val_smalltalk_regression", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-2", + "user_content": {"role": "user", "parts": [{"text": "Thanks"}]}, + "final_response": {"role": "model", "parts": [{"text": "You are 
welcome."}]}, + "intermediate_data": {"tool_uses": []} + } + ], + "conversation": [ + { + "invocation_id": "expected-2", + "user_content": {"role": "user", "parts": [{"text": "Thanks"}]}, + "final_response": {"role": "model", "parts": [{"text": "welcome"}]}, + "intermediate_data": {"tool_uses": []} + } + ] + }, + { + "eval_id": "val_order_soft_degradation", + "eval_mode": "trace", + "actual_conversation": [ + { + "invocation_id": "actual-3", + "user_content": {"role": "user", "parts": [{"text": "Check order A200."}]}, + "final_response": {"role": "model", "parts": [{"text": "Order A200 is delivered."}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A200"}} + ] + } + } + ], + "conversation": [ + { + "invocation_id": "expected-3", + "user_content": {"role": "user", "parts": [{"text": "Check order A200."}]}, + "final_response": {"role": "model", "parts": [{"text": "delivered"}]}, + "intermediate_data": { + "tool_uses": [ + {"id": "tool-1", "name": "lookup_order", "args": {"order_id": "A200"}} + ] + } + } + ] + } + ] +}