diff --git a/sdk/ai/azure-ai-projects/CHANGELOG.md b/sdk/ai/azure-ai-projects/CHANGELOG.md index a805c9f54a7f..d5ec6d51cd19 100644 --- a/sdk/ai/azure-ai-projects/CHANGELOG.md +++ b/sdk/ai/azure-ai-projects/CHANGELOG.md @@ -8,6 +8,7 @@ * Added `sample_routines_with_timer_trigger.py` to demonstrate triggering a routine with a timer. * Added `sample_routines_with_schedule_trigger.py` to demonstrate triggering a routine on a recurring cron schedule via `ScheduleRoutineTrigger`. * Updated `sample_dataset_generation_job_traces_for_evaluation.py` and `sample_dataset_generation_job_traces_for_finetuning.py` to create a temporary agent, seed conversations, retry the data generation job over the trace window, and clean up all created resources. +* Updated `sample_multiturn_trace_evaluation_by_id.py`, `sample_multiturn_trace_evaluation_agent_filter.py`, `sample_agent_trace_evaluation_smart_filter.py`, and `sample_evaluations_builtin_with_traces.py` to be self-contained: each sample now creates a temporary agent, seeds conversations, waits for App Insights ingestion, retries the eval over the trace window, and cleans up all created resources (no `FOUNDRY_AGENT_NAME`, `APPINSIGHTS_RESOURCE_ID`, or `AGENT_ID` environment variables required). * Updated `sample_memory_crud.py` and `sample_memory_crud_async.py` to demonstrate memory item CRUD (`create_memory`, `get_memory`, `update_memory`, `list_memories`, `delete_memory`) in addition to memory store CRUD. 
## 2.2.0 (2026-05-29) diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py index 983e27916ef2..bec0d32b4f20 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py @@ -6,171 +6,313 @@ """ DESCRIPTION: - Given an AIProjectClient, this sample demonstrates how to evaluate an - agent from its traces by filtering traces from Application Insights using an - agent name/version or agent ID, with smart filtering. + Self-contained sample that evaluates single-turn agent traces selected via + `agent_filter` with `filter_strategy="smart_filtering"`. - Three agent filter forms are supported: - - agent_name + agent_version: Specify the agent by name and version separately. - - agent_id: Specify the agent as a single "name:version" string. - - smart_filtering: Use filter_strategy="smart_filtering" to bias trace - selection toward more interesting conversations. + Steps: + 1. Creates a transient agent. + 2. Seeds a handful of single-turn prompts so the service emits traces + into Application Insights. + 3. Creates a trace-based evaluation group with single-turn evaluators. + 4. Submits an evaluation run with `agent_filter` + (agent_name + agent_version, smart_filtering, time window narrowed to + the seeding interval). Retries the run if Application Insights + ingestion is still in flight. + 5. Cleans up the evaluation, seeded conversations, and agent. + + Prerequisite: the project must have an Application Insights resource + connected so the agent emits server-side traces. + + The `agent_filter` shape also supports passing a single "name:version" + string via `agent_id` (see comment in code). 
The `--no-smart-filter` flag + disables the smart-filtering strategy if you want to evaluate every + matching trace. USAGE: python sample_agent_trace_evaluation_smart_filter.py - python sample_agent_trace_evaluation_smart_filter.py --agent-id "my-agent:1" + python sample_agent_trace_evaluation_smart_filter.py --no-smart-filter + python sample_agent_trace_evaluation_smart_filter.py --max-traces 3 Before running the sample: - pip install "azure-ai-projects>=2.2.0" python-dotenv + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv Set these environment variables with your own values: - 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint. - 2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators. - 3) FOUNDRY_AGENT_NAME - Required. The name of the agent whose traces to evaluate. - 4) FOUNDRY_AGENT_VERSION - Optional. The agent version. If not set, latest is used. + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as + found in the overview page of your Microsoft Foundry project. + 2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to + drive the agent during trace seeding and to power the AI-assisted + evaluators. """ import argparse import os import time +import uuid +from datetime import datetime, timezone from pprint import pprint +from typing import List + from dotenv import load_dotenv + from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import TestingCriterionAzureAIEvaluator +from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator load_dotenv() + +AGENT_INSTRUCTIONS = ( + "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the " + "customer's issue when possible. 
Policies you can quote:\n" + " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n" + " - Exchanges: same window as refunds; exchanges do not include store credit.\n" + " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n" + " - You cannot place orders or process refunds directly; direct the customer to the website " + " or store. Always close with a confirmation that the customer's question is answered." +) +# Single-turn prompts: each prompt is seeded as its own one-turn conversation so +# the service emits one trace span per item. +SINGLE_TURN_PROMPTS: List[str] = [ + "What is the return window for unopened widgets?", + "Do you sell replacement parts for gizmos? How much is shipping for a small part?", + "What is the difference between an exchange and a refund?", + "Can I get a refund for a defective gizmo I bought 60 days ago?", + "How long does a refund take to show up on my card?", +] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"] -agent_name = os.environ["FOUNDRY_AGENT_NAME"] -agent_version = os.environ.get("FOUNDRY_AGENT_VERSION", "") - -parser = argparse.ArgumentParser(description="Evaluate agent traces using agent filter.") -parser.add_argument("--agent-id", default=None, help='Agent ID in "name:version" format') -parser.add_argument("--max-traces", type=int, default=5, help="Max traces to evaluate (default: 5)") -parser.add_argument("--lookback-hours", type=int, default=24, help="Hours to look back (default: 24)") -args = parser.parse_args() - -with ( - DefaultAzureCredential() as credential, - AIProjectClient(endpoint=endpoint, credential=credential) as project_client, - project_client.get_openai_client() as client, -): - # Eval group for trace-based evaluations - data_source_config = { - "type": "azure_ai_source", - "scenario": "traces", - } - - testing_criteria = [ - TestingCriterionAzureAIEvaluator( - 
type="azure_ai_evaluator", - name="task_completion", - evaluator_name="builtin.task_completion", - initialization_parameters={"model": model_deployment_name}, - data_mapping={ - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="conversation_coherence", - evaluator_name="builtin.coherence", - initialization_parameters={"model": model_deployment_name}, - data_mapping={ - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="groundedness", - evaluator_name="builtin.groundedness", - initialization_parameters={"model": model_deployment_name}, - data_mapping={ - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="violence", - evaluator_name="builtin.violence", - initialization_parameters={"model": model_deployment_name}, - data_mapping={ - "query": "{{item.query}}", - "response": "{{item.response}}", - }, - ), - ] - - print("Creating trace-based evaluation group") - eval_object = client.evals.create( - name="Trace Evaluation (Agent Smart Filter)", - data_source_config=data_source_config, # type: ignore - testing_criteria=testing_criteria, + +POLL_INTERVAL_SECONDS = 5 +INITIAL_INGEST_WAIT_SECONDS = 60 +MAX_EVAL_ATTEMPTS = 5 +RETRY_WAIT_SECONDS = 60 +# Service constraints for agent_filter trace_source: +# - end_time - start_time must be >= 15 minutes. +# - queries exclude traces whose first/last span is within 5 minutes of +# either window edge, so we need >5 min of padding on each side of the +# actual seeding window. +# - When filter_strategy="smart_filtering" is set, max_traces must be +# between 15 and 1000. Sample seeds fewer than 15 traces; the service +# simply returns what exists. 
+MIN_AGENT_FILTER_WINDOW_SECONDS = 16 * 60 +AGENT_FILTER_EDGE_BUFFER_SECONDS = 6 * 60 +SMART_FILTERING_MIN_MAX_TRACES = 15 + +TERMINAL_STATUSES = {"completed", "failed", "canceled"} + + +def main() -> None: # pylint: disable=too-many-statements + parser = argparse.ArgumentParser( + description="Evaluate single-turn agent traces using agent_filter + smart_filtering (self-contained)." ) - print(f"Evaluation created (id: {eval_object.id})") - - # Compute time window in unix seconds - # Pad end_time by +600s (10 min) to avoid ingestion-delay edge exclusion - now_unix = int(time.time()) - end_time = now_unix + 600 - start_time = now_unix - (args.lookback_hours * 3600) - - # Build trace_source based on mode - trace_source: dict = { - "type": "agent_filter", - "start_time": start_time, - "end_time": end_time, - "max_traces": args.max_traces, - "filter_strategy": "smart_filtering", - } - - if args.agent_id: - trace_source["agent_id"] = args.agent_id - print(f"Using agent_id filter: {args.agent_id}") - else: - trace_source["agent_name"] = agent_name - if agent_version: - trace_source["agent_version"] = agent_version - print(f"Using agent filter: {agent_name} v{agent_version or '(latest)'}") - - data_source = { - "type": "azure_ai_trace_data_source_preview", - "trace_source": trace_source, - } - - eval_run = client.evals.runs.create( - eval_id=eval_object.id, - name="trace-evaluation-agent-smart-filter-run", - data_source=data_source, # type: ignore + parser.add_argument( + "--no-smart-filter", + action="store_true", + help="Disable filter_strategy='smart_filtering' (evaluate every matching trace).", ) - print(f"Evaluation run created (id: {eval_run.id})") - - while True: - run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) - if run.status in ("completed", "failed"): - break - print(f"Waiting for eval run to complete... 
current status: {run.status}") - time.sleep(5) - - if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") - print(f"Result Counts: {run.result_counts}") - - output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) - print(f"\nOUTPUT ITEMS (Total: {len(output_items)})") - print(f"{'-'*60}") - pprint(output_items) - print(f"{'-'*60}") - - print(f"\nEval Run Report URL: {run.report_url}") - else: - print(f"\n✗ Evaluation run failed: {run.error}") - - client.evals.delete(eval_id=eval_object.id) - print("Evaluation deleted") + parser.add_argument( + "--max-traces", + type=int, + default=len(SINGLE_TURN_PROMPTS), + help=f"Max traces to evaluate (default: {len(SINGLE_TURN_PROMPTS)} = one per seeded prompt).", + ) + args = parser.parse_args() + smart_filter = not args.no_smart_filter + effective_max_traces = args.max_traces + if smart_filter and effective_max_traces < SMART_FILTERING_MIN_MAX_TRACES: + print( + f"smart_filtering requires max_traces in [{SMART_FILTERING_MIN_MAX_TRACES}, 1000]; " + f"bumping --max-traces from {effective_max_traces} to {SMART_FILTERING_MIN_MAX_TRACES}." + ) + effective_max_traces = SMART_FILTERING_MIN_MAX_TRACES + + run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" + agent_name = f"st-trace-smart-filter-{run_id}" + + with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + project_client.get_openai_client() as client, + ): + + created_agent = None + created_conversation_ids: List[str] = [] + eval_object = None + + try: + # 1. Create an agent that traces will be filtered to. 
+ print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS), + ) + print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") + + # 2. Seed single-turn prompts and capture the seeding window. + # Pre-seed buffer must exceed the service's 5-min edge exclusion. + seed_start_unix = int(time.time()) - AGENT_FILTER_EDGE_BUFFER_SECONDS + print(f"Seed {len(SINGLE_TURN_PROMPTS)} single-turn prompt(s) against the agent.") + for prompt in SINGLE_TURN_PROMPTS: + conversation = client.conversations.create() + created_conversation_ids.append(conversation.id) + print(f" - conversation id: {conversation.id} (prompt: {prompt!r})") + client.responses.create( + conversation=conversation.id, + input=prompt, + extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}}, + ) + + print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True) + time.sleep(INITIAL_INGEST_WAIT_SECONDS) + + # 3. Create the trace-based evaluation group (single-turn evaluators). 
+ data_source_config = { + "type": "azure_ai_source", + "scenario": "traces", + } + + testing_criteria = [ + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="task_completion", + evaluator_name="builtin.task_completion", + initialization_parameters={"model": model_deployment_name}, + data_mapping={ + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"model": model_deployment_name}, + data_mapping={ + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="violence", + evaluator_name="builtin.violence", + initialization_parameters={"model": model_deployment_name}, + data_mapping={ + "query": "{{item.query}}", + "response": "{{item.response}}", + }, + ), + ] + + print("Create trace-based evaluation group.") + eval_object = client.evals.create( + name=f"Trace Evaluation (Agent Smart Filter) {run_id}", + data_source_config=data_source_config, # type: ignore + testing_criteria=testing_criteria, + ) + print(f"Evaluation created (id: {eval_object.id}).") + + # 4. Submit eval runs with agent_filter narrowed to the seeding window. + # Pad end_time so the last seeded span is >5 min from the upper edge + # and enforce the service-side 15-min minimum window. 
+ run = None + for attempt in range(1, MAX_EVAL_ATTEMPTS + 1): + end_time_unix = max( + int(time.time()) + AGENT_FILTER_EDGE_BUFFER_SECONDS, + seed_start_unix + MIN_AGENT_FILTER_WINDOW_SECONDS, + ) + + trace_source = { + "type": "agent_filter", + "agent_name": created_agent.name, + "agent_version": str(created_agent.version), + "start_time": seed_start_unix, + "end_time": end_time_unix, + "max_traces": effective_max_traces, + } + # Alternative shape: pass a single "name:version" string via `agent_id`: + # trace_source["agent_id"] = f"{created_agent.name}:{created_agent.version}" + if smart_filter: + trace_source["filter_strategy"] = "smart_filtering" + + data_source = { + "type": "azure_ai_trace_data_source_preview", + "trace_source": trace_source, + } + + print( + f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent " + f"`{created_agent.name}` v{created_agent.version} " + f"(window: {seed_start_unix}..{end_time_unix}, max_traces={effective_max_traces}" + f"{', smart_filtering' if smart_filter else ''})." 
+ ) + eval_run = client.evals.runs.create( + eval_id=eval_object.id, + name=f"agent-smart-filter-{run_id}-a{attempt}", + data_source=data_source, # type: ignore + ) + print(f"Eval run created (id: {eval_run.id}).") + + print("Poll eval run until terminal.", end="", flush=True) + while True: + run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) + if run.status in TERMINAL_STATUSES: + break + time.sleep(POLL_INTERVAL_SECONDS) + print(".", end="", flush=True) + print() + print(f"Final run status: `{run.status}`.") + + if run.status == "completed": + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + if output_items: + print(f"Run produced {len(output_items)} output item(s).") + print(f"Result counts: {run.result_counts}") + print(f"{'-' * 60}") + pprint(output_items) + print(f"{'-' * 60}") + print(f"Eval run report URL: {run.report_url}") + break + print( + f"Run completed but produced 0 output items " + f"(result counts: {run.result_counts}); traces likely not yet ingested." + ) + else: + print(f"Run did not complete (status: `{run.status}`, error: {run.error}).") + + if attempt == MAX_EVAL_ATTEMPTS: + raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.") + print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True) + time.sleep(RETRY_WAIT_SECONDS) + + finally: + # Best-effort cleanup: eval object -> seeded conversations -> agent. 
+ if eval_object is not None: + try: + client.evals.delete(eval_id=eval_object.id) + print(f"Deleted evaluation `{eval_object.id}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete evaluation: {exc}") + + for cid in created_conversation_ids: + try: + client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") + + if created_agent is not None: + try: + project_client.agents.delete_version( + agent_name=created_agent.name, + agent_version=created_agent.version, + ) + print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete agent: {exc}") + + +if __name__ == "__main__": + main() diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py index a5e88ab53301..a2a7d9ec3cc2 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py @@ -1,4 +1,4 @@ -# pylint: disable=line-too-long,useless-suppression,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,unused-argument +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. @@ -6,75 +6,98 @@ """ DESCRIPTION: - Given an AIProjectClient, this sample demonstrates how to run Azure AI Evaluations - against agent traces collected in Azure Application Insights. 
- - Supports three modes: - - Default mode (no flags): Queries Application Insights client-side for trace IDs - using the AGENT_ID environment variable, then passes them to the eval service. - - Agent ID mode (--agent-id): Passes the agent ID directly to the eval service, - which resolves traces server-side from Application Insights. - - Trace ID mode (--trace-ids): Passes explicit trace IDs to the eval service. + Self-contained sample that runs Azure AI built-in evaluators against agent + traces resolved server-side by `agent_id`. + + Steps: + 1. Creates a transient agent. + 2. Seeds a few single-turn prompts so the service emits traces into + Application Insights. + 3. Creates a trace-based evaluation group with single-turn evaluators. + 4. Submits an evaluation run that uses the `azure_ai_traces` data source + with `agent_id="<agent_name>:<agent_version>"`; the service resolves traces + server-side. Retries the run if Application Insights ingestion is + still in flight. + 5. Cleans up the evaluation, seeded conversations, and agent. + + Prerequisite: the project must have an Application Insights resource + connected so the agent emits server-side traces. No `APPINSIGHTS_RESOURCE_ID` + or `AGENT_ID` env vars are required - everything is self-contained. 
USAGE: python sample_evaluations_builtin_with_traces.py - python sample_evaluations_builtin_with_traces.py --agent-id "my-agent:1" - python sample_evaluations_builtin_with_traces.py --trace-ids abc123 def456 - python sample_evaluations_builtin_with_traces.py --agent-id "my-agent:1" --lookback-hours 48 --max-traces 20 - python sample_evaluations_builtin_with_traces.py --no-cleanup + python sample_evaluations_builtin_with_traces.py --max-traces 10 + python sample_evaluations_builtin_with_traces.py --lookback-hours 2 Before running the sample: - pip install "azure-ai-projects>=2.0.0" python-dotenv azure-monitor-query + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv Set these environment variables with your own values: - 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your - Microsoft Foundry project. It has the form: https://.services.ai.azure.com/api/projects/. - 2) APPINSIGHTS_RESOURCE_ID - Required (for default mode). The Azure Application Insights resource ID that stores - agent traces. Not needed when using --agent-id or --trace-ids. - It has the form: /subscriptions//resourceGroups//providers/Microsoft.Insights/components/. - 3) AGENT_ID - Required. The agent identifier emitted by the Azure tracing integration, used to filter traces. - 4) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name to use with the built-in evaluators. - 5) TRACE_LOOKBACK_HOURS - Optional. Number of hours to look back when querying traces and in the evaluation run. - Defaults to 1. + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as + found in the overview page of your Microsoft Foundry project. + 2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to + drive the agent during trace seeding and to power the AI-assisted + evaluators. 
""" import argparse import os import time -from datetime import datetime, timedelta, timezone +import uuid +from datetime import datetime, timezone from pprint import pprint -from typing import Any, Dict, List, Optional +from typing import List + from dotenv import load_dotenv + from azure.identity import DefaultAzureCredential -from azure.monitor.query import LogsQueryClient, LogsQueryStatus from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import ( - TestingCriterionAzureAIEvaluator, -) +from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator load_dotenv() +AGENT_INSTRUCTIONS = ( + "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the " + "customer's issue when possible. Policies you can quote:\n" + " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n" + " - Exchanges: same window as refunds; exchanges do not include store credit.\n" + " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n" + " - You cannot place orders or process refunds directly; direct the customer to the website " + " or store. Always close with a confirmation that the customer's question is answered." +) +# Single-turn prompts: each prompt is seeded as its own one-turn conversation so +# the service emits one trace span per item. 
+SINGLE_TURN_PROMPTS: List[str] = [ + "I bought a widget last week and it stopped working - what are my options?", + "What is the return window for unopened widgets?", + "Can I get store credit if I exchange a defective gizmo?", + "How much does shipping cost for a small replacement part?", + "How long does a refund take to show up on my card?", +] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] -appinsights_resource_id = os.environ[ - "APPINSIGHTS_RESOURCE_ID" -] # Sample : /subscriptions//resourceGroups//providers/Microsoft.Insights/components/ -agent_id = os.environ["AGENT_ID"] model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"] -default_lookback_hours = int(os.environ.get("TRACE_LOOKBACK_HOURS", "1")) +POLL_INTERVAL_SECONDS = 5 +INITIAL_INGEST_WAIT_SECONDS = 60 +MAX_EVAL_ATTEMPTS = 5 +RETRY_WAIT_SECONDS = 60 + +TERMINAL_STATUSES = {"completed", "failed", "canceled"} -def _build_evaluator_config(name: str, evaluator_name: str) -> TestingCriterionAzureAIEvaluator: - """Create a standard Azure AI evaluator configuration block for trace evaluations.""" + +def _build_evaluator(name: str, evaluator_name: str) -> TestingCriterionAzureAIEvaluator: + """Standard single-turn evaluator config for the `azure_ai_traces` data source.""" return TestingCriterionAzureAIEvaluator( type="azure_ai_evaluator", name=name, evaluator_name=evaluator_name, data_mapping={ - "query": "{{sample.query}}", - "response": "{{sample.response}}", - "tool_definitions": "{{sample.tool_definitions}}", + "query": "{{item.query}}", + "response": "{{item.response}}", + "tool_definitions": "{{item.tool_definitions}}", }, initialization_parameters={ "deployment_name": model_deployment_name, @@ -82,181 +105,163 @@ def _build_evaluator_config(name: str, evaluator_name: str) -> TestingCriterionA ) -def get_trace_ids( - appinsight_resource_id: str, tracked_agent_id: str, start_time: datetime, end_time: datetime -) -> List[str]: - """ - Query Application Insights for trace IDs (operation_Id) based 
on agent ID and time range. - - Args: - appinsight_resource_id: The resource ID of the Application Insights instance. - tracked_agent_id: The agent ID to filter by. - start_time: Start time for the query. - end_time: End time for the query. - - Returns: - List of distinct operation IDs (trace IDs). - """ - query = """ -dependencies -| where timestamp between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()})) -| extend agent_id = tostring(customDimensions["gen_ai.agent.id"]) -| where agent_id == "{tracked_agent_id}" -| distinct operation_Id -""" - - try: - with DefaultAzureCredential() as credential: - client = LogsQueryClient(credential) - response = client.query_resource( - appinsight_resource_id, - query=query, - timespan=None, # Time range is specified in the query itself. - ) - except Exception as exc: # pylint: disable=broad-except - print(f"Error executing query: {exc}") - return [] - - if response.status == LogsQueryStatus.SUCCESS: - trace_ids: List[str] = [] - for table in response.tables: - for row in table.rows: - trace_ids.append(row[0]) - return trace_ids - - print(f"Query failed with status: {response.status}") - if response.partial_error: - print(f"Partial error: {response.partial_error}") - return [] - - def main() -> None: # pylint: disable=too-many-statements - parser = argparse.ArgumentParser(description="Run Azure AI trace evaluations against agent traces.") - mode = parser.add_mutually_exclusive_group() - mode.add_argument("--agent-id", default=None, help="Agent ID for server-side trace resolution") - mode.add_argument("--trace-ids", nargs="+", default=None, help="Explicit trace IDs to evaluate") - parser.add_argument("--lookback-hours", type=int, default=None, help="Lookback window in hours") - parser.add_argument("--max-traces", type=int, default=50, help="Max traces in agent-id mode (default: 50)") - parser.add_argument("--no-cleanup", action="store_true", help="Keep eval group after run") + parser = 
argparse.ArgumentParser( + description="Run built-in trace evaluators against an agent's traces (self-contained)." + ) + parser.add_argument( + "--max-traces", + type=int, + default=len(SINGLE_TURN_PROMPTS), + help=f"Max traces to evaluate (default: {len(SINGLE_TURN_PROMPTS)} = one per seeded prompt).", + ) + parser.add_argument( + "--lookback-hours", + type=int, + default=1, + help="Hours to look back when resolving traces server-side (default: 1).", + ) args = parser.parse_args() - lookback_hours = args.lookback_hours or default_lookback_hours - trace_ids: Optional[List[str]] = None - agent_id_for_server: Optional[str] = None - metadata: Dict[str, str] = {} - - if args.agent_id: - agent_id_for_server = args.agent_id - print("Mode: Server-side agent ID resolution") - print(f"Agent ID: {args.agent_id}") - print(f"Lookback: {lookback_hours}h, Max traces: {args.max_traces}") - metadata["agent_id"] = args.agent_id - - elif args.trace_ids: - trace_ids = list(args.trace_ids) - print(f"Mode: Explicit trace IDs ({len(trace_ids)} provided)") - - else: - end_time = datetime.now(tz=timezone.utc) - start_time = end_time - timedelta(hours=lookback_hours) - - print("Querying Application Insights for trace identifiers...") - print(f"Agent ID: {agent_id}") - print(f"Time range: {start_time.isoformat()} to {end_time.isoformat()}") - - trace_ids = get_trace_ids(appinsights_resource_id, agent_id, start_time, end_time) - - if not trace_ids: - print("No trace IDs found for the provided agent and time window.") - return - - print(f"\nFound {len(trace_ids)} trace IDs:") - for tid in trace_ids: - print(f" - {tid}") - - metadata["agent_id"] = agent_id - metadata["start_time"] = start_time.isoformat() - metadata["end_time"] = end_time.isoformat() - - with DefaultAzureCredential() as credential: - with AIProjectClient(endpoint=endpoint, credential=credential) as project_client: - client = project_client.get_openai_client() - + run_id = 
f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" + agent_name = f"builtin-traces-{run_id}" + + with ( + DefaultAzureCredential() as credential, + AIProjectClient(endpoint=endpoint, credential=credential) as project_client, + project_client.get_openai_client() as client, + ): + + created_agent = None + created_conversation_ids: List[str] = [] + eval_object = None + + try: + # 1. Create an agent that traces will be filtered to. + print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS), + ) + print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") + + # 2. Seed single-turn prompts so the service emits traces. + print(f"Seed {len(SINGLE_TURN_PROMPTS)} single-turn prompt(s) against the agent.") + for prompt in SINGLE_TURN_PROMPTS: + conversation = client.conversations.create() + created_conversation_ids.append(conversation.id) + print(f" - conversation id: {conversation.id} (prompt: {prompt!r})") + client.responses.create( + conversation=conversation.id, + input=prompt, + extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}}, + ) + + print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True) + time.sleep(INITIAL_INGEST_WAIT_SECONDS) + + # 3. Create the trace-based evaluation group (single-turn evaluators). 
data_source_config = { "type": "azure_ai_source", "scenario": "traces", } testing_criteria = [ - _build_evaluator_config( - name="intent_resolution", - evaluator_name="builtin.intent_resolution", - ), - _build_evaluator_config( - name="task_adherence", - evaluator_name="builtin.task_adherence", - ), + _build_evaluator(name="intent_resolution", evaluator_name="builtin.intent_resolution"), + _build_evaluator(name="task_adherence", evaluator_name="builtin.task_adherence"), ] - print("\nCreating evaluation") + print("Create trace-based evaluation group.") eval_object = client.evals.create( - name="agent_trace_eval_group", + name=f"Builtin Trace Evaluation {run_id}", data_source_config=data_source_config, # type: ignore testing_criteria=testing_criteria, # type: ignore ) - print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})") - - print("\nGet Evaluation by Id") - eval_object_response = client.evals.retrieve(eval_object.id) - print("Evaluation Response:") - pprint(eval_object_response) - - # Build data source based on mode - if agent_id_for_server: - data_source: Dict[str, Any] = { + print(f"Evaluation created (id: {eval_object.id}).") + + # 4. Submit eval runs using the `azure_ai_traces` data source with + # agent_id set to "<agent_name>:<agent_version>"; the service resolves matching + # traces server-side from Application Insights. 
+ agent_id_for_server = f"{created_agent.name}:{created_agent.version}" + run = None + for attempt in range(1, MAX_EVAL_ATTEMPTS + 1): + data_source = { "type": "azure_ai_traces", "agent_id": agent_id_for_server, - "lookback_hours": lookback_hours, + "lookback_hours": args.lookback_hours, "max_traces": args.max_traces, } - else: - assert trace_ids is not None - data_source = { - "type": "azure_ai_traces", - "trace_ids": trace_ids, - "lookback_hours": lookback_hours, - } - print("\nCreating Eval Run") - run_name = f"agent_trace_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}" - eval_run_object = client.evals.runs.create( - eval_id=eval_object.id, - name=run_name, - metadata=metadata if metadata else None, - data_source=data_source, # type: ignore - ) - print("Eval Run created") - pprint(eval_run_object) - - print("\nMonitoring Eval Run status...") - while True: - run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id) - print(f"Status: {run.status}") - - if run.status in {"completed", "failed", "canceled"}: - print("\nEval Run finished!") - print("Final Eval Run Response:") - pprint(run) - break - - time.sleep(5) - print("Waiting for eval run to complete...") - - if not args.no_cleanup: - client.evals.delete(eval_id=eval_object.id) - print("Evaluation deleted") - else: - print(f"Skipping cleanup (--no-cleanup). Eval ID: {eval_object.id}") + print( + f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent_id " + f"`{agent_id_for_server}` (lookback_hours={args.lookback_hours}, " + f"max_traces={args.max_traces})." 
+ ) + eval_run = client.evals.runs.create( + eval_id=eval_object.id, + name=f"builtin-traces-{run_id}-a{attempt}", + metadata={"agent_id": agent_id_for_server}, + data_source=data_source, # type: ignore + ) + print(f"Eval run created (id: {eval_run.id}).") + + print("Poll eval run until terminal.", end="", flush=True) + while True: + run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) + if run.status in TERMINAL_STATUSES: + break + time.sleep(POLL_INTERVAL_SECONDS) + print(".", end="", flush=True) + print() + print(f"Final run status: `{run.status}`.") + + if run.status == "completed": + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + if output_items: + print(f"Run produced {len(output_items)} output item(s).") + print(f"Result counts: {run.result_counts}") + print(f"{'-' * 60}") + pprint(output_items) + print(f"{'-' * 60}") + print(f"Eval run report URL: {run.report_url}") + break + print( + f"Run completed but produced 0 output items " + f"(result counts: {run.result_counts}); traces likely not yet ingested." + ) + else: + print(f"Run did not complete (status: `{run.status}`, error: {run.error}).") + + if attempt == MAX_EVAL_ATTEMPTS: + raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.") + print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True) + time.sleep(RETRY_WAIT_SECONDS) + + finally: + # Best-effort cleanup: eval object -> seeded conversations -> agent. 
+ if eval_object is not None: + try: + client.evals.delete(eval_id=eval_object.id) + print(f"Deleted evaluation `{eval_object.id}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete evaluation: {exc}") + + for cid in created_conversation_ids: + try: + client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") + + if created_agent is not None: + try: + project_client.agents.delete_version( + agent_name=created_agent.name, + agent_version=created_agent.version, + ) + print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete agent: {exc}") if __name__ == "__main__": diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py index 7a65c9f6b4ba..bd78bff71e23 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py @@ -6,177 +6,299 @@ """ DESCRIPTION: - Given an AIProjectClient, this sample demonstrates how to evaluate multi-turn - agent conversations by filtering traces from Application Insights using an - agent name/version or agent ID, with optional smart filtering. + Self-contained sample that evaluates multi-turn agent conversations by + filtering Application Insights traces for a specific agent over a time + window. - This is Scenario 3 of multi-turn evaluations: instead of providing specific - conversation or trace IDs, you specify an agent identity and a time window. 
- The service samples traces from App Insights matching that agent and evaluates - the reconstructed conversations. + Steps: + 1. Creates a transient agent. + 2. Seeds a few multi-turn conversations against the agent so the service + emits traces into Application Insights. + 3. Creates a trace-based evaluation group with conversation-level + evaluators. + 4. Submits an evaluation run with `agent_filter` (agent_name + + agent_version, time window narrowed to the seeding interval). + Retries the run if Application Insights ingestion is still in flight. + 5. Cleans up the evaluation, seeded conversations, and agent. - Three agent filter forms are supported: - - agent_name + agent_version: Specify the agent by name and version separately. - - agent_id: Specify the agent as a single "name:version" string. - - smart_filtering: Use filter_strategy="smart_filtering" to bias trace - selection toward more interesting conversations. + Prerequisite: the project must have an Application Insights resource + connected so the agent emits server-side traces. + + The `agent_filter` shape also supports: + - `agent_id`: a single "name:version" string (see comment in code). + - `filter_strategy="smart_filtering"`: biases trace selection toward more + interesting conversations (enabled via --smart-filter). USAGE: python sample_multiturn_trace_evaluation_agent_filter.py - python sample_multiturn_trace_evaluation_agent_filter.py --agent-id "my-agent:1" python sample_multiturn_trace_evaluation_agent_filter.py --smart-filter + python sample_multiturn_trace_evaluation_agent_filter.py --max-traces 5 Before running the sample: - pip install "azure-ai-projects>=2.0.0" python-dotenv + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv Set these environment variables with your own values: - 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint. - 2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators. 
- 3) FOUNDRY_AGENT_NAME - Required. The name of the agent whose traces to evaluate. - 4) FOUNDRY_AGENT_VERSION - Optional. The agent version. If not set, latest is used. + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as + found in the overview page of your Microsoft Foundry project. + 2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to + drive the agent during trace seeding and to power the AI-assisted + evaluators. """ import argparse import os import time +import uuid +from datetime import datetime, timezone from pprint import pprint +from typing import List + from dotenv import load_dotenv + from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import TestingCriterionAzureAIEvaluator +from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator load_dotenv() + +AGENT_INSTRUCTIONS = ( + "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the " + "customer's issue when possible. Policies you can quote:\n" + " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n" + " - Exchanges: same window as refunds; exchanges do not include store credit.\n" + " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n" + " - You cannot place orders or process refunds directly; direct the customer to the website " + " or store. Always close with a confirmation that the customer's question is answered." +) +CONVERSATION_FLOWS: List[List[str]] = [ + [ + "I bought a widget last week and it stopped working.", + "It is past the 30 day mark, can I still return it?", + "How long will the refund take to process?", + "Thanks, that answers my question.", + ], + [ + "Do you sell replacement parts for gizmos?", + "How much does shipping cost for a small part?", + "Got it, I will order it from the website. 
Thank you.", + ], + [ + "What is the difference between an exchange and a refund?", + "If I exchange a defective gizmo, do I also get store credit?", + "Understood, thanks for clarifying.", + ], +] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"] -agent_name = os.environ["FOUNDRY_AGENT_NAME"] -agent_version = os.environ.get("FOUNDRY_AGENT_VERSION", "") + +POLL_INTERVAL_SECONDS = 5 +INITIAL_INGEST_WAIT_SECONDS = 60 +MAX_EVAL_ATTEMPTS = 5 +RETRY_WAIT_SECONDS = 60 +# Service constraints for agent_filter trace_source: +# - end_time - start_time must be >= 15 minutes. +# - conversation-level queries exclude conversations whose first/last span is +# within 5 minutes of either window edge, so we need >5 min of padding on +# each side of the actual seeding window. +MIN_AGENT_FILTER_WINDOW_SECONDS = 16 * 60 +AGENT_FILTER_EDGE_BUFFER_SECONDS = 6 * 60 + +TERMINAL_STATUSES = {"completed", "failed", "canceled"} -def main() -> None: - parser = argparse.ArgumentParser(description="Evaluate agent traces using agent filter.") - parser.add_argument("--agent-id", default=None, help='Agent ID in "name:version" format') +def main() -> None: # pylint: disable=too-many-statements + parser = argparse.ArgumentParser(description="Evaluate agent traces using agent_filter (self-contained).") parser.add_argument("--smart-filter", action="store_true", help="Use smart_filtering strategy") - parser.add_argument("--max-traces", type=int, default=5, help="Max traces to evaluate (default: 5)") - parser.add_argument("--lookback-hours", type=int, default=24, help="Hours to look back (default: 24)") + parser.add_argument( + "--max-traces", + type=int, + default=len(CONVERSATION_FLOWS), + help=f"Max traces to evaluate (default: {len(CONVERSATION_FLOWS)} = one per seeded conversation)", + ) args = parser.parse_args() + run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" + agent_name = 
f"mt-trace-agent-filter-{run_id}" + with ( DefaultAzureCredential() as credential, AIProjectClient(endpoint=endpoint, credential=credential) as project_client, project_client.get_openai_client() as client, ): - # Eval group for trace-based evaluations - data_source_config = { - "type": "azure_ai_source", - "scenario": "traces", - } - - testing_criteria = [ - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="customer_satisfaction", - evaluator_name="builtin.customer_satisfaction", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="task_completion", - evaluator_name="builtin.task_completion", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="conversation_coherence", - evaluator_name="builtin.coherence", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="groundedness", - evaluator_name="builtin.groundedness", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - ] - - print("Creating trace-based evaluation group") - eval_object = client.evals.create( - name="Multi-turn Trace Evaluation (Agent Filter)", - data_source_config=data_source_config, # type: ignore - testing_criteria=testing_criteria, - ) - print(f"Evaluation created (id: {eval_object.id})") - - # Compute time window in unix seconds - # Pad end_time by +600s (10 min) to avoid ingestion-delay edge exclusion - now_unix = int(time.time()) - end_time = now_unix + 600 - start_time = now_unix - (args.lookback_hours * 3600) - - # Build trace_source based on mode - trace_source: dict = { - "type": "agent_filter", - 
"start_time": start_time, - "end_time": end_time, - "max_traces": args.max_traces, - } - - if args.agent_id: - # agent_id form: single "name:version" string - trace_source["agent_id"] = args.agent_id - print(f"Using agent_id filter: {args.agent_id}") - else: - # agent_name + agent_version form - trace_source["agent_name"] = agent_name - if agent_version: - trace_source["agent_version"] = agent_version - print(f"Using agent filter: {agent_name} v{agent_version or '(latest)'}") - - if args.smart_filter: - trace_source["filter_strategy"] = "smart_filtering" - print("Filter strategy: smart_filtering") - - data_source = { - "type": "azure_ai_trace_data_source_preview", - "trace_source": trace_source, - } - - eval_run = client.evals.runs.create( - eval_id=eval_object.id, - name="multiturn-agent-filter-run", - data_source=data_source, # type: ignore - extra_body={"evaluation_level": "conversation"}, - ) - print(f"Evaluation run created (id: {eval_run.id})") - - while True: - run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) - if run.status in ("completed", "failed"): - break - print(f"Waiting for eval run to complete... current status: {run.status}") - time.sleep(5) - - if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") - print(f"Result Counts: {run.result_counts}") - - output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) - print(f"\nOUTPUT ITEMS (Total: {len(output_items)})") - print(f"{'-'*60}") - pprint(output_items) - print(f"{'-'*60}") - - print(f"\nEval Run Report URL: {run.report_url}") - else: - print(f"\n✗ Evaluation run failed: {run.error}") - - client.evals.delete(eval_id=eval_object.id) - print("Evaluation deleted") + + created_agent = None + created_conversation_ids: List[str] = [] + eval_object = None + + try: + # 1. Create an agent that traces will be filtered to. 
+ print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS), + ) + print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") + + # 2. Seed multi-turn conversations and capture the seeding window. + # Pre-seed buffer must exceed the service's 5-min edge exclusion for + # conversation-level queries. + seed_start_unix = int(time.time()) - AGENT_FILTER_EDGE_BUFFER_SECONDS + print(f"Seed {len(CONVERSATION_FLOWS)} multi-turn conversation(s) against the agent.") + for flow in CONVERSATION_FLOWS: + conversation = client.conversations.create() + created_conversation_ids.append(conversation.id) + print(f" - conversation id: {conversation.id} ({len(flow)} turn(s))") + for turn in flow: + client.responses.create( + conversation=conversation.id, + input=turn, + extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}}, + ) + + print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True) + time.sleep(INITIAL_INGEST_WAIT_SECONDS) + + # 3. Create the trace-based evaluation group (conversation-level evaluators). 
+ data_source_config = { + "type": "azure_ai_source", + "scenario": "traces", + } + + testing_criteria = [ + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="customer_satisfaction", + evaluator_name="builtin.customer_satisfaction", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="task_completion", + evaluator_name="builtin.task_completion", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="conversation_coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + ] + + print("Create trace-based evaluation group.") + eval_object = client.evals.create( + name=f"Multi-turn Trace Evaluation (Agent Filter) {run_id}", + data_source_config=data_source_config, # type: ignore + testing_criteria=testing_criteria, + ) + print(f"Evaluation created (id: {eval_object.id}).") + + # 4. Submit eval runs with agent_filter narrowed to the seeding window. + # Pad end_time so the last seeded span is >5 min from the upper edge + # (conversation-level edge exclusion) and enforce the service-side + # 15-min minimum window. 
+ run = None + for attempt in range(1, MAX_EVAL_ATTEMPTS + 1): + end_time_unix = max( + int(time.time()) + AGENT_FILTER_EDGE_BUFFER_SECONDS, + seed_start_unix + MIN_AGENT_FILTER_WINDOW_SECONDS, + ) + + trace_source = { + "type": "agent_filter", + "agent_name": created_agent.name, + "agent_version": str(created_agent.version), + "start_time": seed_start_unix, + "end_time": end_time_unix, + "max_traces": args.max_traces, + } + # Alternative shape: pass a single "name:version" string via `agent_id`: + # trace_source["agent_id"] = f"{created_agent.name}:{created_agent.version}" + if args.smart_filter: + trace_source["filter_strategy"] = "smart_filtering" + + data_source = { + "type": "azure_ai_trace_data_source_preview", + "trace_source": trace_source, + } + + print( + f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent " + f"`{created_agent.name}` v{created_agent.version} " + f"(window: {seed_start_unix}..{end_time_unix}, max_traces={args.max_traces}" + f"{', smart_filtering' if args.smart_filter else ''})." 
+ ) + eval_run = client.evals.runs.create( + eval_id=eval_object.id, + name=f"multiturn-agent-filter-{run_id}-a{attempt}", + data_source=data_source, # type: ignore + extra_body={"evaluation_level": "conversation"}, + ) + print(f"Eval run created (id: {eval_run.id}).") + + print("Poll eval run until terminal.", end="", flush=True) + while True: + run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) + if run.status in TERMINAL_STATUSES: + break + time.sleep(POLL_INTERVAL_SECONDS) + print(".", end="", flush=True) + print() + print(f"Final run status: `{run.status}`.") + + if run.status == "completed": + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + if output_items: + print(f"Run produced {len(output_items)} output item(s).") + print(f"Result counts: {run.result_counts}") + print(f"{'-' * 60}") + pprint(output_items) + print(f"{'-' * 60}") + print(f"Eval run report URL: {run.report_url}") + break + print( + f"Run completed but produced 0 output items " + f"(result counts: {run.result_counts}); traces likely not yet ingested." + ) + else: + print(f"Run did not complete (status: `{run.status}`, error: {run.error}).") + + if attempt == MAX_EVAL_ATTEMPTS: + raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.") + print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True) + time.sleep(RETRY_WAIT_SECONDS) + + finally: + # Best-effort cleanup: eval object -> seeded conversations -> agent. 
+ if eval_object is not None: + try: + client.evals.delete(eval_id=eval_object.id) + print(f"Deleted evaluation `{eval_object.id}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete evaluation: {exc}") + + for cid in created_conversation_ids: + try: + client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") + + if created_agent is not None: + try: + project_client.agents.delete_version( + agent_name=created_agent.name, + agent_version=created_agent.version, + ) + print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete agent: {exc}") if __name__ == "__main__": diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py index f8117be7ae3b..355d2f89b2d8 100644 --- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py +++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py @@ -6,161 +6,263 @@ """ DESCRIPTION: - Given an AIProjectClient, this sample demonstrates how to evaluate multi-turn - conversations captured as agent traces in Application Insights, using specific - conversation IDs or trace IDs to select which conversations to evaluate. + Self-contained sample that evaluates multi-turn agent conversations captured + as agent traces in Application Insights, selecting them by Foundry + conversation IDs. 
- This is Scenario 2 of multi-turn evaluations: you provide known conversation - or trace identifiers, and the service reconstructs the messages from App Insights - traces, then runs conversation-level evaluators against them. + Steps: + 1. Creates a transient agent. + 2. Seeds a few multi-turn conversations against the agent so that the + service emits traces into Application Insights. + 3. Creates a trace-based evaluation group with conversation-level + evaluators. + 4. Submits an evaluation run that targets the seeded conversations by + `conversation_id_source`. Retries the run if the traces have not + finished ingesting into App Insights yet. + 5. Cleans up the evaluation, seeded conversations, and agent. - Two modes are supported: - - conversation_id_source: Provide Foundry conversation IDs. - - trace_id_source: Provide W3C trace IDs (operation_Id from App Insights). + Prerequisite: the project must have an Application Insights resource + connected so the agent emits server-side traces. + + Two `trace_source` shapes are supported by the service: + - `conversation_id_source` - the Foundry conversation IDs returned by + `openai_client.conversations.create()` (used here). + - `trace_id_source` - W3C trace IDs (`operation_Id` from App Insights); + see the commented snippet below for the alternative shape. USAGE: python sample_multiturn_trace_evaluation_by_id.py Before running the sample: - pip install "azure-ai-projects>=2.0.0" python-dotenv + pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv Set these environment variables with your own values: - 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint. - 2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators. - 3) FOUNDRY_CONVERSATION_IDS - Required (for conversation_id mode). Comma-separated - Foundry conversation IDs to evaluate. - Example: "conv_abc123,conv_def456,conv_ghi789" - 4) FOUNDRY_TRACE_IDS - Optional (for trace_id mode). 
Comma-separated W3C trace IDs. - If set, overrides conversation IDs. + 1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as + found in the overview page of your Microsoft Foundry project. + 2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to + drive the agent during trace seeding and to power the AI-assisted + evaluators. """ import os import time +import uuid +from datetime import datetime, timezone from pprint import pprint +from typing import List + from dotenv import load_dotenv + from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient -from azure.ai.projects.models import TestingCriterionAzureAIEvaluator +from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator load_dotenv() + +AGENT_INSTRUCTIONS = ( + "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the " + "customer's issue when possible. Policies you can quote:\n" + " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n" + " - Exchanges: same window as refunds; exchanges do not include store credit.\n" + " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n" + " - You cannot place orders or process refunds directly; direct the customer to the website " + " or store. Always close with a confirmation that the customer's question is answered." +) +# Each entry is one conversation. Multi-turn conversations exercise the +# conversation-level evaluators (task_completion, customer_satisfaction, ...). +# The final user turn closes the conversation so task_completion can recognize +# the agent reached a resolution. 
+CONVERSATION_FLOWS: List[List[str]] = [ + [ + "I bought a widget last week and it stopped working.", + "It is past the 30 day mark, can I still return it?", + "How long will the refund take to process?", + "Thanks, that answers my question.", + ], + [ + "Do you sell replacement parts for gizmos?", + "How much does shipping cost for a small part?", + "Got it, I will order it from the website. Thank you.", + ], + [ + "What is the difference between an exchange and a refund?", + "If I exchange a defective gizmo, do I also get store credit?", + "Understood, thanks for clarifying.", + ], +] + endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"] model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"] -# Choose one: conversation IDs or trace IDs -conversation_ids_str = os.environ.get("FOUNDRY_CONVERSATION_IDS", "") -trace_ids_str = os.environ.get("FOUNDRY_TRACE_IDS", "") +POLL_INTERVAL_SECONDS = 5 +INITIAL_INGEST_WAIT_SECONDS = 60 +MAX_EVAL_ATTEMPTS = 5 +RETRY_WAIT_SECONDS = 60 + +# Per-run id keeps the agent name unique across repeated runs. 
+run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}" +agent_name = f"mt-trace-by-id-{run_id}" + +TERMINAL_STATUSES = {"completed", "failed", "canceled"} + with ( DefaultAzureCredential() as credential, AIProjectClient(endpoint=endpoint, credential=credential) as project_client, project_client.get_openai_client() as client, ): - # Eval group for trace-based evaluations uses azure_ai_source with scenario "traces" - data_source_config = { - "type": "azure_ai_source", - "scenario": "traces", - } - - # Conversation-level evaluators for trace data - testing_criteria = [ - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="customer_satisfaction", - evaluator_name="builtin.customer_satisfaction", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="task_completion", - evaluator_name="builtin.task_completion", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="conversation_coherence", - evaluator_name="builtin.coherence", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - TestingCriterionAzureAIEvaluator( - type="azure_ai_evaluator", - name="groundedness", - evaluator_name="builtin.groundedness", - initialization_parameters={"model": model_deployment_name}, - data_mapping={"messages": "{{item.messages}}"}, - ), - ] - - print("Creating trace-based evaluation group") - eval_object = client.evals.create( - name="Multi-turn Trace Evaluation (by ID)", - data_source_config=data_source_config, # type: ignore - testing_criteria=testing_criteria, - ) - print(f"Evaluation created (id: {eval_object.id})") - - # Build the data source based on which IDs are provided - if trace_ids_str: - # 
Trace ID mode — provide W3C trace IDs (operation_Id from App Insights) - trace_ids = [tid.strip() for tid in trace_ids_str.split(",") if tid.strip()] - print(f"Using {len(trace_ids)} trace IDs") - data_source = { - "type": "azure_ai_trace_data_source_preview", - "trace_source": { - "type": "trace_id_source", - "trace_ids": trace_ids, - }, + + created_agent = None + created_conversation_ids: List[str] = [] + eval_object = None + + try: + # 1. Create an agent to attribute the seeded conversations to. + print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).") + created_agent = project_client.agents.create_version( + agent_name=agent_name, + definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS), + ) + print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).") + + # 2. Seed multi-turn conversations against the agent. + print(f"Seed {len(CONVERSATION_FLOWS)} multi-turn conversation(s) against the agent.") + for flow in CONVERSATION_FLOWS: + conversation = client.conversations.create() + created_conversation_ids.append(conversation.id) + print(f" - conversation id: {conversation.id} ({len(flow)} turn(s))") + for turn in flow: + client.responses.create( + conversation=conversation.id, + input=turn, + extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}}, + ) + + print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True) + time.sleep(INITIAL_INGEST_WAIT_SECONDS) + + # 3. Create the trace-based evaluation group (conversation-level evaluators). + data_source_config = { + "type": "azure_ai_source", + "scenario": "traces", } - else: - # Conversation ID mode — provide Foundry conversation IDs - conversation_ids = [cid.strip() for cid in conversation_ids_str.split(",") if cid.strip()] - if not conversation_ids: - raise ValueError( - "Set FOUNDRY_CONVERSATION_IDS or FOUNDRY_TRACE_IDS. 
" - "These are IDs from prior agent interactions captured in App Insights." - ) - print(f"Using {len(conversation_ids)} conversation IDs") + + testing_criteria = [ + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="customer_satisfaction", + evaluator_name="builtin.customer_satisfaction", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="task_completion", + evaluator_name="builtin.task_completion", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + TestingCriterionAzureAIEvaluator( + type="azure_ai_evaluator", + name="conversation_coherence", + evaluator_name="builtin.coherence", + initialization_parameters={"model": model_deployment_name}, + data_mapping={"messages": "{{item.messages}}"}, + ), + ] + + print("Create trace-based evaluation group.") + eval_object = client.evals.create( + name=f"Multi-turn Trace Evaluation (by ID) {run_id}", + data_source_config=data_source_config, # type: ignore + testing_criteria=testing_criteria, + ) + print(f"Evaluation created (id: {eval_object.id}).") + + # 4. Submit an eval run that targets the seeded conversations by ID. + # Retry: ingestion delay can leave the conversations invisible to the + # eval service even after the initial wait. data_source = { "type": "azure_ai_trace_data_source_preview", "trace_source": { "type": "conversation_id_source", - "conversation_ids": conversation_ids, + "conversation_ids": created_conversation_ids, }, } + # Alternative shape (requires W3C trace IDs from App Insights): + # "trace_source": {"type": "trace_id_source", "trace_ids": ["", ...]} + + run = None + for attempt in range(1, MAX_EVAL_ATTEMPTS + 1): + print( + f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) over " + f"{len(created_conversation_ids)} conversation id(s)." 
+ ) + eval_run = client.evals.runs.create( + eval_id=eval_object.id, + name=f"multiturn-trace-by-id-{run_id}-a{attempt}", + data_source=data_source, # type: ignore + extra_body={"evaluation_level": "conversation"}, + ) + print(f"Eval run created (id: {eval_run.id}).") + + print("Poll eval run until terminal.", end="", flush=True) + while True: + run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) + if run.status in TERMINAL_STATUSES: + break + time.sleep(POLL_INTERVAL_SECONDS) + print(".", end="", flush=True) + print() + print(f"Final run status: `{run.status}`.") + + if run.status == "completed": + output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) + expected_items = len(created_conversation_ids) + if len(output_items) >= expected_items: + print(f"Run produced {len(output_items)} output item(s) (>= {expected_items} expected).") + print(f"Result counts: {run.result_counts}") + print(f"{'-' * 60}") + pprint(output_items) + print(f"{'-' * 60}") + print(f"Eval run report URL: {run.report_url}") + break + print( + f"Run completed but produced {len(output_items)}/{expected_items} output items " + f"(result counts: {run.result_counts}); traces likely not yet fully ingested." + ) + else: + print(f"Run did not complete (status: `{run.status}`, error: {run.error}).") + + if attempt == MAX_EVAL_ATTEMPTS: + raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.") + print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True) + time.sleep(RETRY_WAIT_SECONDS) + + finally: + # Best-effort cleanup: eval object -> seeded conversations -> agent. 
+ if eval_object is not None: + try: + client.evals.delete(eval_id=eval_object.id) + print(f"Deleted evaluation `{eval_object.id}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete evaluation: {exc}") + + for cid in created_conversation_ids: + try: + client.conversations.delete(conversation_id=cid) + print(f"Deleted seeded conversation `{cid}`.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete conversation `{cid}`: {exc}") - # Create run with evaluation_level = "conversation" - eval_run = client.evals.runs.create( - eval_id=eval_object.id, - name="multiturn-trace-by-id-run", - data_source=data_source, # type: ignore - extra_body={"evaluation_level": "conversation"}, - ) - print(f"Evaluation run created (id: {eval_run.id})") - - while True: - run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id) - if run.status in ("completed", "failed"): - break - print(f"Waiting for eval run to complete... 
current status: {run.status}") - time.sleep(5) - - if run.status == "completed": - print("\n✓ Evaluation run completed successfully!") - print(f"Result Counts: {run.result_counts}") - - output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id)) - print(f"\nOUTPUT ITEMS (Total: {len(output_items)})") - print(f"{'-'*60}") - pprint(output_items) - print(f"{'-'*60}") - - print(f"\nEval Run Report URL: {run.report_url}") - else: - print(f"\n✗ Evaluation run failed: {run.error}") - - client.evals.delete(eval_id=eval_object.id) - print("Evaluation deleted") + if created_agent is not None: + try: + project_client.agents.delete_version( + agent_name=created_agent.name, + agent_version=created_agent.version, + ) + print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.") + except Exception as exc: # pylint: disable=broad-exception-caught + print(f" (warning) could not delete agent: {exc}") diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py index b311fa011822..4a5814a170f1 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py @@ -123,8 +123,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase): get_bearer_token_provider() which is incompatible with mock credentials. External service dependencies (require additional Azure services): - - sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and - uses azure-monitor-query to fetch traces. + - sample_evaluations_builtin_with_traces.py: Seeds agent conversations and waits for + Application Insights to ingest the resulting traces before running the eval; the + real-time ingestion wait is not suitable for recorded playback. 
- sample_scheduled_evaluations.py: Requires Azure RBAC assignment via azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset. - sample_human_evaluations.py: Requires Azure Application Insights (fetches @@ -146,7 +147,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase): samples_to_skip=[ "sample_evaluations_ai_assisted.py", # Similarity evaluator returns FAILED_EXECUTION ('query' is missing) "sample_evaluations_builtin_with_inline_data_oai.py", # 401 AuthenticationError (invalid subscription key or API endpoint) - "sample_evaluations_builtin_with_traces.py", # Missing required env var APPINSIGHTS_RESOURCE_ID (KeyError) + "sample_evaluations_builtin_with_traces.py", # Self-contained sample seeds traces then waits for real App Insights ingestion; not suitable for playback "sample_evaluations_score_model_grader_with_image.py", # Eval fails: image inputs not supported for configured grader model "sample_evaluations_score_model_grader_with_image_model_target.py", # Eval fails: image inputs not supported for configured grader model "sample_evaluations_score_model_grader_with_audio.py", # Eval fails: audio inputs not supported for configured grader model