diff --git a/sdk/ai/azure-ai-projects/CHANGELOG.md b/sdk/ai/azure-ai-projects/CHANGELOG.md
index a805c9f54a7f..d5ec6d51cd19 100644
--- a/sdk/ai/azure-ai-projects/CHANGELOG.md
+++ b/sdk/ai/azure-ai-projects/CHANGELOG.md
@@ -8,6 +8,7 @@
 * Added `sample_routines_with_timer_trigger.py` to demonstrate triggering a routine with a timer.
 * Added `sample_routines_with_schedule_trigger.py` to demonstrate triggering a routine on a recurring cron schedule via `ScheduleRoutineTrigger`.
 * Updated `sample_dataset_generation_job_traces_for_evaluation.py` and `sample_dataset_generation_job_traces_for_finetuning.py` to create a temporary agent, seed conversations, retry the data generation job over the trace window, and clean up all created resources.
+* Updated `sample_multiturn_trace_evaluation_by_id.py`, `sample_multiturn_trace_evaluation_agent_filter.py`, `sample_agent_trace_evaluation_smart_filter.py`, and `sample_evaluations_builtin_with_traces.py` to be self-contained: each sample now creates a temporary agent, seeds conversations, waits for App Insights ingestion, retries the eval over the trace window, and cleans up all created resources (no `FOUNDRY_AGENT_NAME`, `APPINSIGHTS_RESOURCE_ID`, or `AGENT_ID` environment variables required).
 * Updated `sample_memory_crud.py` and `sample_memory_crud_async.py` to demonstrate memory item CRUD (`create_memory`, `get_memory`, `update_memory`, `list_memories`, `delete_memory`) in addition to memory store CRUD.
 
 ## 2.2.0 (2026-05-29)
diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py
index 983e27916ef2..bec0d32b4f20 100644
--- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py
+++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_agent_trace_evaluation_smart_filter.py
@@ -6,171 +6,313 @@
 
 """
 DESCRIPTION:
-    Given an AIProjectClient, this sample demonstrates how to evaluate an
-    agent from its traces by filtering traces from Application Insights using an
-    agent name/version or agent ID, with smart filtering.
+    Self-contained sample that evaluates single-turn agent traces selected via
+    `agent_filter` with `filter_strategy="smart_filtering"`.
 
-    Three agent filter forms are supported:
-      - agent_name + agent_version: Specify the agent by name and version separately.
-      - agent_id: Specify the agent as a single "name:version" string.
-      - smart_filtering: Use filter_strategy="smart_filtering" to bias trace
-        selection toward more interesting conversations.
+    Steps:
+      1. Creates a transient agent.
+      2. Seeds a handful of single-turn prompts so the service emits traces
+         into Application Insights.
+      3. Creates a trace-based evaluation group with single-turn evaluators.
+      4. Submits an evaluation run with `agent_filter`
+         (agent_name + agent_version, smart_filtering, time window narrowed to
+         the seeding interval). Retries the run if Application Insights
+         ingestion is still in flight.
+      5. Cleans up the evaluation, seeded conversations, and agent.
+
+    Prerequisite: the project must have an Application Insights resource
+    connected so the agent emits server-side traces.
+
+    The `agent_filter` shape also supports passing a single "name:version"
+    string via `agent_id` (see comment in code). The `--no-smart-filter` flag
+    disables the smart-filtering strategy so that every matching trace (up to
+    `--max-traces`) is evaluated.
 
 USAGE:
     python sample_agent_trace_evaluation_smart_filter.py
-    python sample_agent_trace_evaluation_smart_filter.py --agent-id "my-agent:1"
+    python sample_agent_trace_evaluation_smart_filter.py --no-smart-filter
+    python sample_agent_trace_evaluation_smart_filter.py --no-smart-filter --max-traces 3
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.2.0" python-dotenv
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
 
     Set these environment variables with your own values:
-    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint.
-    2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators.
-    3) FOUNDRY_AGENT_NAME - Required. The name of the agent whose traces to evaluate.
-    4) FOUNDRY_AGENT_VERSION - Optional. The agent version. If not set, latest is used.
+    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
+       found in the overview page of your Microsoft Foundry project.
+    2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to
+       drive the agent during trace seeding and to power the AI-assisted
+       evaluators.
 """
 
 import argparse
 import os
 import time
+import uuid
+from datetime import datetime, timezone
 from pprint import pprint
+from typing import List
+
 from dotenv import load_dotenv
+
 from azure.identity import DefaultAzureCredential
 from azure.ai.projects import AIProjectClient
-from azure.ai.projects.models import TestingCriterionAzureAIEvaluator
+from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator
 
 load_dotenv()
 
+
+AGENT_INSTRUCTIONS = (
+    "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the "
+    "customer's issue when possible. Policies you can quote:\n"
+    " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n"
+    " - Exchanges: same window as refunds; exchanges do not include store credit.\n"
+    " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n"
+    " - You cannot place orders or process refunds directly; direct the customer to the website "
+    "   or store. Always close with a confirmation that the customer's question is answered."
+)
+# Single-turn prompts: each prompt is seeded as its own one-turn conversation so
+# the service emits one trace per prompt.
+SINGLE_TURN_PROMPTS: List[str] = [
+    "What is the return window for unopened widgets?",
+    "Do you sell replacement parts for gizmos? How much is shipping for a small part?",
+    "What is the difference between an exchange and a refund?",
+    "Can I get a refund for a defective gizmo I bought 60 days ago?",
+    "How long does a refund take to show up on my card?",
+]
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
 model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]
-agent_name = os.environ["FOUNDRY_AGENT_NAME"]
-agent_version = os.environ.get("FOUNDRY_AGENT_VERSION", "")
-
-parser = argparse.ArgumentParser(description="Evaluate agent traces using agent filter.")
-parser.add_argument("--agent-id", default=None, help='Agent ID in "name:version" format')
-parser.add_argument("--max-traces", type=int, default=5, help="Max traces to evaluate (default: 5)")
-parser.add_argument("--lookback-hours", type=int, default=24, help="Hours to look back (default: 24)")
-args = parser.parse_args()
-
-with (
-    DefaultAzureCredential() as credential,
-    AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
-    project_client.get_openai_client() as client,
-):
-    # Eval group for trace-based evaluations
-    data_source_config = {
-        "type": "azure_ai_source",
-        "scenario": "traces",
-    }
-
-    testing_criteria = [
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="task_completion",
-            evaluator_name="builtin.task_completion",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={
-                "query": "{{item.query}}",
-                "response": "{{item.response}}",
-            },
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="conversation_coherence",
-            evaluator_name="builtin.coherence",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={
-                "query": "{{item.query}}",
-                "response": "{{item.response}}",
-            },
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="groundedness",
-            evaluator_name="builtin.groundedness",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={
-                "query": "{{item.query}}",
-                "response": "{{item.response}}",
-            },
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="violence",
-            evaluator_name="builtin.violence",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={
-                "query": "{{item.query}}",
-                "response": "{{item.response}}",
-            },
-        ),
-    ]
-
-    print("Creating trace-based evaluation group")
-    eval_object = client.evals.create(
-        name="Trace Evaluation (Agent Smart Filter)",
-        data_source_config=data_source_config,  # type: ignore
-        testing_criteria=testing_criteria,
+
+POLL_INTERVAL_SECONDS = 5
+INITIAL_INGEST_WAIT_SECONDS = 60
+MAX_EVAL_ATTEMPTS = 5
+RETRY_WAIT_SECONDS = 60
+# Service constraints for agent_filter trace_source:
+#   - end_time - start_time must be >= 15 minutes.
+#   - queries exclude traces whose first/last span is within 5 minutes of
+#     either window edge, so we need >5 min of padding on each side of the
+#     actual seeding window.
+#   - When filter_strategy="smart_filtering" is set, max_traces must be
+#     between 15 and 1000. Sample seeds fewer than 15 traces; the service
+#     simply returns what exists.
+MIN_AGENT_FILTER_WINDOW_SECONDS = 16 * 60
+AGENT_FILTER_EDGE_BUFFER_SECONDS = 6 * 60
+SMART_FILTERING_MIN_MAX_TRACES = 15
+
+TERMINAL_STATUSES = {"completed", "failed", "canceled"}
+
+
+def main() -> None:  # pylint: disable=too-many-statements
+    parser = argparse.ArgumentParser(
+        description="Evaluate single-turn agent traces using agent_filter + smart_filtering (self-contained)."
     )
-    print(f"Evaluation created (id: {eval_object.id})")
-
-    # Compute time window in unix seconds
-    # Pad end_time by +600s (10 min) to avoid ingestion-delay edge exclusion
-    now_unix = int(time.time())
-    end_time = now_unix + 600
-    start_time = now_unix - (args.lookback_hours * 3600)
-
-    # Build trace_source based on mode
-    trace_source: dict = {
-        "type": "agent_filter",
-        "start_time": start_time,
-        "end_time": end_time,
-        "max_traces": args.max_traces,
-        "filter_strategy": "smart_filtering",
-    }
-
-    if args.agent_id:
-        trace_source["agent_id"] = args.agent_id
-        print(f"Using agent_id filter: {args.agent_id}")
-    else:
-        trace_source["agent_name"] = agent_name
-        if agent_version:
-            trace_source["agent_version"] = agent_version
-        print(f"Using agent filter: {agent_name} v{agent_version or '(latest)'}")
-
-    data_source = {
-        "type": "azure_ai_trace_data_source_preview",
-        "trace_source": trace_source,
-    }
-
-    eval_run = client.evals.runs.create(
-        eval_id=eval_object.id,
-        name="trace-evaluation-agent-smart-filter-run",
-        data_source=data_source,  # type: ignore
+    parser.add_argument(
+        "--no-smart-filter",
+        action="store_true",
+        help="Disable filter_strategy='smart_filtering' (evaluate every matching trace).",
     )
-    print(f"Evaluation run created (id: {eval_run.id})")
-
-    while True:
-        run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
-        if run.status in ("completed", "failed"):
-            break
-        print(f"Waiting for eval run to complete... current status: {run.status}")
-        time.sleep(5)
-
-    if run.status == "completed":
-        print("\n✓ Evaluation run completed successfully!")
-        print(f"Result Counts: {run.result_counts}")
-
-        output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
-        print(f"\nOUTPUT ITEMS (Total: {len(output_items)})")
-        print(f"{'-'*60}")
-        pprint(output_items)
-        print(f"{'-'*60}")
-
-        print(f"\nEval Run Report URL: {run.report_url}")
-    else:
-        print(f"\n✗ Evaluation run failed: {run.error}")
-
-    client.evals.delete(eval_id=eval_object.id)
-    print("Evaluation deleted")
+    parser.add_argument(
+        "--max-traces",
+        type=int,
+        default=len(SINGLE_TURN_PROMPTS),
+        help=f"Max traces to evaluate (default: {len(SINGLE_TURN_PROMPTS)} = one per seeded prompt).",
+    )
+    args = parser.parse_args()
+    smart_filter = not args.no_smart_filter
+    effective_max_traces = args.max_traces
+    if smart_filter and effective_max_traces < SMART_FILTERING_MIN_MAX_TRACES:
+        print(
+            f"smart_filtering requires max_traces in [{SMART_FILTERING_MIN_MAX_TRACES}, 1000]; "
+            f"bumping --max-traces from {effective_max_traces} to {SMART_FILTERING_MIN_MAX_TRACES}."
+        )
+        effective_max_traces = SMART_FILTERING_MIN_MAX_TRACES
+
+    run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
+    agent_name = f"st-trace-smart-filter-{run_id}"
+
+    with (
+        DefaultAzureCredential() as credential,
+        AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
+        project_client.get_openai_client() as client,
+    ):
+
+        created_agent = None
+        created_conversation_ids: List[str] = []
+        eval_object = None
+
+        try:
+            # 1. Create an agent that traces will be filtered to.
+            print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).")
+            created_agent = project_client.agents.create_version(
+                agent_name=agent_name,
+                definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS),
+            )
+            print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
+
+            # 2. Seed single-turn prompts and capture the seeding window.
+            # Pre-seed buffer must exceed the service's 5-min edge exclusion.
+            seed_start_unix = int(time.time()) - AGENT_FILTER_EDGE_BUFFER_SECONDS
+            print(f"Seed {len(SINGLE_TURN_PROMPTS)} single-turn prompt(s) against the agent.")
+            for prompt in SINGLE_TURN_PROMPTS:
+                conversation = client.conversations.create()
+                created_conversation_ids.append(conversation.id)
+                print(f"  - conversation id: {conversation.id} (prompt: {prompt!r})")
+                client.responses.create(
+                    conversation=conversation.id,
+                    input=prompt,
+                    extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}},
+                )
+
+            print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True)
+            time.sleep(INITIAL_INGEST_WAIT_SECONDS)
+
+            # 3. Create the trace-based evaluation group (single-turn evaluators).
+            data_source_config = {
+                "type": "azure_ai_source",
+                "scenario": "traces",
+            }
+
+            testing_criteria = [
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="task_completion",
+                    evaluator_name="builtin.task_completion",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}",
+                    },
+                ),
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="coherence",
+                    evaluator_name="builtin.coherence",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}",
+                    },
+                ),
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="violence",
+                    evaluator_name="builtin.violence",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={
+                        "query": "{{item.query}}",
+                        "response": "{{item.response}}",
+                    },
+                ),
+            ]
+
+            print("Create trace-based evaluation group.")
+            eval_object = client.evals.create(
+                name=f"Trace Evaluation (Agent Smart Filter) {run_id}",
+                data_source_config=data_source_config,  # type: ignore
+                testing_criteria=testing_criteria,
+            )
+            print(f"Evaluation created (id: {eval_object.id}).")
+
+            # 4. Submit eval runs with agent_filter narrowed to the seeding window.
+            # Pad end_time so the last seeded span is >5 min from the upper edge
+            # and enforce the service-side 15-min minimum window.
+            run = None
+            for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
+                end_time_unix = max(
+                    int(time.time()) + AGENT_FILTER_EDGE_BUFFER_SECONDS,
+                    seed_start_unix + MIN_AGENT_FILTER_WINDOW_SECONDS,
+                )
+
+                trace_source = {
+                    "type": "agent_filter",
+                    "agent_name": created_agent.name,
+                    "agent_version": str(created_agent.version),
+                    "start_time": seed_start_unix,
+                    "end_time": end_time_unix,
+                    "max_traces": effective_max_traces,
+                }
+                # Alternative shape: pass a single "name:version" string via `agent_id`:
+                #   trace_source["agent_id"] = f"{created_agent.name}:{created_agent.version}"
+                if smart_filter:
+                    trace_source["filter_strategy"] = "smart_filtering"
+
+                data_source = {
+                    "type": "azure_ai_trace_data_source_preview",
+                    "trace_source": trace_source,
+                }
+
+                print(
+                    f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent "
+                    f"`{created_agent.name}` v{created_agent.version} "
+                    f"(window: {seed_start_unix}..{end_time_unix}, max_traces={effective_max_traces}"
+                    f"{', smart_filtering' if smart_filter else ''})."
+                )
+                eval_run = client.evals.runs.create(
+                    eval_id=eval_object.id,
+                    name=f"agent-smart-filter-{run_id}-a{attempt}",
+                    data_source=data_source,  # type: ignore
+                )
+                print(f"Eval run created (id: {eval_run.id}).")
+
+                print("Poll eval run until terminal.", end="", flush=True)
+                while True:
+                    run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
+                    if run.status in TERMINAL_STATUSES:
+                        break
+                    time.sleep(POLL_INTERVAL_SECONDS)
+                    print(".", end="", flush=True)
+                print()
+                print(f"Final run status: `{run.status}`.")
+
+                if run.status == "completed":
+                    output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
+                    if output_items:
+                        print(f"Run produced {len(output_items)} output item(s).")
+                        print(f"Result counts: {run.result_counts}")
+                        print(f"{'-' * 60}")
+                        pprint(output_items)
+                        print(f"{'-' * 60}")
+                        print(f"Eval run report URL: {run.report_url}")
+                        break
+                    print(
+                        f"Run completed but produced 0 output items "
+                        f"(result counts: {run.result_counts}); traces likely not yet ingested."
+                    )
+                else:
+                    print(f"Run did not complete (status: `{run.status}`, error: {run.error}).")
+
+                if attempt == MAX_EVAL_ATTEMPTS:
+                    raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.")
+                print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True)
+                time.sleep(RETRY_WAIT_SECONDS)
+
+        finally:
+            # Best-effort cleanup: eval object -> seeded conversations -> agent.
+            if eval_object is not None:
+                try:
+                    client.evals.delete(eval_id=eval_object.id)
+                    print(f"Deleted evaluation `{eval_object.id}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete evaluation: {exc}")
+
+            for cid in created_conversation_ids:
+                try:
+                    client.conversations.delete(conversation_id=cid)
+                    print(f"Deleted seeded conversation `{cid}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+
+            if created_agent is not None:
+                try:
+                    project_client.agents.delete_version(
+                        agent_name=created_agent.name,
+                        agent_version=created_agent.version,
+                    )
+                    print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete agent: {exc}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py
index a5e88ab53301..a2a7d9ec3cc2 100644
--- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py
+++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_evaluations_builtin_with_traces.py
@@ -1,4 +1,4 @@
-# pylint: disable=line-too-long,useless-suppression,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,unused-argument
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
@@ -6,75 +6,98 @@
 
 """
 DESCRIPTION:
-    Given an AIProjectClient, this sample demonstrates how to run Azure AI Evaluations
-    against agent traces collected in Azure Application Insights.
-
-    Supports three modes:
-      - Default mode (no flags): Queries Application Insights client-side for trace IDs
-        using the AGENT_ID environment variable, then passes them to the eval service.
-      - Agent ID mode (--agent-id): Passes the agent ID directly to the eval service,
-        which resolves traces server-side from Application Insights.
-      - Trace ID mode (--trace-ids): Passes explicit trace IDs to the eval service.
+    Self-contained sample that runs Azure AI built-in evaluators against agent
+    traces resolved server-side by `agent_id`.
+
+    Steps:
+      1. Creates a transient agent.
+      2. Seeds a few single-turn prompts so the service emits traces into
+         Application Insights.
+      3. Creates a trace-based evaluation group with single-turn evaluators.
+      4. Submits an evaluation run that uses the `azure_ai_traces` data source
+         with `agent_id="<name>:<version>"`; the service resolves traces
+         server-side. Retries the run if Application Insights ingestion is
+         still in flight.
+      5. Cleans up the evaluation, seeded conversations, and agent.
+
+    Prerequisite: the project must have an Application Insights resource
+    connected so the agent emits server-side traces. No `APPINSIGHTS_RESOURCE_ID`
+    or `AGENT_ID` env vars are required - everything is self-contained.
 
 USAGE:
     python sample_evaluations_builtin_with_traces.py
-    python sample_evaluations_builtin_with_traces.py --agent-id "my-agent:1"
-    python sample_evaluations_builtin_with_traces.py --trace-ids abc123 def456
-    python sample_evaluations_builtin_with_traces.py --agent-id "my-agent:1" --lookback-hours 48 --max-traces 20
-    python sample_evaluations_builtin_with_traces.py --no-cleanup
+    python sample_evaluations_builtin_with_traces.py --max-traces 10
+    python sample_evaluations_builtin_with_traces.py --lookback-hours 2
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.0.0" python-dotenv azure-monitor-query
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
 
     Set these environment variables with your own values:
-    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your
-       Microsoft Foundry project. It has the form: https://<account_name>.services.ai.azure.com/api/projects/<project_name>.
-    2) APPINSIGHTS_RESOURCE_ID - Required (for default mode). The Azure Application Insights resource ID that stores
-       agent traces. Not needed when using --agent-id or --trace-ids.
-       It has the form: /subscriptions/<subscription_id>/resourceGroups/<rg_name>/providers/Microsoft.Insights/components/<resource_name>.
-    3) AGENT_ID - Required. The agent identifier emitted by the Azure tracing integration, used to filter traces.
-    4) FOUNDRY_MODEL_NAME - Required. The Azure OpenAI deployment name to use with the built-in evaluators.
-    5) TRACE_LOOKBACK_HOURS - Optional. Number of hours to look back when querying traces and in the evaluation run.
-       Defaults to 1.
+    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
+       found in the overview page of your Microsoft Foundry project.
+    2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to
+       drive the agent during trace seeding and to power the AI-assisted
+       evaluators.
 """
 
 import argparse
 import os
 import time
-from datetime import datetime, timedelta, timezone
+import uuid
+from datetime import datetime, timezone
 from pprint import pprint
-from typing import Any, Dict, List, Optional
+from typing import List
+
 from dotenv import load_dotenv
+
 from azure.identity import DefaultAzureCredential
-from azure.monitor.query import LogsQueryClient, LogsQueryStatus
 from azure.ai.projects import AIProjectClient
-from azure.ai.projects.models import (
-    TestingCriterionAzureAIEvaluator,
-)
+from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator
 
 load_dotenv()
 
 
+AGENT_INSTRUCTIONS = (
+    "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the "
+    "customer's issue when possible. Policies you can quote:\n"
+    " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n"
+    " - Exchanges: same window as refunds; exchanges do not include store credit.\n"
+    " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n"
+    " - You cannot place orders or process refunds directly; direct the customer to the website "
+    "   or store. Always close with a confirmation that the customer's question is answered."
+)
+# Single-turn prompts: each prompt is seeded as its own one-turn conversation so
+# the service emits one trace per prompt.
+SINGLE_TURN_PROMPTS: List[str] = [
+    "I bought a widget last week and it stopped working - what are my options?",
+    "What is the return window for unopened widgets?",
+    "Can I get store credit if I exchange a defective gizmo?",
+    "How much does shipping cost for a small replacement part?",
+    "How long does a refund take to show up on my card?",
+]
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
-appinsights_resource_id = os.environ[
-    "APPINSIGHTS_RESOURCE_ID"
-]  # Sample : /subscriptions/<subscription_id>/resourceGroups/<rg_name>/providers/Microsoft.Insights/components/<resource_name>
-agent_id = os.environ["AGENT_ID"]
 model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]
-default_lookback_hours = int(os.environ.get("TRACE_LOOKBACK_HOURS", "1"))
 
+POLL_INTERVAL_SECONDS = 5
+INITIAL_INGEST_WAIT_SECONDS = 60
+MAX_EVAL_ATTEMPTS = 5
+RETRY_WAIT_SECONDS = 60
+
+TERMINAL_STATUSES = {"completed", "failed", "canceled"}
 
-def _build_evaluator_config(name: str, evaluator_name: str) -> TestingCriterionAzureAIEvaluator:
-    """Create a standard Azure AI evaluator configuration block for trace evaluations."""
+
+def _build_evaluator(name: str, evaluator_name: str) -> TestingCriterionAzureAIEvaluator:
+    """Standard single-turn evaluator config for the `azure_ai_traces` data source."""
     return TestingCriterionAzureAIEvaluator(
         type="azure_ai_evaluator",
         name=name,
         evaluator_name=evaluator_name,
         data_mapping={
-            "query": "{{sample.query}}",
-            "response": "{{sample.response}}",
-            "tool_definitions": "{{sample.tool_definitions}}",
+            "query": "{{item.query}}",
+            "response": "{{item.response}}",
+            "tool_definitions": "{{item.tool_definitions}}",
         },
         initialization_parameters={
             "deployment_name": model_deployment_name,
@@ -82,181 +105,163 @@ def _build_evaluator_config(name: str, evaluator_name: str) -> TestingCriterionA
     )
 
 
-def get_trace_ids(
-    appinsight_resource_id: str, tracked_agent_id: str, start_time: datetime, end_time: datetime
-) -> List[str]:
-    """
-    Query Application Insights for trace IDs (operation_Id) based on agent ID and time range.
-
-    Args:
-        appinsight_resource_id: The resource ID of the Application Insights instance.
-        tracked_agent_id: The agent ID to filter by.
-        start_time: Start time for the query.
-        end_time: End time for the query.
-
-    Returns:
-        List of distinct operation IDs (trace IDs).
-    """
-    query = """
-dependencies
-| where timestamp between (datetime({start_time.isoformat()}) .. datetime({end_time.isoformat()}))
-| extend agent_id = tostring(customDimensions["gen_ai.agent.id"])
-| where agent_id == "{tracked_agent_id}"
-| distinct operation_Id
-"""
-
-    try:
-        with DefaultAzureCredential() as credential:
-            client = LogsQueryClient(credential)
-            response = client.query_resource(
-                appinsight_resource_id,
-                query=query,
-                timespan=None,  # Time range is specified in the query itself.
-            )
-    except Exception as exc:  # pylint: disable=broad-except
-        print(f"Error executing query: {exc}")
-        return []
-
-    if response.status == LogsQueryStatus.SUCCESS:
-        trace_ids: List[str] = []
-        for table in response.tables:
-            for row in table.rows:
-                trace_ids.append(row[0])
-        return trace_ids
-
-    print(f"Query failed with status: {response.status}")
-    if response.partial_error:
-        print(f"Partial error: {response.partial_error}")
-    return []
-
-
 def main() -> None:  # pylint: disable=too-many-statements
-    parser = argparse.ArgumentParser(description="Run Azure AI trace evaluations against agent traces.")
-    mode = parser.add_mutually_exclusive_group()
-    mode.add_argument("--agent-id", default=None, help="Agent ID for server-side trace resolution")
-    mode.add_argument("--trace-ids", nargs="+", default=None, help="Explicit trace IDs to evaluate")
-    parser.add_argument("--lookback-hours", type=int, default=None, help="Lookback window in hours")
-    parser.add_argument("--max-traces", type=int, default=50, help="Max traces in agent-id mode (default: 50)")
-    parser.add_argument("--no-cleanup", action="store_true", help="Keep eval group after run")
+    parser = argparse.ArgumentParser(
+        description="Run built-in trace evaluators against an agent's traces (self-contained)."
+    )
+    parser.add_argument(
+        "--max-traces",
+        type=int,
+        default=len(SINGLE_TURN_PROMPTS),
+        help=f"Max traces to evaluate (default: {len(SINGLE_TURN_PROMPTS)} = one per seeded prompt).",
+    )
+    parser.add_argument(
+        "--lookback-hours",
+        type=int,
+        default=1,
+        help="Hours to look back when resolving traces server-side (default: 1).",
+    )
     args = parser.parse_args()
 
-    lookback_hours = args.lookback_hours or default_lookback_hours
-    trace_ids: Optional[List[str]] = None
-    agent_id_for_server: Optional[str] = None
-    metadata: Dict[str, str] = {}
-
-    if args.agent_id:
-        agent_id_for_server = args.agent_id
-        print("Mode: Server-side agent ID resolution")
-        print(f"Agent ID: {args.agent_id}")
-        print(f"Lookback: {lookback_hours}h, Max traces: {args.max_traces}")
-        metadata["agent_id"] = args.agent_id
-
-    elif args.trace_ids:
-        trace_ids = list(args.trace_ids)
-        print(f"Mode: Explicit trace IDs ({len(trace_ids)} provided)")
-
-    else:
-        end_time = datetime.now(tz=timezone.utc)
-        start_time = end_time - timedelta(hours=lookback_hours)
-
-        print("Querying Application Insights for trace identifiers...")
-        print(f"Agent ID: {agent_id}")
-        print(f"Time range: {start_time.isoformat()} to {end_time.isoformat()}")
-
-        trace_ids = get_trace_ids(appinsights_resource_id, agent_id, start_time, end_time)
-
-        if not trace_ids:
-            print("No trace IDs found for the provided agent and time window.")
-            return
-
-        print(f"\nFound {len(trace_ids)} trace IDs:")
-        for tid in trace_ids:
-            print(f"  - {tid}")
-
-        metadata["agent_id"] = agent_id
-        metadata["start_time"] = start_time.isoformat()
-        metadata["end_time"] = end_time.isoformat()
-
-    with DefaultAzureCredential() as credential:
-        with AIProjectClient(endpoint=endpoint, credential=credential) as project_client:
-            client = project_client.get_openai_client()
-
+    run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
+    agent_name = f"builtin-traces-{run_id}"
+
+    with (
+        DefaultAzureCredential() as credential,
+        AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
+        project_client.get_openai_client() as client,
+    ):
+
+        created_agent = None
+        created_conversation_ids: List[str] = []
+        eval_object = None
+
+        try:
+            # 1. Create the agent whose traces the evaluation run will target.
+            print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).")
+            created_agent = project_client.agents.create_version(
+                agent_name=agent_name,
+                definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS),
+            )
+            print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
+
+            # 2. Seed single-turn prompts so the service emits traces.
+            print(f"Seed {len(SINGLE_TURN_PROMPTS)} single-turn prompt(s) against the agent.")
+            for prompt in SINGLE_TURN_PROMPTS:
+                conversation = client.conversations.create()
+                created_conversation_ids.append(conversation.id)
+                print(f"  - conversation id: {conversation.id} (prompt: {prompt!r})")
+                client.responses.create(
+                    conversation=conversation.id,
+                    input=prompt,
+                    extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}},
+                )
+
+            print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True)
+            time.sleep(INITIAL_INGEST_WAIT_SECONDS)
+
+            # 3. Create the trace-based evaluation group (single-turn evaluators).
             data_source_config = {
                 "type": "azure_ai_source",
                 "scenario": "traces",
             }
 
             testing_criteria = [
-                _build_evaluator_config(
-                    name="intent_resolution",
-                    evaluator_name="builtin.intent_resolution",
-                ),
-                _build_evaluator_config(
-                    name="task_adherence",
-                    evaluator_name="builtin.task_adherence",
-                ),
+                _build_evaluator(name="intent_resolution", evaluator_name="builtin.intent_resolution"),
+                _build_evaluator(name="task_adherence", evaluator_name="builtin.task_adherence"),
             ]
 
-            print("\nCreating evaluation")
+            print("Create trace-based evaluation group.")
             eval_object = client.evals.create(
-                name="agent_trace_eval_group",
+                name=f"Builtin Trace Evaluation {run_id}",
                 data_source_config=data_source_config,  # type: ignore
                 testing_criteria=testing_criteria,  # type: ignore
             )
-            print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})")
-
-            print("\nGet Evaluation by Id")
-            eval_object_response = client.evals.retrieve(eval_object.id)
-            print("Evaluation Response:")
-            pprint(eval_object_response)
-
-            # Build data source based on mode
-            if agent_id_for_server:
-                data_source: Dict[str, Any] = {
+            print(f"Evaluation created (id: {eval_object.id}).")
+
+            # 4. Submit eval runs using the `azure_ai_traces` data source with
+            # agent_id set to "<name>:<version>"; the service resolves matching
+            # traces server-side from Application Insights.
+            agent_id_for_server = f"{created_agent.name}:{created_agent.version}"
+            run = None
+            for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
+                data_source = {
                     "type": "azure_ai_traces",
                     "agent_id": agent_id_for_server,
-                    "lookback_hours": lookback_hours,
+                    "lookback_hours": args.lookback_hours,
                     "max_traces": args.max_traces,
                 }
-            else:
-                assert trace_ids is not None
-                data_source = {
-                    "type": "azure_ai_traces",
-                    "trace_ids": trace_ids,
-                    "lookback_hours": lookback_hours,
-                }
 
-            print("\nCreating Eval Run")
-            run_name = f"agent_trace_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-            eval_run_object = client.evals.runs.create(
-                eval_id=eval_object.id,
-                name=run_name,
-                metadata=metadata if metadata else None,
-                data_source=data_source,  # type: ignore
-            )
-            print("Eval Run created")
-            pprint(eval_run_object)
-
-            print("\nMonitoring Eval Run status...")
-            while True:
-                run = client.evals.runs.retrieve(run_id=eval_run_object.id, eval_id=eval_object.id)
-                print(f"Status: {run.status}")
-
-                if run.status in {"completed", "failed", "canceled"}:
-                    print("\nEval Run finished!")
-                    print("Final Eval Run Response:")
-                    pprint(run)
-                    break
-
-                time.sleep(5)
-                print("Waiting for eval run to complete...")
-
-            if not args.no_cleanup:
-                client.evals.delete(eval_id=eval_object.id)
-                print("Evaluation deleted")
-            else:
-                print(f"Skipping cleanup (--no-cleanup). Eval ID: {eval_object.id}")
+                print(
+                    f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent_id "
+                    f"`{agent_id_for_server}` (lookback_hours={args.lookback_hours}, "
+                    f"max_traces={args.max_traces})."
+                )
+                eval_run = client.evals.runs.create(
+                    eval_id=eval_object.id,
+                    name=f"builtin-traces-{run_id}-a{attempt}",
+                    metadata={"agent_id": agent_id_for_server},
+                    data_source=data_source,  # type: ignore
+                )
+                print(f"Eval run created (id: {eval_run.id}).")
+
+                print("Poll eval run until terminal.", end="", flush=True)
+                while True:
+                    run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
+                    if run.status in TERMINAL_STATUSES:
+                        break
+                    time.sleep(POLL_INTERVAL_SECONDS)
+                    print(".", end="", flush=True)
+                print()
+                print(f"Final run status: `{run.status}`.")
+
+                if run.status == "completed":
+                    output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
+                    if output_items:
+                        print(f"Run produced {len(output_items)} output item(s).")
+                        print(f"Result counts: {run.result_counts}")
+                        print(f"{'-' * 60}")
+                        pprint(output_items)
+                        print(f"{'-' * 60}")
+                        print(f"Eval run report URL: {run.report_url}")
+                        break
+                    print(
+                        f"Run completed but produced 0 output items "
+                        f"(result counts: {run.result_counts}); traces likely not yet ingested."
+                    )
+                else:
+                    print(f"Run did not complete (status: `{run.status}`, error: {run.error}).")
+
+                if attempt == MAX_EVAL_ATTEMPTS:
+                    raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.")
+                print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True)
+                time.sleep(RETRY_WAIT_SECONDS)
+
+        finally:
+            # Best-effort cleanup: eval object -> seeded conversations -> agent.
+            if eval_object is not None:
+                try:
+                    client.evals.delete(eval_id=eval_object.id)
+                    print(f"Deleted evaluation `{eval_object.id}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete evaluation: {exc}")
+
+            for cid in created_conversation_ids:
+                try:
+                    client.conversations.delete(conversation_id=cid)
+                    print(f"Deleted seeded conversation `{cid}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+
+            if created_agent is not None:
+                try:
+                    project_client.agents.delete_version(
+                        agent_name=created_agent.name,
+                        agent_version=created_agent.version,
+                    )
+                    print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete agent: {exc}")
 
 
 if __name__ == "__main__":
diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py
index 7a65c9f6b4ba..bd78bff71e23 100644
--- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py
+++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_agent_filter.py
@@ -6,177 +6,299 @@
 
 """
 DESCRIPTION:
-    Given an AIProjectClient, this sample demonstrates how to evaluate multi-turn
-    agent conversations by filtering traces from Application Insights using an
-    agent name/version or agent ID, with optional smart filtering.
+    Self-contained sample that evaluates multi-turn agent conversations by
+    filtering Application Insights traces for a specific agent over a time
+    window.
 
-    This is Scenario 3 of multi-turn evaluations: instead of providing specific
-    conversation or trace IDs, you specify an agent identity and a time window.
-    The service samples traces from App Insights matching that agent and evaluates
-    the reconstructed conversations.
+    Steps:
+      1. Creates a transient agent.
+      2. Seeds a few multi-turn conversations against the agent so the service
+         emits traces into Application Insights.
+      3. Creates a trace-based evaluation group with conversation-level
+         evaluators.
+      4. Submits an evaluation run with `agent_filter` (agent_name +
+         agent_version, time window padded around the seeding interval).
+         Retries the run if Application Insights ingestion is still in flight.
+      5. Cleans up the evaluation, seeded conversations, and agent.
 
-    Three agent filter forms are supported:
-      - agent_name + agent_version: Specify the agent by name and version separately.
-      - agent_id: Specify the agent as a single "name:version" string.
-      - smart_filtering: Use filter_strategy="smart_filtering" to bias trace
-        selection toward more interesting conversations.
+    Prerequisite: the project must have an Application Insights resource
+    connected so the agent emits server-side traces.
+
+    The `agent_filter` shape also supports:
+      - `agent_id`: a single "name:version" string (see comment in code).
+      - `filter_strategy="smart_filtering"`: biases trace selection toward more
+        interesting conversations (enabled via --smart-filter).
 
 USAGE:
     python sample_multiturn_trace_evaluation_agent_filter.py
-    python sample_multiturn_trace_evaluation_agent_filter.py --agent-id "my-agent:1"
     python sample_multiturn_trace_evaluation_agent_filter.py --smart-filter
+    python sample_multiturn_trace_evaluation_agent_filter.py --max-traces 5
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.0.0" python-dotenv
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
 
     Set these environment variables with your own values:
-    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint.
-    2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators.
-    3) FOUNDRY_AGENT_NAME - Required. The name of the agent whose traces to evaluate.
-    4) FOUNDRY_AGENT_VERSION - Optional. The agent version. If not set, latest is used.
+    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
+       found in the overview page of your Microsoft Foundry project.
+    2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to
+       drive the agent during trace seeding and to power the AI-assisted
+       evaluators.
 """
 
 import argparse
 import os
 import time
+import uuid
+from datetime import datetime, timezone
 from pprint import pprint
+from typing import List
+
 from dotenv import load_dotenv
+
 from azure.identity import DefaultAzureCredential
 from azure.ai.projects import AIProjectClient
-from azure.ai.projects.models import TestingCriterionAzureAIEvaluator
+from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator
 
 load_dotenv()
 
+
+AGENT_INSTRUCTIONS = (
+    "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the "
+    "customer's issue when possible. Policies you can quote:\n"
+    " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n"
+    " - Exchanges: same window as refunds; exchanges do not include store credit.\n"
+    " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n"
+    " - You cannot place orders or process refunds directly; direct the customer to the website "
+    "   or store. Always close with a confirmation that the customer's question is answered."
+)
+CONVERSATION_FLOWS: List[List[str]] = [
+    [
+        "I bought a widget last week and it stopped working.",
+        "It is past the 30 day mark, can I still return it?",
+        "How long will the refund take to process?",
+        "Thanks, that answers my question.",
+    ],
+    [
+        "Do you sell replacement parts for gizmos?",
+        "How much does shipping cost for a small part?",
+        "Got it, I will order it from the website. Thank you.",
+    ],
+    [
+        "What is the difference between an exchange and a refund?",
+        "If I exchange a defective gizmo, do I also get store credit?",
+        "Understood, thanks for clarifying.",
+    ],
+]
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
 model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]
-agent_name = os.environ["FOUNDRY_AGENT_NAME"]
-agent_version = os.environ.get("FOUNDRY_AGENT_VERSION", "")
+
+POLL_INTERVAL_SECONDS = 5
+INITIAL_INGEST_WAIT_SECONDS = 60
+MAX_EVAL_ATTEMPTS = 5
+RETRY_WAIT_SECONDS = 60
+# Service constraints for agent_filter trace_source:
+#   - end_time - start_time must be >= 15 minutes (we use 16 for a 1-minute margin).
+#   - conversation-level queries exclude conversations whose first/last span is
+#     within 5 minutes of either window edge, so we need >5 min of padding on
+#     each side of the actual seeding window.
+MIN_AGENT_FILTER_WINDOW_SECONDS = 16 * 60
+AGENT_FILTER_EDGE_BUFFER_SECONDS = 6 * 60
+
+TERMINAL_STATUSES = {"completed", "failed", "canceled"}
 
 
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Evaluate agent traces using agent filter.")
-    parser.add_argument("--agent-id", default=None, help='Agent ID in "name:version" format')
+def main() -> None:  # pylint: disable=too-many-statements
+    parser = argparse.ArgumentParser(description="Evaluate agent traces using agent_filter (self-contained).")
     parser.add_argument("--smart-filter", action="store_true", help="Use smart_filtering strategy")
-    parser.add_argument("--max-traces", type=int, default=5, help="Max traces to evaluate (default: 5)")
-    parser.add_argument("--lookback-hours", type=int, default=24, help="Hours to look back (default: 24)")
+    parser.add_argument(
+        "--max-traces",
+        type=int,
+        default=len(CONVERSATION_FLOWS),
+        help=f"Max traces to evaluate (default: {len(CONVERSATION_FLOWS)} = one per seeded conversation)",
+    )
     args = parser.parse_args()
 
+    run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
+    agent_name = f"mt-trace-agent-filter-{run_id}"
+
     with (
         DefaultAzureCredential() as credential,
         AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
         project_client.get_openai_client() as client,
     ):
-        # Eval group for trace-based evaluations
-        data_source_config = {
-            "type": "azure_ai_source",
-            "scenario": "traces",
-        }
-
-        testing_criteria = [
-            TestingCriterionAzureAIEvaluator(
-                type="azure_ai_evaluator",
-                name="customer_satisfaction",
-                evaluator_name="builtin.customer_satisfaction",
-                initialization_parameters={"model": model_deployment_name},
-                data_mapping={"messages": "{{item.messages}}"},
-            ),
-            TestingCriterionAzureAIEvaluator(
-                type="azure_ai_evaluator",
-                name="task_completion",
-                evaluator_name="builtin.task_completion",
-                initialization_parameters={"model": model_deployment_name},
-                data_mapping={"messages": "{{item.messages}}"},
-            ),
-            TestingCriterionAzureAIEvaluator(
-                type="azure_ai_evaluator",
-                name="conversation_coherence",
-                evaluator_name="builtin.coherence",
-                initialization_parameters={"model": model_deployment_name},
-                data_mapping={"messages": "{{item.messages}}"},
-            ),
-            TestingCriterionAzureAIEvaluator(
-                type="azure_ai_evaluator",
-                name="groundedness",
-                evaluator_name="builtin.groundedness",
-                initialization_parameters={"model": model_deployment_name},
-                data_mapping={"messages": "{{item.messages}}"},
-            ),
-        ]
-
-        print("Creating trace-based evaluation group")
-        eval_object = client.evals.create(
-            name="Multi-turn Trace Evaluation (Agent Filter)",
-            data_source_config=data_source_config,  # type: ignore
-            testing_criteria=testing_criteria,
-        )
-        print(f"Evaluation created (id: {eval_object.id})")
-
-        # Compute time window in unix seconds
-        # Pad end_time by +600s (10 min) to avoid ingestion-delay edge exclusion
-        now_unix = int(time.time())
-        end_time = now_unix + 600
-        start_time = now_unix - (args.lookback_hours * 3600)
-
-        # Build trace_source based on mode
-        trace_source: dict = {
-            "type": "agent_filter",
-            "start_time": start_time,
-            "end_time": end_time,
-            "max_traces": args.max_traces,
-        }
-
-        if args.agent_id:
-            # agent_id form: single "name:version" string
-            trace_source["agent_id"] = args.agent_id
-            print(f"Using agent_id filter: {args.agent_id}")
-        else:
-            # agent_name + agent_version form
-            trace_source["agent_name"] = agent_name
-            if agent_version:
-                trace_source["agent_version"] = agent_version
-            print(f"Using agent filter: {agent_name} v{agent_version or '(latest)'}")
-
-        if args.smart_filter:
-            trace_source["filter_strategy"] = "smart_filtering"
-            print("Filter strategy: smart_filtering")
-
-        data_source = {
-            "type": "azure_ai_trace_data_source_preview",
-            "trace_source": trace_source,
-        }
-
-        eval_run = client.evals.runs.create(
-            eval_id=eval_object.id,
-            name="multiturn-agent-filter-run",
-            data_source=data_source,  # type: ignore
-            extra_body={"evaluation_level": "conversation"},
-        )
-        print(f"Evaluation run created (id: {eval_run.id})")
-
-        while True:
-            run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
-            if run.status in ("completed", "failed"):
-                break
-            print(f"Waiting for eval run to complete... current status: {run.status}")
-            time.sleep(5)
-
-        if run.status == "completed":
-            print("\n✓ Evaluation run completed successfully!")
-            print(f"Result Counts: {run.result_counts}")
-
-            output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
-            print(f"\nOUTPUT ITEMS (Total: {len(output_items)})")
-            print(f"{'-'*60}")
-            pprint(output_items)
-            print(f"{'-'*60}")
-
-            print(f"\nEval Run Report URL: {run.report_url}")
-        else:
-            print(f"\n✗ Evaluation run failed: {run.error}")
-
-        client.evals.delete(eval_id=eval_object.id)
-        print("Evaluation deleted")
+
+        created_agent = None
+        created_conversation_ids: List[str] = []
+        eval_object = None
+
+        try:
+            # 1. Create the agent whose traces the evaluation run will target.
+            print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).")
+            created_agent = project_client.agents.create_version(
+                agent_name=agent_name,
+                definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS),
+            )
+            print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
+
+            # 2. Seed multi-turn conversations and capture the seeding window.
+            # Pre-seed buffer must exceed the service's 5-min edge exclusion for
+            # conversation-level queries.
+            seed_start_unix = int(time.time()) - AGENT_FILTER_EDGE_BUFFER_SECONDS
+            print(f"Seed {len(CONVERSATION_FLOWS)} multi-turn conversation(s) against the agent.")
+            for flow in CONVERSATION_FLOWS:
+                conversation = client.conversations.create()
+                created_conversation_ids.append(conversation.id)
+                print(f"  - conversation id: {conversation.id} ({len(flow)} turn(s))")
+                for turn in flow:
+                    client.responses.create(
+                        conversation=conversation.id,
+                        input=turn,
+                        extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}},
+                    )
+
+            print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True)
+            time.sleep(INITIAL_INGEST_WAIT_SECONDS)
+
+            # 3. Create the trace-based evaluation group (conversation-level evaluators).
+            data_source_config = {
+                "type": "azure_ai_source",
+                "scenario": "traces",
+            }
+
+            testing_criteria = [
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="customer_satisfaction",
+                    evaluator_name="builtin.customer_satisfaction",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={"messages": "{{item.messages}}"},
+                ),
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="task_completion",
+                    evaluator_name="builtin.task_completion",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={"messages": "{{item.messages}}"},
+                ),
+                TestingCriterionAzureAIEvaluator(
+                    type="azure_ai_evaluator",
+                    name="conversation_coherence",
+                    evaluator_name="builtin.coherence",
+                    initialization_parameters={"model": model_deployment_name},
+                    data_mapping={"messages": "{{item.messages}}"},
+                ),
+            ]
+
+            print("Create trace-based evaluation group.")
+            eval_object = client.evals.create(
+                name=f"Multi-turn Trace Evaluation (Agent Filter) {run_id}",
+                data_source_config=data_source_config,  # type: ignore
+                testing_criteria=testing_criteria,
+            )
+            print(f"Evaluation created (id: {eval_object.id}).")
+
+            # 4. Submit eval runs with agent_filter narrowed to the seeding window.
+            # Pad end_time so the last seeded span is >5 min from the upper edge
+            # (conversation-level edge exclusion) and enforce the service-side
+            # 15-min minimum window.
+            run = None
+            for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
+                end_time_unix = max(
+                    int(time.time()) + AGENT_FILTER_EDGE_BUFFER_SECONDS,
+                    seed_start_unix + MIN_AGENT_FILTER_WINDOW_SECONDS,
+                )
+
+                trace_source = {
+                    "type": "agent_filter",
+                    "agent_name": created_agent.name,
+                    "agent_version": str(created_agent.version),
+                    "start_time": seed_start_unix,
+                    "end_time": end_time_unix,
+                    "max_traces": args.max_traces,
+                }
+                # Alternative shape: pass a single "name:version" string via `agent_id`:
+                #   trace_source["agent_id"] = f"{created_agent.name}:{created_agent.version}"
+                if args.smart_filter:
+                    trace_source["filter_strategy"] = "smart_filtering"
+
+                data_source = {
+                    "type": "azure_ai_trace_data_source_preview",
+                    "trace_source": trace_source,
+                }
+
+                print(
+                    f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) for agent "
+                    f"`{created_agent.name}` v{created_agent.version} "
+                    f"(window: {seed_start_unix}..{end_time_unix}, max_traces={args.max_traces}"
+                    f"{', smart_filtering' if args.smart_filter else ''})."
+                )
+                eval_run = client.evals.runs.create(
+                    eval_id=eval_object.id,
+                    name=f"multiturn-agent-filter-{run_id}-a{attempt}",
+                    data_source=data_source,  # type: ignore
+                    extra_body={"evaluation_level": "conversation"},
+                )
+                print(f"Eval run created (id: {eval_run.id}).")
+
+                print("Poll eval run until terminal.", end="", flush=True)
+                while True:
+                    run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
+                    if run.status in TERMINAL_STATUSES:
+                        break
+                    time.sleep(POLL_INTERVAL_SECONDS)
+                    print(".", end="", flush=True)
+                print()
+                print(f"Final run status: `{run.status}`.")
+
+                if run.status == "completed":
+                    output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
+                    if output_items:
+                        print(f"Run produced {len(output_items)} output item(s).")
+                        print(f"Result counts: {run.result_counts}")
+                        print(f"{'-' * 60}")
+                        pprint(output_items)
+                        print(f"{'-' * 60}")
+                        print(f"Eval run report URL: {run.report_url}")
+                        break
+                    print(
+                        f"Run completed but produced 0 output items "
+                        f"(result counts: {run.result_counts}); traces likely not yet ingested."
+                    )
+                else:
+                    print(f"Run did not complete (status: `{run.status}`, error: {run.error}).")
+
+                if attempt == MAX_EVAL_ATTEMPTS:
+                    raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.")
+                print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True)
+                time.sleep(RETRY_WAIT_SECONDS)
+
+        finally:
+            # Best-effort cleanup: eval object -> seeded conversations -> agent.
+            if eval_object is not None:
+                try:
+                    client.evals.delete(eval_id=eval_object.id)
+                    print(f"Deleted evaluation `{eval_object.id}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete evaluation: {exc}")
+
+            for cid in created_conversation_ids:
+                try:
+                    client.conversations.delete(conversation_id=cid)
+                    print(f"Deleted seeded conversation `{cid}`.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete conversation `{cid}`: {exc}")
+
+            if created_agent is not None:
+                try:
+                    project_client.agents.delete_version(
+                        agent_name=created_agent.name,
+                        agent_version=created_agent.version,
+                    )
+                    print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.")
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    print(f"  (warning) could not delete agent: {exc}")
 
 
 if __name__ == "__main__":
diff --git a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py
index f8117be7ae3b..355d2f89b2d8 100644
--- a/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py
+++ b/sdk/ai/azure-ai-projects/samples/evaluations/sample_multiturn_trace_evaluation_by_id.py
@@ -6,161 +6,263 @@
 
 """
 DESCRIPTION:
-    Given an AIProjectClient, this sample demonstrates how to evaluate multi-turn
-    conversations captured as agent traces in Application Insights, using specific
-    conversation IDs or trace IDs to select which conversations to evaluate.
+    Self-contained sample that evaluates multi-turn agent conversations captured
+    as agent traces in Application Insights, selecting them by Foundry
+    conversation IDs.
 
-    This is Scenario 2 of multi-turn evaluations: you provide known conversation
-    or trace identifiers, and the service reconstructs the messages from App Insights
-    traces, then runs conversation-level evaluators against them.
+    Steps:
+      1. Creates a transient agent.
+      2. Seeds a few multi-turn conversations against the agent so that the
+         service emits traces into Application Insights.
+      3. Creates a trace-based evaluation group with conversation-level
+         evaluators.
+      4. Submits an evaluation run that targets the seeded conversations by
+         `conversation_id_source`. Retries the run if the traces have not
+         finished ingesting into App Insights yet.
+      5. Cleans up the evaluation, seeded conversations, and agent.
 
-    Two modes are supported:
-      - conversation_id_source: Provide Foundry conversation IDs.
-      - trace_id_source: Provide W3C trace IDs (operation_Id from App Insights).
+    Prerequisite: the project must have an Application Insights resource
+    connected so the agent emits server-side traces.
+
+    Two `trace_source` shapes are supported by the service:
+      - `conversation_id_source` - the Foundry conversation IDs returned by
+        `client.conversations.create()` (used here).
+      - `trace_id_source` - W3C trace IDs (`operation_Id` from App Insights);
+        see the commented snippet below for the alternative shape.
 
 USAGE:
     python sample_multiturn_trace_evaluation_by_id.py
 
     Before running the sample:
 
-    pip install "azure-ai-projects>=2.0.0" python-dotenv
+    pip install "azure-ai-projects>=2.2.0" azure-identity python-dotenv
 
     Set these environment variables with your own values:
-    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint.
-    2) FOUNDRY_MODEL_NAME - Required. The model deployment name for AI-assisted evaluators.
-    3) FOUNDRY_CONVERSATION_IDS - Required (for conversation_id mode). Comma-separated
-       Foundry conversation IDs to evaluate.
-       Example: "conv_abc123,conv_def456,conv_ghi789"
-    4) FOUNDRY_TRACE_IDS - Optional (for trace_id mode). Comma-separated W3C trace IDs.
-       If set, overrides conversation IDs.
+    1) FOUNDRY_PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as
+       found in the overview page of your Microsoft Foundry project.
+    2) FOUNDRY_MODEL_NAME - Required. The model deployment name used both to
+       drive the agent during trace seeding and to power the AI-assisted
+       evaluators.
 """
 
 import os
 import time
+import uuid
+from datetime import datetime, timezone
 from pprint import pprint
+from typing import List
+
 from dotenv import load_dotenv
+
 from azure.identity import DefaultAzureCredential
 from azure.ai.projects import AIProjectClient
-from azure.ai.projects.models import TestingCriterionAzureAIEvaluator
+from azure.ai.projects.models import PromptAgentDefinition, TestingCriterionAzureAIEvaluator
 
 load_dotenv()
 
+# System prompt for the temporary agent; the adjacent literals below are joined into one string.
+AGENT_INSTRUCTIONS = (
+    "Widgets & Gizmos support agent. Be concise, empathetic, and resolve the "
+    "customer's issue when possible. Policies you can quote:\n"
+    " - Refunds: unopened 30 days; defective up to 90 days; refunds take 5-7 business days.\n"
+    " - Exchanges: same window as refunds; exchanges do not include store credit.\n"
+    " - Replacement parts: available for gizmos; flat $4.99 shipping for small parts.\n"
+    " - You cannot place orders or process refunds directly; direct the customer to the website "
+    "or store. Always close with a confirmation that the customer's question is answered."
+)
+# Each entry is one conversation; each string is one user turn, sent in order.
+# Multi-turn flows exercise the conversation-level evaluators (task_completion,
+# customer_satisfaction, ...). The final user turn closes the conversation so
+# task_completion can recognize that the agent reached a resolution.
+CONVERSATION_FLOWS: List[List[str]] = [
+    [
+        "I bought a widget two months ago and it stopped working.",
+        "It is past the 30 day mark, can I still return it?",
+        "How long will the refund take to process?",
+        "Thanks, that answers my question.",
+    ],
+    [
+        "Do you sell replacement parts for gizmos?",
+        "How much does shipping cost for a small part?",
+        "Got it, I will order it from the website. Thank you.",
+    ],
+    [
+        "What is the difference between an exchange and a refund?",
+        "If I exchange a defective gizmo, do I also get store credit?",
+        "Understood, thanks for clarifying.",
+    ],
+]
+
 endpoint = os.environ["FOUNDRY_PROJECT_ENDPOINT"]
 model_deployment_name = os.environ["FOUNDRY_MODEL_NAME"]
 
-# Choose one: conversation IDs or trace IDs
-conversation_ids_str = os.environ.get("FOUNDRY_CONVERSATION_IDS", "")
-trace_ids_str = os.environ.get("FOUNDRY_TRACE_IDS", "")
+POLL_INTERVAL_SECONDS = 5
+INITIAL_INGEST_WAIT_SECONDS = 60
+MAX_EVAL_ATTEMPTS = 5
+RETRY_WAIT_SECONDS = 60
+
+# Per-run id keeps the agent name unique across repeated runs.
+run_id = f"{datetime.now(tz=timezone.utc).strftime('%y%m%d%H%M%S')}-{uuid.uuid4().hex[:4]}"
+agent_name = f"mt-trace-by-id-{run_id}"
+
+TERMINAL_STATUSES = {"completed", "failed", "canceled"}
+
 
 with (
     DefaultAzureCredential() as credential,
     AIProjectClient(endpoint=endpoint, credential=credential) as project_client,
     project_client.get_openai_client() as client,
 ):
-    # Eval group for trace-based evaluations uses azure_ai_source with scenario "traces"
-    data_source_config = {
-        "type": "azure_ai_source",
-        "scenario": "traces",
-    }
-
-    # Conversation-level evaluators for trace data
-    testing_criteria = [
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="customer_satisfaction",
-            evaluator_name="builtin.customer_satisfaction",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={"messages": "{{item.messages}}"},
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="task_completion",
-            evaluator_name="builtin.task_completion",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={"messages": "{{item.messages}}"},
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="conversation_coherence",
-            evaluator_name="builtin.coherence",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={"messages": "{{item.messages}}"},
-        ),
-        TestingCriterionAzureAIEvaluator(
-            type="azure_ai_evaluator",
-            name="groundedness",
-            evaluator_name="builtin.groundedness",
-            initialization_parameters={"model": model_deployment_name},
-            data_mapping={"messages": "{{item.messages}}"},
-        ),
-    ]
-
-    print("Creating trace-based evaluation group")
-    eval_object = client.evals.create(
-        name="Multi-turn Trace Evaluation (by ID)",
-        data_source_config=data_source_config,  # type: ignore
-        testing_criteria=testing_criteria,
-    )
-    print(f"Evaluation created (id: {eval_object.id})")
-
-    # Build the data source based on which IDs are provided
-    if trace_ids_str:
-        # Trace ID mode — provide W3C trace IDs (operation_Id from App Insights)
-        trace_ids = [tid.strip() for tid in trace_ids_str.split(",") if tid.strip()]
-        print(f"Using {len(trace_ids)} trace IDs")
-        data_source = {
-            "type": "azure_ai_trace_data_source_preview",
-            "trace_source": {
-                "type": "trace_id_source",
-                "trace_ids": trace_ids,
-            },
+
+    created_agent = None
+    created_conversation_ids: List[str] = []
+    eval_object = None
+
+    try:
+        # 1. Create an agent to attribute the seeded conversations to.
+        print(f"Create agent `{agent_name}` (model: `{model_deployment_name}`).")
+        created_agent = project_client.agents.create_version(
+            agent_name=agent_name,
+            definition=PromptAgentDefinition(model=model_deployment_name, instructions=AGENT_INSTRUCTIONS),
+        )
+        print(f"Agent created (id: {created_agent.id}, version: {created_agent.version}).")
+
+        # 2. Seed multi-turn conversations against the agent.
+        print(f"Seed {len(CONVERSATION_FLOWS)} multi-turn conversation(s) against the agent.")
+        for flow in CONVERSATION_FLOWS:
+            conversation = client.conversations.create()
+            created_conversation_ids.append(conversation.id)
+            print(f"  - conversation id: {conversation.id} ({len(flow)} turn(s))")
+            for turn in flow:
+                client.responses.create(
+                    conversation=conversation.id,
+                    input=turn,
+                    extra_body={"agent_reference": {"name": created_agent.name, "type": "agent_reference"}},
+                )
+
+        print(f"Wait {INITIAL_INGEST_WAIT_SECONDS}s for Application Insights to ingest the spans.", flush=True)
+        time.sleep(INITIAL_INGEST_WAIT_SECONDS)
+
+        # 3. Create the trace-based evaluation group (conversation-level evaluators).
+        data_source_config = {
+            "type": "azure_ai_source",
+            "scenario": "traces",
         }
-    else:
-        # Conversation ID mode — provide Foundry conversation IDs
-        conversation_ids = [cid.strip() for cid in conversation_ids_str.split(",") if cid.strip()]
-        if not conversation_ids:
-            raise ValueError(
-                "Set FOUNDRY_CONVERSATION_IDS or FOUNDRY_TRACE_IDS. "
-                "These are IDs from prior agent interactions captured in App Insights."
-            )
-        print(f"Using {len(conversation_ids)} conversation IDs")
+
+        testing_criteria = [
+            TestingCriterionAzureAIEvaluator(
+                type="azure_ai_evaluator",
+                name="customer_satisfaction",
+                evaluator_name="builtin.customer_satisfaction",
+                initialization_parameters={"model": model_deployment_name},
+                data_mapping={"messages": "{{item.messages}}"},
+            ),
+            TestingCriterionAzureAIEvaluator(
+                type="azure_ai_evaluator",
+                name="task_completion",
+                evaluator_name="builtin.task_completion",
+                initialization_parameters={"model": model_deployment_name},
+                data_mapping={"messages": "{{item.messages}}"},
+            ),
+            TestingCriterionAzureAIEvaluator(
+                type="azure_ai_evaluator",
+                name="conversation_coherence",
+                evaluator_name="builtin.coherence",
+                initialization_parameters={"model": model_deployment_name},
+                data_mapping={"messages": "{{item.messages}}"},
+            ),
+        ]
+
+        print("Create trace-based evaluation group.")
+        eval_object = client.evals.create(
+            name=f"Multi-turn Trace Evaluation (by ID) {run_id}",
+            data_source_config=data_source_config,  # type: ignore
+            testing_criteria=testing_criteria,
+        )
+        print(f"Evaluation created (id: {eval_object.id}).")
+
+        # 4. Submit an eval run that targets the seeded conversations by ID.
+        # Retry: ingestion delay can leave the conversations invisible to the
+        # eval service even after the initial wait.
         data_source = {
             "type": "azure_ai_trace_data_source_preview",
             "trace_source": {
                 "type": "conversation_id_source",
-                "conversation_ids": conversation_ids,
+                "conversation_ids": created_conversation_ids,
             },
         }
+        # Alternative shape (requires W3C trace IDs from App Insights):
+        #   "trace_source": {"type": "trace_id_source", "trace_ids": ["<operation_Id>", ...]}
+
+        run = None
+        for attempt in range(1, MAX_EVAL_ATTEMPTS + 1):
+            print(
+                f"Create eval run (attempt {attempt}/{MAX_EVAL_ATTEMPTS}) over "
+                f"{len(created_conversation_ids)} conversation id(s)."
+            )
+            eval_run = client.evals.runs.create(
+                eval_id=eval_object.id,
+                name=f"multiturn-trace-by-id-{run_id}-a{attempt}",
+                data_source=data_source,  # type: ignore
+                extra_body={"evaluation_level": "conversation"},
+            )
+            print(f"Eval run created (id: {eval_run.id}).")
+
+            print("Poll eval run until terminal.", end="", flush=True)
+            while True:
+                run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
+                if run.status in TERMINAL_STATUSES:
+                    break
+                time.sleep(POLL_INTERVAL_SECONDS)
+                print(".", end="", flush=True)
+            print()
+            print(f"Final run status: `{run.status}`.")
+
+            if run.status == "completed":
+                output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
+                expected_items = len(created_conversation_ids)
+                if len(output_items) >= expected_items:
+                    print(f"Run produced {len(output_items)} output item(s) (>= {expected_items} expected).")
+                    print(f"Result counts: {run.result_counts}")
+                    print(f"{'-' * 60}")
+                    pprint(output_items)
+                    print(f"{'-' * 60}")
+                    print(f"Eval run report URL: {run.report_url}")
+                    break
+                print(
+                    f"Run completed but produced {len(output_items)}/{expected_items} output items "
+                    f"(result counts: {run.result_counts}); traces likely not yet fully ingested."
+                )
+            else:
+                print(f"Run did not complete (status: `{run.status}`, error: {run.error}).")
+
+            if attempt == MAX_EVAL_ATTEMPTS:
+                raise RuntimeError(f"Eval run did not produce results after {MAX_EVAL_ATTEMPTS} attempts.")
+            print(f"Wait {RETRY_WAIT_SECONDS}s and retry.", flush=True)
+            time.sleep(RETRY_WAIT_SECONDS)
+
+    finally:
+        # Best-effort cleanup: eval object -> seeded conversations -> agent.
+        if eval_object is not None:
+            try:
+                client.evals.delete(eval_id=eval_object.id)
+                print(f"Deleted evaluation `{eval_object.id}`.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete evaluation: {exc}")
+
+        for cid in created_conversation_ids:
+            try:
+                client.conversations.delete(conversation_id=cid)
+                print(f"Deleted seeded conversation `{cid}`.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete conversation `{cid}`: {exc}")
 
-    # Create run with evaluation_level = "conversation"
-    eval_run = client.evals.runs.create(
-        eval_id=eval_object.id,
-        name="multiturn-trace-by-id-run",
-        data_source=data_source,  # type: ignore
-        extra_body={"evaluation_level": "conversation"},
-    )
-    print(f"Evaluation run created (id: {eval_run.id})")
-
-    while True:
-        run = client.evals.runs.retrieve(run_id=eval_run.id, eval_id=eval_object.id)
-        if run.status in ("completed", "failed"):
-            break
-        print(f"Waiting for eval run to complete... current status: {run.status}")
-        time.sleep(5)
-
-    if run.status == "completed":
-        print("\n✓ Evaluation run completed successfully!")
-        print(f"Result Counts: {run.result_counts}")
-
-        output_items = list(client.evals.runs.output_items.list(run_id=run.id, eval_id=eval_object.id))
-        print(f"\nOUTPUT ITEMS (Total: {len(output_items)})")
-        print(f"{'-'*60}")
-        pprint(output_items)
-        print(f"{'-'*60}")
-
-        print(f"\nEval Run Report URL: {run.report_url}")
-    else:
-        print(f"\n✗ Evaluation run failed: {run.error}")
-
-    client.evals.delete(eval_id=eval_object.id)
-    print("Evaluation deleted")
+        if created_agent is not None:
+            try:
+                project_client.agents.delete_version(
+                    agent_name=created_agent.name,
+                    agent_version=created_agent.version,
+                )
+                print(f"Deleted agent `{created_agent.name}` v{created_agent.version}.")
+            except Exception as exc:  # pylint: disable=broad-exception-caught
+                print(f"  (warning) could not delete agent: {exc}")
diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
index b311fa011822..4a5814a170f1 100644
--- a/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
+++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
@@ -123,8 +123,9 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
       get_bearer_token_provider() which is incompatible with mock credentials.
 
     External service dependencies (require additional Azure services):
-    - sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and
-      uses azure-monitor-query to fetch traces.
+    - sample_evaluations_builtin_with_traces.py: Seeds agent conversations and waits for
+      Application Insights to ingest the resulting traces before running the eval; the
+      real-time ingestion wait is not suitable for recorded playback.
     - sample_scheduled_evaluations.py: Requires Azure RBAC assignment via
       azure-mgmt-authorization and azure-mgmt-resource, AND uploads Dataset.
     - sample_human_evaluations.py: Requires Azure Application Insights (fetches
@@ -146,7 +147,7 @@ class TestSamplesEvaluations(AzureRecordedTestCase):
             samples_to_skip=[
                 "sample_evaluations_ai_assisted.py",  # Similarity evaluator returns FAILED_EXECUTION ('query' is missing)
                 "sample_evaluations_builtin_with_inline_data_oai.py",  # 401 AuthenticationError (invalid subscription key or API endpoint)
-                "sample_evaluations_builtin_with_traces.py",  # Missing required env var APPINSIGHTS_RESOURCE_ID (KeyError)
+                "sample_evaluations_builtin_with_traces.py",  # Self-contained sample seeds traces then waits for real App Insights ingestion; not suitable for playback
                 "sample_evaluations_score_model_grader_with_image.py",  # Eval fails: image inputs not supported for configured grader model
                 "sample_evaluations_score_model_grader_with_image_model_target.py",  # Eval fails: image inputs not supported for configured grader model
                 "sample_evaluations_score_model_grader_with_audio.py",  # Eval fails: audio inputs not supported for configured grader model