diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py index e8c1b0810..82e54006a 100644 --- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py +++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py @@ -1,842 +1,635 @@ -"""Tests for eval runtime span creation in _runtime.py. +"""Tests for eval runtime span creation. -Tests the spans added for eval tracing: -1. "Evaluation Set Run" - span_type: "eval_set_run" -2. "Evaluation" - span_type: "evaluation" -3. "Evaluator: {name}" - span_type: "evaluator" -4. "Evaluation output" - span_type: "evalOutput" +Verifies that running evaluations produces the correct OpenTelemetry span tree: + + "Evaluation Set Run" (span_type: "eval_set_run") + └── "Evaluation" (span_type: "evaluation") — one per eval item + └── "Evaluator: {name}" (span_type: "evaluator") — one per evaluator + └── "Evaluation output" (span.type: "evalOutput") — the score + +Every test runs the full pipeline via execute(). Only execute_runtime (the +actual agent invocation) is mocked — everything else runs for real. """ +import json import uuid -from typing import Any, Dict, List -from unittest.mock import MagicMock +from contextlib import contextmanager +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch import pytest -from opentelemetry.sdk.trace import Span +from pydantic import BaseModel from uipath.eval.evaluators import BaseEvaluator -from uipath.eval.models.evaluation_set import EvaluationItem -from uipath.eval.runtime.context import UiPathEvalContext +from uipath.eval.models import NumericEvaluationResult +from uipath.eval.models.evaluation_set import EvaluationItem, EvaluationSet +from uipath.eval.runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath.runtime.schema import UiPathRuntimeSchema + +# --- Test infrastructure --- -class MockSpanContext: - """Mock span context manager for testing span creation.""" - def __init__(self, name: str, attributes: dict[str, Any] | None): +class MockSpan: + """Mock span that captures set_attribute and set_status calls.""" + + def __init__(self, name: str, attributes: dict[str, Any] | None = None): self.name = name - self.attributes = attributes or {} - self.span = MagicMock(spec=Span) - self.span.attributes = self.attributes + self.attributes = dict(attributes) if attributes else {} + self._status = None + + def set_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value - def __enter__(self): - return self.span + def set_status(self, status: Any) -> None: + self._status = status - def __exit__(self, *args): + def __enter__(self) -> "MockSpan": + return self + + def __exit__(self, *args: Any) -> None: pass class SpanCapturingTracer: - """A tracer that captures span creations for testing.""" + """Tracer that captures all created spans for verification.""" - def __init__(self): - self.created_spans: List[Dict[str, Any]] = [] + def __init__(self) -> None: + self.captured_spans: list[MockSpan] = [] + @contextmanager def start_as_current_span( self, name: str, attributes: dict[str, Any] | None = None ): - """Capture span creation and return a mock context manager.""" - span_info = {"name": name, "attributes": attributes or {}} - self.created_spans.append(span_info) - return MockSpanContext(name, attributes) - - -class TestEvalSetRunSpan: - """Tests for the 'Evaluation Set Run' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation Set Run'.""" - # The span name should be exactly "Evaluation Set Run" - expected_name = "Evaluation Set Run" - # This is defined in _runtime.py:316 - assert expected_name == "Evaluation Set Run" - - def test_span_has_eval_set_run_span_type(self): - """Test that span_type attribute is 'eval_set_run'.""" - span_attributes = {"span_type": "eval_set_run"} - assert span_attributes["span_type"] == "eval_set_run" - - def test_span_has_output_attribute(self): - """Test that span has output attribute with score.""" - import json - - # Simulate the output attribute set by configure_eval_set_run_span - output_data = {"score": 85} - output_json = json.dumps(output_data) - - span_attributes = { - "span_type": "eval_set_run", - "output": output_json, - } - - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["score"] == 85 - assert isinstance(parsed_output["score"], int) - - def test_span_has_agent_id(self): - """Test that span has agentId metadata attribute.""" - execution_id = "exec-123" - span_attributes = { - "span_type": "eval_set_run", - "agentId": execution_id, - } - assert "agentId" in span_attributes - assert span_attributes["agentId"] == "exec-123" - - def test_span_has_agent_name(self): - """Test that span has agentName metadata attribute.""" - span_attributes = { - "span_type": "eval_set_run", - "agentName": "N/A", - } - assert "agentName" in span_attributes - assert span_attributes["agentName"] == "N/A" - - def test_span_has_input_schema(self): - """Test that span has inputSchema metadata attribute.""" - import json - - input_schema = {"type": "object", "properties": {"x": {"type": "number"}}} - span_attributes = { - "span_type": "eval_set_run", - "inputSchema": json.dumps(input_schema), - } - assert "inputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["inputSchema"]) - assert parsed_schema["type"] == "object" - - def test_span_has_output_schema(self): - """Test that span has outputSchema metadata attribute.""" - import json - - output_schema = {"type": "string"} - span_attributes = { - "span_type": "eval_set_run", - "outputSchema": json.dumps(output_schema), - } - assert "outputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["outputSchema"]) - assert parsed_schema["type"] == "string" - - def test_span_includes_eval_set_run_id_when_present(self): - """Test that eval_set_run_id is included when context has it.""" - eval_set_run_id = str(uuid.uuid4()) - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert "eval_set_run_id" in span_attributes - assert span_attributes["eval_set_run_id"] == eval_set_run_id - - def test_span_excludes_eval_set_run_id_when_not_present(self): - """Test that eval_set_run_id is not included when context doesn't have it.""" - eval_set_run_id = None - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert "eval_set_run_id" not in span_attributes - - -class TestEvaluationSpan: - """Tests for the 'Evaluation' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation'.""" - expected_name = "Evaluation" - assert expected_name == "Evaluation" - - def test_span_has_evaluation_span_type(self): - """Test that span_type attribute is 'evaluation'.""" - span_attributes = {"span_type": "evaluation"} - assert span_attributes["span_type"] == "evaluation" - - def test_span_includes_execution_id(self): - """Test that execution.id is included in the span attributes.""" - execution_id = str(uuid.uuid4()) - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - } - assert "execution.id" in span_attributes - assert span_attributes["execution.id"] == execution_id - - def test_span_includes_eval_item_id(self): - """Test that eval_item_id is included in the span attributes.""" - eval_item_id = "test-eval-item-123" - span_attributes = { - "span_type": "evaluation", - "eval_item_id": eval_item_id, - } - assert "eval_item_id" in span_attributes - assert span_attributes["eval_item_id"] == eval_item_id - - def test_span_includes_eval_item_name(self): - """Test that eval_item_name is included in the span attributes.""" - eval_item_name = "Test Evaluation Item" - span_attributes = { - "span_type": "evaluation", - "eval_item_name": eval_item_name, - } - assert "eval_item_name" in span_attributes - assert span_attributes["eval_item_name"] == eval_item_name - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the span.""" - execution_id = str(uuid.uuid4()) - eval_item_id = "eval-item-456" - eval_item_name = "My Eval Item" - - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": eval_item_name, - } - - # Verify all required attributes - required_attrs = ["execution.id", "span_type", "eval_item_id", "eval_item_name"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - def test_span_has_output_attribute(self): - """Test that span has output attribute with score.""" - import json - - # Simulate the output attribute set by configure_evaluation_span - output_data = {"score": 90} - output_json = json.dumps(output_data) - - span_attributes = { - "span_type": "evaluation", - "output": output_json, - } - - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["score"] == 90 - assert isinstance(parsed_output["score"], int) - - def test_span_has_agent_id(self): - """Test that span has agentId metadata attribute.""" - execution_id = "eval-exec-456" - span_attributes = { - "span_type": "evaluation", - "agentId": execution_id, - } - assert "agentId" in span_attributes - assert span_attributes["agentId"] == "eval-exec-456" - - def test_span_has_agent_name(self): - """Test that span has agentName metadata attribute.""" - span_attributes = { - "span_type": "evaluation", - "agentName": "N/A", - } - assert "agentName" in span_attributes - assert span_attributes["agentName"] == "N/A" - - def test_span_has_input_schema(self): - """Test that span has inputSchema metadata attribute.""" - import json - - input_schema = {"type": "object"} - span_attributes = { - "span_type": "evaluation", - "inputSchema": json.dumps(input_schema), - } - assert "inputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["inputSchema"]) - assert parsed_schema["type"] == "object" - - def test_span_has_output_schema(self): - """Test that span has outputSchema metadata attribute.""" - import json - - output_schema = {"type": "object"} - span_attributes = { - "span_type": "evaluation", - "outputSchema": json.dumps(output_schema), - } - assert "outputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["outputSchema"]) - assert parsed_schema["type"] == "object" - - -class TestEvaluatorSpan: - """Tests for the 'Evaluator: {name}' span.""" - - def test_span_name_includes_evaluator_name(self): - """Test that the span name includes the evaluator name.""" - evaluator_name = "MyEvaluator" - expected_name = f"Evaluator: {evaluator_name}" - assert expected_name == "Evaluator: MyEvaluator" - - def test_span_has_evaluator_span_type(self): - """Test that span_type attribute is 'evaluator'.""" - span_attributes = {"span_type": "evaluator"} - assert span_attributes["span_type"] == "evaluator" - - def test_span_includes_evaluator_id(self): - """Test that evaluator_id is included in the span attributes.""" - evaluator_id = "evaluator-789" - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - } - assert "evaluator_id" in span_attributes - assert span_attributes["evaluator_id"] == evaluator_id - - def test_span_includes_evaluator_name(self): - """Test that evaluator_name is included in the span attributes.""" - evaluator_name = "AccuracyEvaluator" - span_attributes = { - "span_type": "evaluator", - "evaluator_name": evaluator_name, - } - assert "evaluator_name" in span_attributes - assert span_attributes["evaluator_name"] == evaluator_name - - def test_span_includes_eval_item_id(self): - """Test that eval_item_id is included in the evaluator span.""" - eval_item_id = "eval-item-123" - span_attributes = { - "span_type": "evaluator", - "eval_item_id": eval_item_id, - } - assert "eval_item_id" in span_attributes - assert span_attributes["eval_item_id"] == eval_item_id - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the evaluator span.""" - evaluator_id = "eval-id-123" - evaluator_name = "TestEvaluator" - eval_item_id = "item-456" - - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - "evaluator_name": evaluator_name, - "eval_item_id": eval_item_id, - } - - # Verify all required attributes - required_attrs = ["span_type", "evaluator_id", "evaluator_name", "eval_item_id"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - -class TestSpanHierarchy: - """Tests verifying the span hierarchy structure.""" - - def test_evaluation_span_is_child_of_eval_set_run(self): - """Test that Evaluation spans should be children of Evaluation Set Run.""" - # This is a conceptual test - in the actual code, the Evaluation span - # is created inside the context of the Evaluation Set Run span - parent_span_type = "eval_set_run" - child_span_type = "evaluation" - - # The parent-child relationship is enforced by span context nesting - assert parent_span_type == "eval_set_run" - assert child_span_type == "evaluation" - - def test_evaluator_span_is_child_of_evaluation(self): - """Test that Evaluator spans should be children of Evaluation.""" - # This is a conceptual test - in the actual code, the Evaluator span - # is created inside the context of the Evaluation span - parent_span_type = "evaluation" - child_span_type = "evaluator" - - assert parent_span_type == "evaluation" - assert child_span_type == "evaluator" - - -class TestSpanAttributeValues: - """Tests for span attribute value formatting.""" - - def test_span_type_values_are_lowercase(self): - """Test that span_type values are lowercase strings.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] - - for span_type in span_types: - assert span_type == span_type.lower() - # All span types should be lowercase without hyphens - assert "-" not in span_type - - def test_execution_id_is_valid_uuid(self): - """Test that execution.id is a valid UUID string.""" - execution_id = str(uuid.uuid4()) - - # Verify it can be parsed back as a UUID - parsed_uuid = uuid.UUID(execution_id) - assert str(parsed_uuid) == execution_id - - def test_evaluator_span_name_format(self): - """Test the evaluator span name format.""" - evaluator_names = [ - "Accuracy", - "Relevance", - "Fluency", - "Custom Evaluator", + mock_span = MockSpan(name, attributes) + self.captured_spans.append(mock_span) + yield mock_span + + def get_spans_by_type(self, span_type: str) -> list[MockSpan]: + return [ + s + for s in self.captured_spans + if s.attributes.get("span_type") == span_type ] - for name in evaluator_names: - span_name = f"Evaluator: {name}" - assert span_name.startswith("Evaluator: ") - assert name in span_name - - -class TestEvalContextIntegration: - """Tests for UiPathEvalContext integration with spans.""" - - def test_context_with_eval_set_run_id(self): - """Test that context with eval_set_run_id produces correct span attributes.""" - context = UiPathEvalContext() - context.eval_set_run_id = "run-123" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if context.eval_set_run_id: - span_attributes["eval_set_run_id"] = context.eval_set_run_id - - assert span_attributes["eval_set_run_id"] == "run-123" - - def test_context_without_eval_set_run_id(self): - """Test that context without eval_set_run_id produces correct span attributes.""" - context = UiPathEvalContext() - context.eval_set_run_id = None - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if context.eval_set_run_id: - span_attributes["eval_set_run_id"] = context.eval_set_run_id - - assert "eval_set_run_id" not in span_attributes - - -class TestSpanCreationLogic: - """Tests for the span creation logic in runtime methods.""" - - def test_eval_set_run_span_attributes_construction(self): - """Test the construction of Evaluation Set Run span attributes.""" - eval_set_run_id = "test-run-id" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert span_attributes == { - "span_type": "eval_set_run", - "eval_set_run_id": "test-run-id", - } - - def test_evaluation_span_attributes_construction(self): - """Test the construction of Evaluation span attributes.""" - execution_id = "exec-123" - eval_item_id = "item-456" - eval_item_name = "Test Item" - - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": eval_item_name, - } - - assert span_attributes["execution.id"] == "exec-123" - assert span_attributes["span_type"] == "evaluation" - assert span_attributes["eval_item_id"] == "item-456" - assert span_attributes["eval_item_name"] == "Test Item" - - def test_evaluator_span_attributes_construction(self): - """Test the construction of Evaluator span attributes.""" - evaluator_id = "eval-123" - evaluator_name = "AccuracyEvaluator" - eval_item_id = "item-789" - - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - "evaluator_name": evaluator_name, - "eval_item_id": eval_item_id, - } - - assert span_attributes["span_type"] == "evaluator" - assert span_attributes["evaluator_id"] == "eval-123" - assert span_attributes["evaluator_name"] == "AccuracyEvaluator" - assert span_attributes["eval_item_id"] == "item-789" - - def test_evaluator_span_name_construction(self): - """Test the construction of Evaluator span name.""" - evaluator_name = "RelevanceEvaluator" - span_name = f"Evaluator: {evaluator_name}" - - assert span_name == "Evaluator: RelevanceEvaluator" - - -class TestEvalItemSpanAttributes: - """Tests for eval item attributes in spans.""" - - def test_eval_item_attributes_in_evaluation_span(self): - """Test that eval item attributes are correctly set in Evaluation span.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "item-id-123" - eval_item.name = "Test Evaluation" + def get_spans_by_attr(self, key: str, value: str) -> list[MockSpan]: + return [s for s in self.captured_spans if s.attributes.get(key) == value] + + def get_span_by_name(self, name: str) -> MockSpan | None: + for span in self.captured_spans: + if span.name == name: + return span + return None + + +def make_mock_execution_output( + output: dict[str, Any] | None = None, + error: Any = None, + status: str = "successful", +) -> MagicMock: + """Create a mock execution output from execute_runtime.""" + mock = MagicMock() + mock.result.output = output or {"result": "ok"} + mock.result.error = error + mock.result.status = status + mock.result.trigger = None + mock.result.triggers = None + mock.spans = [] + mock.logs = [] + mock.execution_time = 1.0 + return mock + + +def make_evaluator( + name: str = "MockEvaluator", + evaluator_id: str = "mock-evaluator", + score: float = 0.95, + details: Any = None, +) -> MagicMock: + """Create a mock evaluator that returns a fixed score.""" + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = evaluator_id + evaluator.name = name + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=score, details=details) + ) + # reduce_scores is called by compute_evaluator_scores to aggregate across items + evaluator.reduce_scores = lambda results: ( + sum(r.score for r in results) / len(results) if results else 0.0 + ) + return evaluator + + +def make_eval_item( + item_id: str = "eval-item-default", + name: str = "Default Test Case", + inputs: dict[str, Any] | None = None, + evaluation_criterias: dict[str, Any] | None = None, +) -> EvaluationItem: + """Create an EvaluationItem for testing.""" + return EvaluationItem( + id=item_id, + name=name, + inputs=inputs or {}, + evaluation_criterias=evaluation_criterias or {}, + ) + + +async def run_evaluation( + eval_items: list[EvaluationItem], + evaluators: list[MagicMock], + execution_output: MagicMock | None = None, + **context_kwargs: Any, +) -> tuple[SpanCapturingTracer, UiPathEvalRuntime]: + """Run execute() through the full pipeline and return the tracer + runtime. + + Sets up the runtime with real eval items and evaluators, mocks only + execute_runtime, and runs execute() to completion. + """ + tracer = SpanCapturingTracer() + + mock_trace_manager = MagicMock() + mock_trace_manager.tracer_provider.get_tracer.return_value = tracer + mock_trace_manager.tracer_span_processors = [] + + mock_factory = MagicMock() + mock_factory.new_runtime = AsyncMock(return_value=AsyncMock()) + + mock_event_bus = MagicMock() + mock_event_bus.publish = AsyncMock() + + context = UiPathEvalContext() + context.execution_id = context_kwargs.pop( + "execution_id", str(uuid.uuid4()) + ) + context.runtime_schema = context_kwargs.pop( + "runtime_schema", + UiPathRuntimeSchema( + filePath="test.py", + uniqueId="test", + type="workflow", + input={"type": "object", "properties": {"x": {"type": "number"}}}, + output={"type": "object", "properties": {}}, + ), + ) + context.evaluation_set = EvaluationSet( + id="test-set", + name="Test Eval Set", + evaluations=eval_items, + ) + context.evaluators = evaluators + + for key, value in context_kwargs.items(): + setattr(context, key, value) + + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=execution_output or make_mock_execution_output()), + ): + await runtime.execute() - span_attributes = { - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item.id, - "eval_item_name": eval_item.name, - } + return tracer, runtime - assert span_attributes["eval_item_id"] == "item-id-123" - assert span_attributes["eval_item_name"] == "Test Evaluation" - def test_eval_item_id_in_evaluator_span(self): - """Test that eval_item_id is included in Evaluator span.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "item-id-456" +# --- Test classes --- - span_attributes = { - "span_type": "evaluator", - "evaluator_id": "evaluator-123", - "evaluator_name": "TestEvaluator", - "eval_item_id": eval_item.id, - } - assert span_attributes["eval_item_id"] == "item-id-456" +class TestEvalSetRunSpan: + """Tests for the top-level 'Evaluation Set Run' span produced by execute().""" + + @pytest.mark.asyncio + async def test_span_created_with_correct_name_and_type(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + spans = tracer.get_spans_by_type("eval_set_run") + assert len(spans) == 1 + assert spans[0].name == "Evaluation Set Run" + assert spans[0].attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_aggregate_scores_from_multiple_items(self) -> None: + """Scores are averaged across all eval items and written to the span.""" + evaluator = make_evaluator(score=0.8) + items = [ + make_eval_item( + item_id="calculator-addition", + name="Calculator addition test", + evaluation_criterias={"mock-evaluator": {}}, + ), + make_eval_item( + item_id="calculator-subtraction", + name="Calculator subtraction test", + evaluation_criterias={"mock-evaluator": {}}, + ), + ] + tracer, _ = await run_evaluation(items, [evaluator]) + + span = tracer.get_spans_by_type("eval_set_run")[0] + output = json.loads(span.attributes["output"]) + assert output["scores"]["MockEvaluator"] == 80.0 # 0.8 -> 80.0 + + @pytest.mark.asyncio + async def test_metadata_attributes(self) -> None: + evaluator = make_evaluator(score=1.0) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation( + [item], [evaluator], execution_id="eval-set-run-exec-123" + ) + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span.attributes["agentId"] == "eval-set-run-exec-123" + assert span.attributes["agentName"] == "N/A" -class TestSpanTypeConsistency: - """Tests for span type value consistency.""" + input_schema = json.loads(span.attributes["inputSchema"]) + assert input_schema["type"] == "object" + assert "x" in input_schema["properties"] - def test_all_span_types_are_strings(self): - """Test that all span_type values are strings.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] + output_schema = json.loads(span.attributes["outputSchema"]) + assert output_schema["type"] == "object" - for span_type in span_types: - assert isinstance(span_type, str) + @pytest.mark.asyncio + async def test_span_has_ok_status_on_success(self) -> None: + from opentelemetry.trace import StatusCode - def test_span_types_use_snake_case(self): - """Test that span_type values use snake_case naming.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - for span_type in span_types: - # No uppercase letters - assert span_type == span_type.lower() - # No hyphens - assert "-" not in span_type + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.OK - def test_span_type_values_match_expected(self): - """Test that span_type values match expected values from _runtime.py.""" - expected_span_types = { - "Evaluation Set Run": "eval_set_run", - "Evaluation": "evaluation", - "Evaluator": "evaluator", - } + @pytest.mark.asyncio + async def test_eval_set_run_id_included_when_provided(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation( + [item], [evaluator], eval_set_run_id="run-2024-04-14-001" + ) - for _, span_type in expected_span_types.items(): - assert isinstance(span_type, str) - assert span_type.islower() or "_" in span_type - - -class TestRunEvaluatorSpan: - """Tests specifically for the run_evaluator span creation.""" - - @pytest.fixture - def mock_evaluator(self): - """Create a mock evaluator for testing.""" - evaluator = MagicMock(spec=BaseEvaluator) - evaluator.id = "test-evaluator-id" - evaluator.name = "TestEvaluator" - return evaluator - - @pytest.fixture - def mock_eval_item(self): - """Create a mock eval item for testing.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "test-item-id" - eval_item.name = "Test Item" - eval_item.inputs = {"query": "test query"} - eval_item.expected_agent_behavior = "Expected behavior" - return eval_item - - def test_evaluator_span_name_uses_evaluator_name(self, mock_evaluator): - """Test that evaluator span name uses the evaluator's name.""" - span_name = f"Evaluator: {mock_evaluator.name}" - assert span_name == "Evaluator: TestEvaluator" - - def test_evaluator_span_includes_evaluator_details( - self, mock_evaluator, mock_eval_item - ): - """Test that evaluator span includes all evaluator details.""" - span_attributes = { - "span_type": "evaluator", - "evaluator_id": mock_evaluator.id, - "evaluator_name": mock_evaluator.name, - "eval_item_id": mock_eval_item.id, - } + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span.attributes["eval_set_run_id"] == "run-2024-04-14-001" - assert span_attributes["evaluator_id"] == "test-evaluator-id" - assert span_attributes["evaluator_name"] == "TestEvaluator" - assert span_attributes["eval_item_id"] == "test-item-id" + @pytest.mark.asyncio + async def test_eval_set_run_id_excluded_when_not_provided(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + span = tracer.get_spans_by_type("eval_set_run")[0] + assert "eval_set_run_id" not in span.attributes -class TestExecutionIdPropagation: - """Tests for execution.id propagation in spans.""" - def test_execution_id_format(self): - """Test that execution.id is in valid UUID format.""" - execution_id = str(uuid.uuid4()) +class TestEvaluationSpan: + """Tests for the 'Evaluation' span — one per eval item.""" + + @pytest.mark.asyncio + async def test_one_span_per_eval_item(self) -> None: + evaluator = make_evaluator(score=0.9) + items = [ + make_eval_item( + item_id="greeting-english", + name="English greeting test", + evaluation_criterias={"mock-evaluator": {}}, + ), + make_eval_item( + item_id="greeting-spanish", + name="Spanish greeting test", + evaluation_criterias={"mock-evaluator": {}}, + ), + ] + tracer, _ = await run_evaluation(items, [evaluator]) + + spans = tracer.get_spans_by_type("evaluation") + assert len(spans) == 2 + + @pytest.mark.asyncio + async def test_span_has_eval_item_attributes(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item( + item_id="sentiment-positive-review", + name="Positive review sentiment check", + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span.attributes["eval_item_id"] == "sentiment-positive-review" + assert span.attributes["eval_item_name"] == "Positive review sentiment check" + assert span.attributes["execution.id"] == "sentiment-positive-review" + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_span_configured_with_per_item_scores(self) -> None: + evaluator = make_evaluator(score=0.85) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + output = json.loads(span.attributes["output"]) + assert "scores" in output + assert "MockEvaluator" in output["scores"] + + @pytest.mark.asyncio + async def test_span_has_metadata(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item( + item_id="capital-city-query", + inputs={"query": "What is the capital of France?"}, + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + # agentId is set to the eval item's execution_id (== str(eval_item.id)) + assert span.attributes["agentId"] == "capital-city-query" + assert span.attributes["agentName"] == "N/A" + + @pytest.mark.asyncio + async def test_span_has_input_data(self) -> None: + """The evaluation span carries the eval item's inputs as serialized JSON.""" + evaluator = make_evaluator(score=0.9) + item = make_eval_item( + inputs={"query": "Summarize this document", "max_tokens": 100}, + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert "input" in span.attributes + input_data = json.loads(span.attributes["input"]) + assert input_data["query"] == "Summarize this document" + assert input_data["max_tokens"] == 100 + + @pytest.mark.asyncio + async def test_span_has_error_status_when_agent_fails(self) -> None: + """When the agent returns an error, the evaluation span gets ERROR status.""" + from opentelemetry.trace import StatusCode + + evaluator = make_evaluator(score=0.0) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + error_output = make_mock_execution_output( + error="Agent failed: connection timeout", + ) + tracer, _ = await run_evaluation( + [item], [evaluator], execution_output=error_output + ) - # Verify it's a valid UUID - try: - uuid.UUID(execution_id) - valid = True - except ValueError: - valid = False + span = tracer.get_spans_by_type("evaluation")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.ERROR - assert valid + @pytest.mark.asyncio + async def test_span_has_ok_status_on_success(self) -> None: + """When the agent succeeds, the evaluation span gets OK status.""" + from opentelemetry.trace import StatusCode - def test_execution_id_is_unique_per_eval(self): - """Test that each eval gets a unique execution_id.""" - execution_ids = [str(uuid.uuid4()) for _ in range(5)] + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - # All should be unique - assert len(set(execution_ids)) == 5 + span = tracer.get_spans_by_type("evaluation")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.OK - def test_evaluation_span_has_execution_id(self): - """Test that Evaluation span includes execution.id.""" - execution_id = str(uuid.uuid4()) - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": "item-123", - "eval_item_name": "Test Item", - } +class TestEvaluatorSpan: + """Tests for the 'Evaluator: {name}' span — one per evaluator per item.""" + + @pytest.mark.asyncio + async def test_span_has_correct_name_and_attributes(self) -> None: + evaluator = make_evaluator(score=0.9) + item = make_eval_item( + item_id="multi-step-tool-use", + name="Multi-step tool use test case", + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 1 + span = spans[0] + assert span.name == "Evaluator: MockEvaluator" + assert span.attributes["evaluator_id"] == "mock-evaluator" + assert span.attributes["evaluator_name"] == "MockEvaluator" + assert span.attributes["eval_item_id"] == "multi-step-tool-use" + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_multiple_evaluators_produce_multiple_spans(self) -> None: + evaluators = [ + make_evaluator( + name="MockEvaluatorA", evaluator_id="mock-evaluator-a", score=0.9 + ), + make_evaluator( + name="MockEvaluatorB", evaluator_id="mock-evaluator-b", score=0.8 + ), + make_evaluator( + name="MockEvaluatorC", evaluator_id="mock-evaluator-c", score=0.7 + ), + ] + item = make_eval_item( + evaluation_criterias={ + "mock-evaluator-a": {}, + "mock-evaluator-b": {}, + "mock-evaluator-c": {}, + } + ) + tracer, _ = await run_evaluation([item], evaluators) + + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 3 + span_names = {s.name for s in spans} + assert span_names == { + "Evaluator: MockEvaluatorA", + "Evaluator: MockEvaluatorB", + "Evaluator: MockEvaluatorC", + } + + @pytest.mark.asyncio + async def test_multiple_items_each_get_evaluator_spans(self) -> None: + evaluator = make_evaluator(score=0.9) + items = [ + make_eval_item( + item_id="invoice-extraction", + name="Invoice data extraction", + evaluation_criterias={"mock-evaluator": {}}, + ), + make_eval_item( + item_id="receipt-extraction", + name="Receipt data extraction", + evaluation_criterias={"mock-evaluator": {}}, + ), + ] + tracer, _ = await run_evaluation(items, [evaluator]) - assert "execution.id" in span_attributes - assert span_attributes["execution.id"] == execution_id + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 2 + item_ids = {s.attributes["eval_item_id"] for s in spans} + assert item_ids == {"invoice-extraction", "receipt-extraction"} class TestEvaluationOutputSpan: - """Tests for the 'Evaluation output' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation output'.""" - expected_name = "Evaluation output" - assert expected_name == "Evaluation output" - - def test_span_has_eval_output_span_type(self): - """Test that span_type attribute is 'evalOutput'.""" - span_attributes = {"span.type": "evalOutput"} - assert span_attributes["span.type"] == "evalOutput" - - def test_span_includes_value(self): - """Test that value (score) is included in the span attributes.""" - score = 0.95 - span_attributes = { - "span.type": "evalOutput", - "value": score, - } - assert "value" in span_attributes - assert span_attributes["value"] == 0.95 - - def test_span_includes_evaluator_id(self): - """Test that evaluatorId is included in the span attributes.""" - evaluator_id = "evaluator-123" - span_attributes = { - "span.type": "evalOutput", - "evaluatorId": evaluator_id, - } - assert "evaluatorId" in span_attributes - assert span_attributes["evaluatorId"] == evaluator_id - - def test_span_includes_justification(self): - """Test that justification is included in the span attributes.""" - justification = "The output matches expected behavior." - span_attributes = { - "span.type": "evalOutput", - "justification": justification, - } - assert "justification" in span_attributes - assert span_attributes["justification"] == justification - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the span.""" - evaluator_id = "eval-id-123" - score = 100 - justification = "Perfect match" - - span_attributes = { - "span.type": "evalOutput", - "value": score, - "evaluatorId": evaluator_id, - "justification": justification, - } - - # Verify all required attributes - required_attrs = ["span.type", "value", "evaluatorId", "justification"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - def test_span_has_openinference_kind(self): - """Test that openinference.span.kind is set to CHAIN.""" - span_attributes = { - "openinference.span.kind": "CHAIN", - "span.type": "evalOutput", - } - assert span_attributes["openinference.span.kind"] == "CHAIN" - - def test_span_has_output_attribute_with_type_value_justification(self): - """Test that span has output attribute with type, value, and justification.""" - import json - - # Simulate the output attribute set by set_evaluation_output_span_output - output_data = { - "type": 1, - "value": 0.92, - "justification": "The outputs are semantically equivalent", - } - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } + """Tests for the 'Evaluation output' span — the evaluator's score.""" + + @pytest.mark.asyncio + async def test_span_created_with_correct_attributes(self) -> None: + evaluator = make_evaluator(score=0.75) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + output_spans = tracer.get_spans_by_attr("span.type", "evalOutput") + assert len(output_spans) == 1 + span = output_spans[0] + assert span.name == "Evaluation output" + assert span.attributes["value"] == 0.75 + assert span.attributes["evaluatorId"] == "mock-evaluator" + assert span.attributes["openinference.span.kind"] == "CHAIN" + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_output_json_has_normalized_score_and_type(self) -> None: + evaluator = make_evaluator(score=0.85) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + output = json.loads(span.attributes["output"]) + assert output["type"] == 1 + assert output["score"] == 85.0 # 0.85 normalized to 0-100 + assert output["evaluatorId"] == "mock-evaluator" + + @pytest.mark.asyncio + async def test_justification_from_pydantic_details(self) -> None: + class MockEvaluatorDetails(BaseModel): + justification: str + similarity_score: float = 0.0 + + details = MockEvaluatorDetails( + justification="Agent output is semantically equivalent to expected output", + similarity_score=0.92, + ) + evaluator = make_evaluator(score=0.92, details=details) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - assert parsed_output["value"] == 0.92 + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] assert ( - parsed_output["justification"] == "The outputs are semantically equivalent" + span.attributes["justification"] + == "Agent output is semantically equivalent to expected output" + ) + # justification also appears in the output JSON + output = json.loads(span.attributes["output"]) + assert ( + output["justification"] + == "Agent output is semantically equivalent to expected output" ) - def test_span_output_type_is_always_one(self): - """Test that output type field is always 1.""" - import json - - output_data = {"type": 1, "value": 0.5} - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } - - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - - def test_span_output_without_justification(self): - """Test that output can be set without justification field.""" - import json - - # When justification is None, it should be excluded from output - output_data = {"type": 1, "value": 0.75} - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } - - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - assert parsed_output["value"] == 0.75 - assert "justification" not in parsed_output - - -class TestEvaluationOutputSpanHierarchy: - """Tests verifying the Evaluation output span hierarchy.""" - - def test_eval_output_is_child_of_evaluator(self): - """Test that Evaluation output spans should be children of Evaluator spans.""" - parent_span_type = "evaluator" - child_span_type = "evalOutput" - - # The parent-child relationship is enforced by span context nesting - assert parent_span_type == "evaluator" - assert child_span_type == "evalOutput" - - def test_eval_output_span_attributes_construction(self): - """Test the construction of Evaluation output span attributes.""" - evaluator_id = "eval-123" - score = 0.85 - justification = "Good accuracy" - - span_attributes = { - "openinference.span.kind": "CHAIN", - "span.type": "evalOutput", - "value": score, - "evaluatorId": evaluator_id, - "justification": justification, - } + @pytest.mark.asyncio + async def test_justification_from_string_details(self) -> None: + evaluator = make_evaluator( + score=0.8, + details="Output correctly classified as positive sentiment", + ) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - assert span_attributes["openinference.span.kind"] == "CHAIN" - assert span_attributes["span.type"] == "evalOutput" - assert span_attributes["value"] == 0.85 - assert span_attributes["evaluatorId"] == "eval-123" - assert span_attributes["justification"] == "Good accuracy" - - -class TestEvaluationOutputSpanCreation: - """Tests for Evaluation output span creation in progress reporter.""" - - def test_eval_output_span_name_is_evaluation_output(self): - """Test that the span name is exactly 'Evaluation output'.""" - span_name = "Evaluation output" - assert span_name == "Evaluation output" - - def test_eval_output_span_type_is_camel_case(self): - """Test that span.type uses camelCase: evalOutput.""" - span_type = "evalOutput" - assert span_type == "evalOutput" - # First letter lowercase, second word capitalized - assert span_type[0].islower() - assert "Output" in span_type - - def test_eval_output_with_pydantic_details(self): - """Test that justification is extracted from Pydantic model details.""" - # Simulate Pydantic model details with justification field - details_dict = { - "justification": "The semantic similarity is perfect.", - "other_field": "some value", - } + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert ( + span.attributes["justification"] + == "Output correctly classified as positive sentiment" + ) - # Extract justification like the code does - justification = details_dict.get("justification", str(details_dict)) - assert justification == "The semantic similarity is perfect." + @pytest.mark.asyncio + async def test_no_justification_when_no_details(self) -> None: + evaluator = make_evaluator(score=1.0, details=None) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - def test_eval_output_with_string_details(self): - """Test that string details are used as justification directly.""" - details = "Good accuracy on all test cases" + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert "justification" not in span.attributes + # justification also absent from output JSON (exclude_none=True) + output = json.loads(span.attributes["output"]) + assert "justification" not in output - # String details are used directly - justification = str(details) - assert justification == "Good accuracy on all test cases" - def test_eval_output_without_justification_field(self): - """Test fallback when details dict has no justification field.""" - import json +class TestSpanHierarchy: + """Tests that the full span tree is produced in the correct structure.""" + + @pytest.mark.asyncio + async def test_full_span_tree(self) -> None: + """One item + one evaluator produces all four span types.""" + evaluator = make_evaluator(score=0.95) + item = make_eval_item( + item_id="booking-flow-happy-path", + name="Booking flow happy path", + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + assert len(tracer.get_spans_by_type("eval_set_run")) == 1 + assert len(tracer.get_spans_by_type("evaluation")) == 1 + assert len(tracer.get_spans_by_type("evaluator")) == 1 + assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 1 + + @pytest.mark.asyncio + async def test_span_ordering(self) -> None: + """Spans are created in the correct order: parent before child.""" + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + names = [s.name for s in tracer.captured_spans] + assert names.index("Evaluation Set Run") < names.index("Evaluation") + assert names.index("Evaluation") < names.index( + "Evaluator: MockEvaluator" + ) + assert names.index("Evaluator: MockEvaluator") < names.index( + "Evaluation output" + ) - details_dict: dict[str, float] = { - "accuracy": 0.95, - "precision": 0.92, - } + @pytest.mark.asyncio + async def test_multiple_items_and_evaluators(self) -> None: + """Two items x two evaluators produces the expected span counts.""" + evaluators = [ + make_evaluator( + name="MockEvaluatorA", evaluator_id="mock-evaluator-a", score=0.9 + ), + make_evaluator( + name="MockEvaluatorB", evaluator_id="mock-evaluator-b", score=0.8 + ), + ] + items = [ + make_eval_item( + item_id="api-response-format", + name="API response format validation", + evaluation_criterias={ + "mock-evaluator-a": {}, + "mock-evaluator-b": {}, + }, + ), + make_eval_item( + item_id="error-handling-graceful", + name="Graceful error handling check", + evaluation_criterias={ + "mock-evaluator-a": {}, + "mock-evaluator-b": {}, + }, + ), + ] + tracer, _ = await run_evaluation(items, evaluators) - # Should fall back to JSON dump of entire details - # Since there's no "justification" key, we get the default JSON string - justification = json.dumps(details_dict) - assert "accuracy" in justification - assert "0.95" in justification + assert len(tracer.get_spans_by_type("eval_set_run")) == 1 + assert len(tracer.get_spans_by_type("evaluation")) == 2 + assert len(tracer.get_spans_by_type("evaluator")) == 4 # 2 items x 2 evaluators + assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 4