From 2b49c7b8d1265f6b65efb0c2bb9e4525efb85b63 Mon Sep 17 00:00:00 2001 From: Scott Florentino Date: Tue, 14 Apr 2026 06:24:00 -0700 Subject: [PATCH 1/7] test: rewrite eval runtime span tests to exercise real code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All ~60 tests in test_eval_runtime_spans.py were tautological — they constructed dicts inline and asserted against those same dicts without ever calling production code (e.g. `assert {"span_type": "eval_set_run"}["span_type"] == "eval_set_run"`). Replace with 24 tests that inject a SpanCapturingTracer into the runtime, call the real methods (execute, _execute_eval, run_evaluator), and assert on the captured spans. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../tests/cli/eval/test_eval_runtime_spans.py | 1448 ++++++++--------- 1 file changed, 657 insertions(+), 791 deletions(-) diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py index e8c1b0810..0873b2ae7 100644 --- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py +++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py @@ -1,842 +1,708 @@ -"""Tests for eval runtime span creation in _runtime.py. +"""Tests for eval runtime span creation. -Tests the spans added for eval tracing: -1. "Evaluation Set Run" - span_type: "eval_set_run" -2. "Evaluation" - span_type: "evaluation" -3. "Evaluator: {name}" - span_type: "evaluator" -4. "Evaluation output" - span_type: "evalOutput" +Verifies that the eval runtime methods produce the correct OpenTelemetry spans: +1. "Evaluation Set Run" - span_type: "eval_set_run" (from execute()) +2. "Evaluation" - span_type: "evaluation" (from _execute_eval()) +3. "Evaluator: {name}" - span_type: "evaluator" (from run_evaluator()) +4. 
"Evaluation output" - span_type: "evalOutput" (from run_evaluator()) """ +import json import uuid -from typing import Any, Dict, List -from unittest.mock import MagicMock +from contextlib import contextmanager +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch import pytest -from opentelemetry.sdk.trace import Span +from pydantic import BaseModel from uipath.eval.evaluators import BaseEvaluator -from uipath.eval.models.evaluation_set import EvaluationItem -from uipath.eval.runtime.context import UiPathEvalContext +from uipath.eval.models import NumericEvaluationResult +from uipath.eval.models.evaluation_set import EvaluationItem, EvaluationSet +from uipath.eval.runtime import UiPathEvalContext, UiPathEvalRuntime +from uipath.runtime.schema import UiPathRuntimeSchema -class MockSpanContext: - """Mock span context manager for testing span creation.""" +# --- Test infrastructure --- - def __init__(self, name: str, attributes: dict[str, Any] | None): + +class MockSpan: + """Mock span that captures set_attribute and set_status calls.""" + + def __init__(self, name: str, attributes: dict[str, Any] | None = None): self.name = name - self.attributes = attributes or {} - self.span = MagicMock(spec=Span) - self.span.attributes = self.attributes + self.attributes = dict(attributes) if attributes else {} + self._status = None + + def set_attribute(self, key: str, value: Any) -> None: + self.attributes[key] = value - def __enter__(self): - return self.span + def set_status(self, status: Any) -> None: + self._status = status - def __exit__(self, *args): + def __enter__(self) -> "MockSpan": + return self + + def __exit__(self, *args: Any) -> None: pass class SpanCapturingTracer: - """A tracer that captures span creations for testing.""" + """Tracer that captures all created spans for verification.""" - def __init__(self): - self.created_spans: List[Dict[str, Any]] = [] + def __init__(self) -> None: + self.captured_spans: list[MockSpan] = [] + @contextmanager def start_as_current_span( self, name: str, attributes: dict[str, Any] | None = None ): - """Capture span creation and return a mock context manager.""" - span_info = {"name": name, "attributes": attributes or {}} - self.created_spans.append(span_info) - return MockSpanContext(name, attributes) + mock_span = MockSpan(name, attributes) + self.captured_spans.append(mock_span) + yield mock_span + + def get_spans_by_type(self, span_type: str) -> list[MockSpan]: + return [ + s + for s in self.captured_spans + if s.attributes.get("span_type") == span_type + ] + + def get_spans_by_attr(self, key: str, value: str) -> list[MockSpan]: + return [s for s in self.captured_spans if s.attributes.get(key) == value] + + def get_span_by_name(self, name: str) -> MockSpan | None: + for span in self.captured_spans: + if span.name == name: + return span + return None + + +def create_eval_context(**kwargs: Any) -> UiPathEvalContext: + """Create UiPathEvalContext with sensible defaults, overridable via kwargs.""" + context = UiPathEvalContext() + + if "execution_id" not in kwargs: + context.execution_id = str(uuid.uuid4()) + if "runtime_schema" not in kwargs: + context.runtime_schema = UiPathRuntimeSchema( + filePath="test.py", + uniqueId="test", + type="workflow", + input={"type": "object", "properties": {}}, + output={"type": "object", "properties": {}}, + ) + if "evaluation_set" not in kwargs: + context.evaluation_set = EvaluationSet( + id="test-eval-set", + name="Test Evaluation Set", + evaluations=[], + ) + if "evaluators" not in kwargs: + 
context.evaluators = [] + + for key, value in kwargs.items(): + setattr(context, key, value) + + return context + + +def make_mock_execution_output( + output: dict[str, Any] | None = None, + error: Any = None, + status: str = "successful", +) -> MagicMock: + """Create a mock execution output from execute_runtime.""" + mock = MagicMock() + mock.result.output = output or {"result": "ok"} + mock.result.error = error + mock.result.status = status + mock.result.trigger = None + mock.result.triggers = None + mock.spans = [] + mock.logs = [] + mock.execution_time = 1.0 + return mock + + +def make_runtime( + capturing_tracer: SpanCapturingTracer, **context_kwargs: Any +) -> UiPathEvalRuntime: + """Create a UiPathEvalRuntime wired to a SpanCapturingTracer.""" + mock_trace_manager = MagicMock() + mock_trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer + mock_trace_manager.tracer_span_processors = [] + + mock_factory = MagicMock() + mock_factory.new_runtime = AsyncMock(return_value=AsyncMock()) + + mock_event_bus = MagicMock() + mock_event_bus.publish = AsyncMock() + + context = create_eval_context(**context_kwargs) + + return UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) + + +def make_evaluator( + name: str = "AccuracyEvaluator", + evaluator_id: str = "accuracy-eval", + score: float = 0.95, + details: Any = None, +) -> MagicMock: + """Create a mock evaluator that returns a fixed score.""" + evaluator = MagicMock(spec=BaseEvaluator) + evaluator.id = evaluator_id + evaluator.name = name + evaluator.validate_and_evaluate_criteria = AsyncMock( + return_value=NumericEvaluationResult(score=score, details=details) + ) + return evaluator + + +def make_eval_item( + item_id: str = "item-123", + name: str = "Test Evaluation", + inputs: dict[str, Any] | None = None, + evaluation_criterias: dict[str, Any] | None = None, +) -> EvaluationItem: + """Create an EvaluationItem for testing.""" + return EvaluationItem( + id=item_id, + name=name, + inputs=inputs or {}, + evaluation_criterias=evaluation_criterias or {}, + ) + + +# --- Test classes --- class TestEvalSetRunSpan: - """Tests for the 'Evaluation Set Run' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation Set Run'.""" - # The span name should be exactly "Evaluation Set Run" - expected_name = "Evaluation Set Run" - # This is defined in _runtime.py:316 - assert expected_name == "Evaluation Set Run" - - def test_span_has_eval_set_run_span_type(self): - """Test that span_type attribute is 'eval_set_run'.""" - span_attributes = {"span_type": "eval_set_run"} - assert span_attributes["span_type"] == "eval_set_run" - - def test_span_has_output_attribute(self): - """Test that span has output attribute with score.""" - import json - - # Simulate the output attribute set by configure_eval_set_run_span - output_data = {"score": 85} - output_json = json.dumps(output_data) - - span_attributes = { - "span_type": "eval_set_run", - "output": output_json, - } - - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["score"] == 85 - assert isinstance(parsed_output["score"], int) - - def test_span_has_agent_id(self): - """Test that span has agentId metadata attribute.""" - execution_id = "exec-123" - span_attributes = { - "span_type": "eval_set_run", - "agentId": execution_id, - } - assert "agentId" in span_attributes - assert span_attributes["agentId"] == "exec-123" - - def 
test_span_has_agent_name(self): - """Test that span has agentName metadata attribute.""" - span_attributes = { - "span_type": "eval_set_run", - "agentName": "N/A", - } - assert "agentName" in span_attributes - assert span_attributes["agentName"] == "N/A" - - def test_span_has_input_schema(self): - """Test that span has inputSchema metadata attribute.""" - import json - - input_schema = {"type": "object", "properties": {"x": {"type": "number"}}} - span_attributes = { - "span_type": "eval_set_run", - "inputSchema": json.dumps(input_schema), - } - assert "inputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["inputSchema"]) - assert parsed_schema["type"] == "object" - - def test_span_has_output_schema(self): - """Test that span has outputSchema metadata attribute.""" - import json - - output_schema = {"type": "string"} - span_attributes = { - "span_type": "eval_set_run", - "outputSchema": json.dumps(output_schema), - } - assert "outputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["outputSchema"]) - assert parsed_schema["type"] == "string" - - def test_span_includes_eval_set_run_id_when_present(self): - """Test that eval_set_run_id is included when context has it.""" - eval_set_run_id = str(uuid.uuid4()) - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert "eval_set_run_id" in span_attributes - assert span_attributes["eval_set_run_id"] == eval_set_run_id - - def test_span_excludes_eval_set_run_id_when_not_present(self): - """Test that eval_set_run_id is not included when context doesn't have it.""" - eval_set_run_id = None - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert "eval_set_run_id" not in span_attributes + """Tests that runtime.execute() creates the 'Evaluation Set Run' span.""" + + @pytest.mark.asyncio + async def test_execute_creates_eval_set_run_span(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock( + return_value=( + MagicMock(name="Test Set", evaluations=[]), + [], + iter([]), + ) + ), + ): + try: + await runtime.execute() + except Exception: + pass + + spans = tracer.get_spans_by_type("eval_set_run") + assert len(spans) >= 1 + assert spans[0].name == "Evaluation Set Run" + + @pytest.mark.asyncio + async def test_eval_set_run_id_included_when_provided(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer, eval_set_run_id="custom-run-abc") + + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock( + return_value=( + MagicMock(name="Test Set", evaluations=[]), + [], + iter([]), + ) + ), + ): + try: + await runtime.execute() + except Exception: + pass + + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span.attributes["eval_set_run_id"] == "custom-run-abc" + + @pytest.mark.asyncio + async def test_eval_set_run_id_excluded_when_not_provided(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock( + return_value=( + MagicMock(name="Test Set", evaluations=[]), + [], + iter([]), + ) + ), + ): + try: + await runtime.execute() + except Exception: + pass + + span = tracer.get_spans_by_type("eval_set_run")[0] + assert "eval_set_run_id" not in span.attributes + + @pytest.mark.asyncio + 
async def test_eval_set_run_span_has_custom_instrumentation_flag(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock( + return_value=( + MagicMock(name="Test Set", evaluations=[]), + [], + iter([]), + ) + ), + ): + try: + await runtime.execute() + except Exception: + pass + + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_eval_set_run_span_configured_with_metadata(self) -> None: + """After evaluations complete, span gets agentId, agentName, schemas. + + This requires the full pipeline to complete (initiate_evaluation -> + execute_parallel -> compute_evaluator_scores -> configure_eval_set_run_span). + We mock execute_parallel to return an empty list so the pipeline completes. + """ + from uipath.eval.runtime.runtime import execute_parallel + + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + + eval_set = MagicMock() + eval_set.name = "Test Set" + eval_set.evaluations = [] + + with patch.object( + runtime, + "initiate_evaluation", + new=AsyncMock( + return_value=(eval_set, [], iter([])) + ), + ), patch( + "uipath.eval.runtime.runtime.execute_parallel", + new=AsyncMock(return_value=[]), + ): + await runtime.execute() + + span = tracer.get_spans_by_type("eval_set_run")[0] + # configure_eval_set_run_span sets these via set_attribute + assert span.attributes.get("agentName") == "N/A" + assert "agentId" in span.attributes + assert "inputSchema" in span.attributes + assert "outputSchema" in span.attributes class TestEvaluationSpan: - """Tests for the 'Evaluation' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation'.""" - expected_name = "Evaluation" - assert expected_name == "Evaluation" - - def test_span_has_evaluation_span_type(self): - """Test that span_type attribute is 'evaluation'.""" - span_attributes = {"span_type": "evaluation"} - assert span_attributes["span_type"] == "evaluation" - - def test_span_includes_execution_id(self): - """Test that execution.id is included in the span attributes.""" - execution_id = str(uuid.uuid4()) - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - } - assert "execution.id" in span_attributes - assert span_attributes["execution.id"] == execution_id - - def test_span_includes_eval_item_id(self): - """Test that eval_item_id is included in the span attributes.""" - eval_item_id = "test-eval-item-123" - span_attributes = { - "span_type": "evaluation", - "eval_item_id": eval_item_id, - } - assert "eval_item_id" in span_attributes - assert span_attributes["eval_item_id"] == eval_item_id - - def test_span_includes_eval_item_name(self): - """Test that eval_item_name is included in the span attributes.""" - eval_item_name = "Test Evaluation Item" - span_attributes = { - "span_type": "evaluation", - "eval_item_name": eval_item_name, - } - assert "eval_item_name" in span_attributes - assert span_attributes["eval_item_name"] == eval_item_name - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the span.""" - execution_id = str(uuid.uuid4()) - eval_item_id = "eval-item-456" - eval_item_name = "My Eval Item" - - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": eval_item_name, - } - - # Verify all required attributes - required_attrs 
= ["execution.id", "span_type", "eval_item_id", "eval_item_name"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - def test_span_has_output_attribute(self): - """Test that span has output attribute with score.""" - import json - - # Simulate the output attribute set by configure_evaluation_span - output_data = {"score": 90} - output_json = json.dumps(output_data) - - span_attributes = { - "span_type": "evaluation", - "output": output_json, - } - - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["score"] == 90 - assert isinstance(parsed_output["score"], int) - - def test_span_has_agent_id(self): - """Test that span has agentId metadata attribute.""" - execution_id = "eval-exec-456" - span_attributes = { - "span_type": "evaluation", - "agentId": execution_id, - } - assert "agentId" in span_attributes - assert span_attributes["agentId"] == "eval-exec-456" - - def test_span_has_agent_name(self): - """Test that span has agentName metadata attribute.""" - span_attributes = { - "span_type": "evaluation", - "agentName": "N/A", - } - assert "agentName" in span_attributes - assert span_attributes["agentName"] == "N/A" - - def test_span_has_input_schema(self): - """Test that span has inputSchema metadata attribute.""" - import json - - input_schema = {"type": "object"} - span_attributes = { - "span_type": "evaluation", - "inputSchema": json.dumps(input_schema), - } - assert "inputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["inputSchema"]) - assert parsed_schema["type"] == "object" - - def test_span_has_output_schema(self): - """Test that span has outputSchema metadata attribute.""" - import json - - output_schema = {"type": "object"} - span_attributes = { - "span_type": "evaluation", - "outputSchema": json.dumps(output_schema), - } - assert "outputSchema" in span_attributes - parsed_schema = json.loads(span_attributes["outputSchema"]) - assert parsed_schema["type"] == "object" + """Tests that runtime._execute_eval() creates the 'Evaluation' span.""" + + @pytest.mark.asyncio + async def test_execute_eval_creates_evaluation_span(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item() + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, []) + + spans = tracer.get_spans_by_type("evaluation") + assert len(spans) == 1 + assert spans[0].name == "Evaluation" + + @pytest.mark.asyncio + async def test_evaluation_span_has_eval_item_attributes(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item(item_id="my-item-99", name="My Special Eval") + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, []) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span.attributes["eval_item_id"] == "my-item-99" + assert span.attributes["eval_item_name"] == "My Special Eval" + + @pytest.mark.asyncio + async def test_evaluation_span_has_execution_id_from_eval_item(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item(item_id="item-abc") + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, 
[]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span.attributes["execution.id"] == "item-abc" + + @pytest.mark.asyncio + async def test_evaluation_span_has_custom_instrumentation_flag(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item() + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, []) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_evaluation_span_configured_with_scores_after_evaluators( + self, + ) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + + evaluator = make_evaluator( + name="Accuracy", evaluator_id="acc-eval", score=0.85 + ) + eval_item = make_eval_item(evaluation_criterias={"acc-eval": {}}) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert "output" in span.attributes + output = json.loads(span.attributes["output"]) + assert "scores" in output + assert "Accuracy" in output["scores"] + + @pytest.mark.asyncio + async def test_evaluation_span_has_metadata_after_execution(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item(inputs={"query": "test"}) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime._execute_eval(eval_item, []) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span.attributes.get("agentName") == "N/A" + assert "agentId" in span.attributes class TestEvaluatorSpan: - """Tests for the 'Evaluator: {name}' span.""" - - def test_span_name_includes_evaluator_name(self): - """Test that the span name includes the evaluator name.""" - evaluator_name = "MyEvaluator" - expected_name = f"Evaluator: {evaluator_name}" - assert expected_name == "Evaluator: MyEvaluator" - - def test_span_has_evaluator_span_type(self): - """Test that span_type attribute is 'evaluator'.""" - span_attributes = {"span_type": "evaluator"} - assert span_attributes["span_type"] == "evaluator" - - def test_span_includes_evaluator_id(self): - """Test that evaluator_id is included in the span attributes.""" - evaluator_id = "evaluator-789" - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - } - assert "evaluator_id" in span_attributes - assert span_attributes["evaluator_id"] == evaluator_id - - def test_span_includes_evaluator_name(self): - """Test that evaluator_name is included in the span attributes.""" - evaluator_name = "AccuracyEvaluator" - span_attributes = { - "span_type": "evaluator", - "evaluator_name": evaluator_name, - } - assert "evaluator_name" in span_attributes - assert span_attributes["evaluator_name"] == evaluator_name - - def test_span_includes_eval_item_id(self): - """Test that eval_item_id is included in the evaluator span.""" - eval_item_id = "eval-item-123" - span_attributes = { - "span_type": "evaluator", - "eval_item_id": eval_item_id, - } - assert "eval_item_id" in span_attributes - assert span_attributes["eval_item_id"] == eval_item_id - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the evaluator span.""" - evaluator_id = 
"eval-id-123" - evaluator_name = "TestEvaluator" - eval_item_id = "item-456" - - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - "evaluator_name": evaluator_name, - "eval_item_id": eval_item_id, - } - - # Verify all required attributes - required_attrs = ["span_type", "evaluator_id", "evaluator_name", "eval_item_id"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - -class TestSpanHierarchy: - """Tests verifying the span hierarchy structure.""" - - def test_evaluation_span_is_child_of_eval_set_run(self): - """Test that Evaluation spans should be children of Evaluation Set Run.""" - # This is a conceptual test - in the actual code, the Evaluation span - # is created inside the context of the Evaluation Set Run span - parent_span_type = "eval_set_run" - child_span_type = "evaluation" - - # The parent-child relationship is enforced by span context nesting - assert parent_span_type == "eval_set_run" - assert child_span_type == "evaluation" - - def test_evaluator_span_is_child_of_evaluation(self): - """Test that Evaluator spans should be children of Evaluation.""" - # This is a conceptual test - in the actual code, the Evaluator span - # is created inside the context of the Evaluation span - parent_span_type = "evaluation" - child_span_type = "evaluator" - - assert parent_span_type == "evaluation" - assert child_span_type == "evaluator" - - -class TestSpanAttributeValues: - """Tests for span attribute value formatting.""" - - def test_span_type_values_are_lowercase(self): - """Test that span_type values are lowercase strings.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] - - for span_type in span_types: - assert span_type == span_type.lower() - # All span types should be lowercase without hyphens - assert "-" not in span_type - - def test_execution_id_is_valid_uuid(self): - """Test that execution.id is a valid UUID string.""" - execution_id = str(uuid.uuid4()) - - # Verify it can be parsed back as a UUID - parsed_uuid = uuid.UUID(execution_id) - assert str(parsed_uuid) == execution_id - - def test_evaluator_span_name_format(self): - """Test the evaluator span name format.""" - evaluator_names = [ - "Accuracy", - "Relevance", - "Fluency", - "Custom Evaluator", - ] - - for name in evaluator_names: - span_name = f"Evaluator: {name}" - assert span_name.startswith("Evaluator: ") - assert name in span_name - - -class TestEvalContextIntegration: - """Tests for UiPathEvalContext integration with spans.""" - - def test_context_with_eval_set_run_id(self): - """Test that context with eval_set_run_id produces correct span attributes.""" - context = UiPathEvalContext() - context.eval_set_run_id = "run-123" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if context.eval_set_run_id: - span_attributes["eval_set_run_id"] = context.eval_set_run_id - - assert span_attributes["eval_set_run_id"] == "run-123" - - def test_context_without_eval_set_run_id(self): - """Test that context without eval_set_run_id produces correct span attributes.""" - context = UiPathEvalContext() - context.eval_set_run_id = None - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if context.eval_set_run_id: - span_attributes["eval_set_run_id"] = context.eval_set_run_id - - assert "eval_set_run_id" not in span_attributes - - -class TestSpanCreationLogic: - """Tests for the span creation logic in runtime methods.""" - - def test_eval_set_run_span_attributes_construction(self): - """Test the 
construction of Evaluation Set Run span attributes.""" - eval_set_run_id = "test-run-id" - - span_attributes: Dict[str, str] = {"span_type": "eval_set_run"} - if eval_set_run_id: - span_attributes["eval_set_run_id"] = eval_set_run_id - - assert span_attributes == { - "span_type": "eval_set_run", - "eval_set_run_id": "test-run-id", - } + """Tests that runtime.run_evaluator() creates the 'Evaluator: {name}' span.""" + + @pytest.mark.asyncio + async def test_run_evaluator_creates_evaluator_span(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(name="AccuracyEvaluator", evaluator_id="acc-1") + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - def test_evaluation_span_attributes_construction(self): - """Test the construction of Evaluation span attributes.""" - execution_id = "exec-123" - eval_item_id = "item-456" - eval_item_name = "Test Item" + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 1 + assert spans[0].name == "Evaluator: AccuracyEvaluator" - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": eval_item_id, - "eval_item_name": eval_item_name, - } + @pytest.mark.asyncio + async def test_evaluator_span_has_correct_attributes(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator( + name="RelevanceEvaluator", evaluator_id="rel-eval-42" + ) + eval_item = make_eval_item(item_id="eval-item-77") + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - assert span_attributes["execution.id"] == "exec-123" - assert span_attributes["span_type"] == "evaluation" - assert span_attributes["eval_item_id"] == "item-456" - assert span_attributes["eval_item_name"] == "Test Item" + span = tracer.get_spans_by_type("evaluator")[0] + assert span.attributes["evaluator_id"] == "rel-eval-42" + assert span.attributes["evaluator_name"] == "RelevanceEvaluator" + assert span.attributes["eval_item_id"] == "eval-item-77" + assert span.attributes["uipath.custom_instrumentation"] is True + + @pytest.mark.asyncio + async def test_multiple_evaluators_create_separate_spans(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + names = ["Accuracy", "Relevance", "Fluency"] + for name in names: + evaluator = make_evaluator(name=name, evaluator_id=f"{name.lower()}-id") + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) + + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 3 + span_names = [s.name for s in spans] + assert "Evaluator: Accuracy" in span_names + assert "Evaluator: Relevance" in span_names + assert "Evaluator: Fluency" in span_names - def test_evaluator_span_attributes_construction(self): - """Test the construction of Evaluator span attributes.""" - evaluator_id = "eval-123" - evaluator_name = "AccuracyEvaluator" - eval_item_id = "item-789" - - span_attributes = { - "span_type": "evaluator", - "evaluator_id": evaluator_id, - "evaluator_name": evaluator_name, - "eval_item_id": eval_item_id, - } - - 
assert span_attributes["span_type"] == "evaluator" - assert span_attributes["evaluator_id"] == "eval-123" - assert span_attributes["evaluator_name"] == "AccuracyEvaluator" - assert span_attributes["eval_item_id"] == "item-789" - - def test_evaluator_span_name_construction(self): - """Test the construction of Evaluator span name.""" - evaluator_name = "RelevanceEvaluator" - span_name = f"Evaluator: {evaluator_name}" - - assert span_name == "Evaluator: RelevanceEvaluator" - - -class TestEvalItemSpanAttributes: - """Tests for eval item attributes in spans.""" - - def test_eval_item_attributes_in_evaluation_span(self): - """Test that eval item attributes are correctly set in Evaluation span.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "item-id-123" - eval_item.name = "Test Evaluation" - - span_attributes = { - "execution.id": str(uuid.uuid4()), - "span_type": "evaluation", - "eval_item_id": eval_item.id, - "eval_item_name": eval_item.name, - } - - assert span_attributes["eval_item_id"] == "item-id-123" - assert span_attributes["eval_item_name"] == "Test Evaluation" - - def test_eval_item_id_in_evaluator_span(self): - """Test that eval_item_id is included in Evaluator span.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "item-id-456" - - span_attributes = { - "span_type": "evaluator", - "evaluator_id": "evaluator-123", - "evaluator_name": "TestEvaluator", - "eval_item_id": eval_item.id, - } - - assert span_attributes["eval_item_id"] == "item-id-456" - - -class TestSpanTypeConsistency: - """Tests for span type value consistency.""" - - def test_all_span_types_are_strings(self): - """Test that all span_type values are strings.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] - - for span_type in span_types: - assert isinstance(span_type, str) - - def test_span_types_use_snake_case(self): - """Test that span_type values use snake_case naming.""" - span_types = ["eval_set_run", "evaluation", "evaluator"] - - for span_type in span_types: - # No uppercase letters - assert span_type == span_type.lower() - # No hyphens - assert "-" not in span_type - - def test_span_type_values_match_expected(self): - """Test that span_type values match expected values from _runtime.py.""" - expected_span_types = { - "Evaluation Set Run": "eval_set_run", - "Evaluation": "evaluation", - "Evaluator": "evaluator", - } - - for _, span_type in expected_span_types.items(): - assert isinstance(span_type, str) - assert span_type.islower() or "_" in span_type - - -class TestRunEvaluatorSpan: - """Tests specifically for the run_evaluator span creation.""" - - @pytest.fixture - def mock_evaluator(self): - """Create a mock evaluator for testing.""" - evaluator = MagicMock(spec=BaseEvaluator) - evaluator.id = "test-evaluator-id" - evaluator.name = "TestEvaluator" - return evaluator - - @pytest.fixture - def mock_eval_item(self): - """Create a mock eval item for testing.""" - eval_item = MagicMock(spec=EvaluationItem) - eval_item.id = "test-item-id" - eval_item.name = "Test Item" - eval_item.inputs = {"query": "test query"} - eval_item.expected_agent_behavior = "Expected behavior" - return eval_item - - def test_evaluator_span_name_uses_evaluator_name(self, mock_evaluator): - """Test that evaluator span name uses the evaluator's name.""" - span_name = f"Evaluator: {mock_evaluator.name}" - assert span_name == "Evaluator: TestEvaluator" - - def test_evaluator_span_includes_evaluator_details( - self, mock_evaluator, mock_eval_item - ): - """Test that evaluator span includes all 
evaluator details.""" - span_attributes = { - "span_type": "evaluator", - "evaluator_id": mock_evaluator.id, - "evaluator_name": mock_evaluator.name, - "eval_item_id": mock_eval_item.id, - } - assert span_attributes["evaluator_id"] == "test-evaluator-id" - assert span_attributes["evaluator_name"] == "TestEvaluator" - assert span_attributes["eval_item_id"] == "test-item-id" +class TestEvaluationOutputSpan: + """Tests that run_evaluator() creates the child 'Evaluation output' span.""" + + @pytest.mark.asyncio + async def test_run_evaluator_creates_eval_output_span(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(score=0.9) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) + output_spans = tracer.get_spans_by_attr("span.type", "evalOutput") + assert len(output_spans) == 1 + assert output_spans[0].name == "Evaluation output" -class TestExecutionIdPropagation: - """Tests for execution.id propagation in spans.""" + @pytest.mark.asyncio + async def test_eval_output_span_has_score_and_evaluator_id(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator( + evaluator_id="my-eval-id", score=0.75 + ) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - def test_execution_id_format(self): - """Test that execution.id is in valid UUID format.""" - execution_id = str(uuid.uuid4()) + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert span.attributes["value"] == 0.75 + assert span.attributes["evaluatorId"] == "my-eval-id" + + @pytest.mark.asyncio + async def test_eval_output_span_has_openinference_kind(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator() + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - # Verify it's a valid UUID - try: - uuid.UUID(execution_id) - valid = True - except ValueError: - valid = False + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert span.attributes["openinference.span.kind"] == "CHAIN" + + @pytest.mark.asyncio + async def test_eval_output_span_justification_from_pydantic_details(self) -> None: + class EvalDetails(BaseModel): + justification: str + extra: str = "ignored" + + details = EvalDetails(justification="Semantically equivalent output") + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(score=0.92, details=details) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - assert valid + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert span.attributes["justification"] == "Semantically equivalent output" + + @pytest.mark.asyncio + async def test_eval_output_span_justification_from_string_details(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = 
make_evaluator(score=0.8, details="Good accuracy overall") + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - def test_execution_id_is_unique_per_eval(self): - """Test that each eval gets a unique execution_id.""" - execution_ids = [str(uuid.uuid4()) for _ in range(5)] + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert span.attributes["justification"] == "Good accuracy overall" + + @pytest.mark.asyncio + async def test_eval_output_span_no_justification_when_no_details(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(score=1.0, details=None) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - # All should be unique - assert len(set(execution_ids)) == 5 + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + assert "justification" not in span.attributes + + @pytest.mark.asyncio + async def test_eval_output_span_output_has_normalized_score(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(score=0.85) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - def test_evaluation_span_has_execution_id(self): - """Test that Evaluation span includes execution.id.""" - execution_id = str(uuid.uuid4()) + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + output = json.loads(span.attributes["output"]) + # 0.85 normalized to 0-100 range + assert output["score"] == 85.0 + + @pytest.mark.asyncio + async def test_eval_output_span_output_type_always_one(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(score=0.5) + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) - span_attributes = { - "execution.id": execution_id, - "span_type": "evaluation", - "eval_item_id": "item-123", - "eval_item_name": "Test Item", - } + span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + output = json.loads(span.attributes["output"]) + assert output["type"] == 1 - assert "execution.id" in span_attributes - assert span_attributes["execution.id"] == execution_id +class TestSpanHierarchy: + """Tests that spans are created in the correct order/nesting.""" + + @pytest.mark.asyncio + async def test_run_evaluator_creates_both_evaluator_and_output_spans(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(name="TestEval") + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, + ) -class TestEvaluationOutputSpan: - """Tests for the 'Evaluation output' span.""" - - def test_span_name_is_correct(self): - """Test that the span name is 'Evaluation output'.""" - expected_name = "Evaluation 
output" - assert expected_name == "Evaluation output" - - def test_span_has_eval_output_span_type(self): - """Test that span_type attribute is 'evalOutput'.""" - span_attributes = {"span.type": "evalOutput"} - assert span_attributes["span.type"] == "evalOutput" - - def test_span_includes_value(self): - """Test that value (score) is included in the span attributes.""" - score = 0.95 - span_attributes = { - "span.type": "evalOutput", - "value": score, - } - assert "value" in span_attributes - assert span_attributes["value"] == 0.95 - - def test_span_includes_evaluator_id(self): - """Test that evaluatorId is included in the span attributes.""" - evaluator_id = "evaluator-123" - span_attributes = { - "span.type": "evalOutput", - "evaluatorId": evaluator_id, - } - assert "evaluatorId" in span_attributes - assert span_attributes["evaluatorId"] == evaluator_id - - def test_span_includes_justification(self): - """Test that justification is included in the span attributes.""" - justification = "The output matches expected behavior." - span_attributes = { - "span.type": "evalOutput", - "justification": justification, - } - assert "justification" in span_attributes - assert span_attributes["justification"] == justification - - def test_span_has_all_required_attributes(self): - """Test that all required attributes are present in the span.""" - evaluator_id = "eval-id-123" - score = 100 - justification = "Perfect match" - - span_attributes = { - "span.type": "evalOutput", - "value": score, - "evaluatorId": evaluator_id, - "justification": justification, - } - - # Verify all required attributes - required_attrs = ["span.type", "value", "evaluatorId", "justification"] - for attr in required_attrs: - assert attr in span_attributes, f"Missing required attribute: {attr}" - - def test_span_has_openinference_kind(self): - """Test that openinference.span.kind is set to CHAIN.""" - span_attributes = { - "openinference.span.kind": "CHAIN", - "span.type": "evalOutput", - } - assert span_attributes["openinference.span.kind"] == "CHAIN" - - def test_span_has_output_attribute_with_type_value_justification(self): - """Test that span has output attribute with type, value, and justification.""" - import json - - # Simulate the output attribute set by set_evaluation_output_span_output - output_data = { - "type": 1, - "value": 0.92, - "justification": "The outputs are semantically equivalent", - } - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } - - assert "output" in span_attributes - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - assert parsed_output["value"] == 0.92 - assert ( - parsed_output["justification"] == "The outputs are semantically equivalent" + # Should have both an evaluator span and an eval output span + evaluator_spans = tracer.get_spans_by_type("evaluator") + output_spans = tracer.get_spans_by_attr("span.type", "evalOutput") + assert len(evaluator_spans) == 1 + assert len(output_spans) == 1 + + @pytest.mark.asyncio + async def test_eval_output_span_created_after_evaluator_span(self) -> None: + tracer = SpanCapturingTracer() + runtime = make_runtime(tracer) + evaluator = make_evaluator(name="OrderTest") + eval_item = make_eval_item() + execution_output = make_mock_execution_output() + + await runtime.run_evaluator( + evaluator=evaluator, + execution_output=execution_output, + eval_item=eval_item, + evaluation_criteria=None, ) - def test_span_output_type_is_always_one(self): - """Test that 
output type field is always 1.""" - import json - - output_data = {"type": 1, "value": 0.5} - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } - - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - - def test_span_output_without_justification(self): - """Test that output can be set without justification field.""" - import json - - # When justification is None, it should be excluded from output - output_data = {"type": 1, "value": 0.75} - output_json = json.dumps(output_data) - - span_attributes = { - "span.type": "evalOutput", - "output": output_json, - } - - parsed_output = json.loads(span_attributes["output"]) - assert parsed_output["type"] == 1 - assert parsed_output["value"] == 0.75 - assert "justification" not in parsed_output - - -class TestEvaluationOutputSpanHierarchy: - """Tests verifying the Evaluation output span hierarchy.""" - - def test_eval_output_is_child_of_evaluator(self): - """Test that Evaluation output spans should be children of Evaluator spans.""" - parent_span_type = "evaluator" - child_span_type = "evalOutput" - - # The parent-child relationship is enforced by span context nesting - assert parent_span_type == "evaluator" - assert child_span_type == "evalOutput" - - def test_eval_output_span_attributes_construction(self): - """Test the construction of Evaluation output span attributes.""" - evaluator_id = "eval-123" - score = 0.85 - justification = "Good accuracy" - - span_attributes = { - "openinference.span.kind": "CHAIN", - "span.type": "evalOutput", - "value": score, - "evaluatorId": evaluator_id, - "justification": justification, - } - - assert span_attributes["openinference.span.kind"] == "CHAIN" - assert span_attributes["span.type"] == "evalOutput" - assert span_attributes["value"] == 0.85 - assert span_attributes["evaluatorId"] == "eval-123" - assert span_attributes["justification"] == "Good accuracy" - - -class TestEvaluationOutputSpanCreation: - """Tests for Evaluation output span creation in progress reporter.""" - - def test_eval_output_span_name_is_evaluation_output(self): - """Test that the span name is exactly 'Evaluation output'.""" - span_name = "Evaluation output" - assert span_name == "Evaluation output" - - def test_eval_output_span_type_is_camel_case(self): - """Test that span.type uses camelCase: evalOutput.""" - span_type = "evalOutput" - assert span_type == "evalOutput" - # First letter lowercase, second word capitalized - assert span_type[0].islower() - assert "Output" in span_type - - def test_eval_output_with_pydantic_details(self): - """Test that justification is extracted from Pydantic model details.""" - # Simulate Pydantic model details with justification field - details_dict = { - "justification": "The semantic similarity is perfect.", - "other_field": "some value", - } - - # Extract justification like the code does - justification = details_dict.get("justification", str(details_dict)) - assert justification == "The semantic similarity is perfect." 
-
-
-    def test_eval_output_with_string_details(self):
-        """Test that string details are used as justification directly."""
-        details = "Good accuracy on all test cases"
-
-        # String details are used directly
-        justification = str(details)
-        assert justification == "Good accuracy on all test cases"
-
-    def test_eval_output_without_justification_field(self):
-        """Test fallback when details dict has no justification field."""
-        import json
-
-        details_dict: dict[str, float] = {
-            "accuracy": 0.95,
-            "precision": 0.92,
-        }
-
-        # Should fall back to JSON dump of entire details
-        # Since there's no "justification" key, we get the default JSON string
-        justification = json.dumps(details_dict)
-        assert "accuracy" in justification
-        assert "0.95" in justification
+        # In the captured list, the evaluator span should appear before the output span
+        span_names = [s.name for s in tracer.captured_spans]
+        evaluator_idx = span_names.index("Evaluator: OrderTest")
+        output_idx = span_names.index("Evaluation output")
+        assert evaluator_idx < output_idx

From 5f1705c1d673dab00b2f5a7ba683e00da1812fcc Mon Sep 17 00:00:00 2001
From: Scott Florentino
Date: Tue, 14 Apr 2026 08:08:53 -0700
Subject: [PATCH 2/7] test: drive TestEvalSetRunSpan through the full pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commit replaced the tautological span tests with ones that
exercise real code. This commit strengthens TestEvalSetRunSpan: instead of
stubbing initiate_evaluation, the tests now run the full pipeline.
execute() flows real eval items through _execute_eval() and
run_evaluator(), then the tests verify that the parent span carries
aggregate scores and metadata. make_evaluator() gains a reduce_scores
implementation so that compute_evaluator_scores can aggregate results
across items, bringing the suite from 24 to 25 tests.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../tests/cli/eval/test_eval_runtime_spans.py | 202 ++++++++++--------
 1 file changed, 117 insertions(+), 85 deletions(-)

diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
index 0873b2ae7..35dcf89ef 100644
--- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
+++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
@@ -162,6 +162,10 @@ def make_evaluator(
     evaluator.validate_and_evaluate_criteria = AsyncMock(
         return_value=NumericEvaluationResult(score=score, details=details)
     )
+    # reduce_scores is called by compute_evaluator_scores to aggregate across items
+    evaluator.reduce_scores = lambda results: (
+        sum(r.score for r in results) / len(results) if results else 0.0
+    )
     return evaluator
 
 
@@ -184,140 +188,168 @@ def make_eval_item(
 
 
 class TestEvalSetRunSpan:
-    """Tests that runtime.execute() creates the 'Evaluation Set Run' span."""
+    """Tests that runtime.execute() creates the 'Evaluation Set Run' span.
+
+    These tests run the full pipeline: execute() -> initiate_evaluation() ->
+    _execute_eval() (per item) -> run_evaluator() (per evaluator). Only
+    execute_runtime (the actual agent invocation) is mocked. 
+ """ + + def _make_runtime_with_evaluations( + self, + tracer: SpanCapturingTracer, + eval_items: list[EvaluationItem], + evaluators: list[MagicMock], + **context_kwargs: Any, + ) -> UiPathEvalRuntime: + """Create a runtime whose context has real eval items and evaluators.""" + eval_set = EvaluationSet( + id="test-set", + name="Test Eval Set", + evaluations=eval_items, + ) + return make_runtime( + tracer, + evaluation_set=eval_set, + evaluators=evaluators, + **context_kwargs, + ) @pytest.mark.asyncio async def test_execute_creates_eval_set_run_span(self) -> None: + """execute() with one eval item produces an 'Evaluation Set Run' span.""" tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + eval_item = make_eval_item(evaluation_criterias={"acc": {}}) + runtime = self._make_runtime_with_evaluations( + tracer, [eval_item], [evaluator] + ) with patch.object( runtime, - "initiate_evaluation", - new=AsyncMock( - return_value=( - MagicMock(name="Test Set", evaluations=[]), - [], - iter([]), - ) - ), + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), ): - try: - await runtime.execute() - except Exception: - pass + await runtime.execute() spans = tracer.get_spans_by_type("eval_set_run") - assert len(spans) >= 1 + assert len(spans) == 1 assert spans[0].name == "Evaluation Set Run" @pytest.mark.asyncio - async def test_eval_set_run_id_included_when_provided(self) -> None: + async def test_eval_set_run_span_has_aggregate_scores(self) -> None: + """After evaluations complete, the span output contains aggregate scores.""" tracer = SpanCapturingTracer() - runtime = make_runtime(tracer, eval_set_run_id="custom-run-abc") + evaluator = make_evaluator(name="Accuracy", evaluator_id="acc", score=0.8) + items = [ + make_eval_item(item_id="item-1", name="E1", evaluation_criterias={"acc": {}}), + make_eval_item(item_id="item-2", name="E2", evaluation_criterias={"acc": {}}), + ] + runtime = self._make_runtime_with_evaluations(tracer, items, [evaluator]) with patch.object( runtime, - "initiate_evaluation", - new=AsyncMock( - return_value=( - MagicMock(name="Test Set", evaluations=[]), - [], - iter([]), - ) - ), + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), ): - try: - await runtime.execute() - except Exception: - pass + await runtime.execute() span = tracer.get_spans_by_type("eval_set_run")[0] - assert span.attributes["eval_set_run_id"] == "custom-run-abc" + output = json.loads(span.attributes["output"]) + assert "scores" in output + assert "Accuracy" in output["scores"] + # 0.8 normalized to 80.0 + assert output["scores"]["Accuracy"] == 80.0 @pytest.mark.asyncio - async def test_eval_set_run_id_excluded_when_not_provided(self) -> None: + async def test_eval_set_run_span_has_metadata(self) -> None: + """After evaluations complete, span has agentId, agentName, schemas.""" tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=1.0) + eval_item = make_eval_item(evaluation_criterias={"acc": {}}) + runtime = self._make_runtime_with_evaluations( + tracer, [eval_item], [evaluator] + ) with patch.object( runtime, - "initiate_evaluation", - new=AsyncMock( - return_value=( - MagicMock(name="Test Set", evaluations=[]), - [], - iter([]), - ) - ), + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), ): - try: - await runtime.execute() - except Exception: - pass + await 
runtime.execute() span = tracer.get_spans_by_type("eval_set_run")[0] - assert "eval_set_run_id" not in span.attributes + assert span.attributes["agentName"] == "N/A" + assert "agentId" in span.attributes + assert "inputSchema" in span.attributes + assert "outputSchema" in span.attributes @pytest.mark.asyncio - async def test_eval_set_run_span_has_custom_instrumentation_flag(self) -> None: + async def test_eval_set_run_id_included_when_provided(self) -> None: tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + eval_item = make_eval_item(evaluation_criterias={"acc": {}}) + runtime = self._make_runtime_with_evaluations( + tracer, [eval_item], [evaluator], eval_set_run_id="custom-run-abc" + ) with patch.object( runtime, - "initiate_evaluation", - new=AsyncMock( - return_value=( - MagicMock(name="Test Set", evaluations=[]), - [], - iter([]), - ) - ), + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), ): - try: - await runtime.execute() - except Exception: - pass + await runtime.execute() span = tracer.get_spans_by_type("eval_set_run")[0] - assert span.attributes["uipath.custom_instrumentation"] is True + assert span.attributes["eval_set_run_id"] == "custom-run-abc" @pytest.mark.asyncio - async def test_eval_set_run_span_configured_with_metadata(self) -> None: - """After evaluations complete, span gets agentId, agentName, schemas. + async def test_eval_set_run_id_excluded_when_not_provided(self) -> None: + tracer = SpanCapturingTracer() + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + eval_item = make_eval_item(evaluation_criterias={"acc": {}}) + runtime = self._make_runtime_with_evaluations( + tracer, [eval_item], [evaluator] + ) - This requires the full pipeline to complete (initiate_evaluation -> - execute_parallel -> compute_evaluator_scores -> configure_eval_set_run_span). - We mock execute_parallel to return an empty list so the pipeline completes. - """ - from uipath.eval.runtime.runtime import execute_parallel + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), + ): + await runtime.execute() - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + span = tracer.get_spans_by_type("eval_set_run")[0] + assert "eval_set_run_id" not in span.attributes - eval_set = MagicMock() - eval_set.name = "Test Set" - eval_set.evaluations = [] + @pytest.mark.asyncio + async def test_execute_produces_full_span_hierarchy(self) -> None: + """execute() with eval items produces the full span tree: + Evaluation Set Run -> Evaluation -> Evaluator -> Evaluation output. 
+ """ + tracer = SpanCapturingTracer() + evaluator = make_evaluator( + name="Relevance", evaluator_id="rel", score=0.95 + ) + eval_item = make_eval_item( + item_id="item-1", name="E1", evaluation_criterias={"rel": {}} + ) + runtime = self._make_runtime_with_evaluations( + tracer, [eval_item], [evaluator] + ) with patch.object( runtime, - "initiate_evaluation", - new=AsyncMock( - return_value=(eval_set, [], iter([])) - ), - ), patch( - "uipath.eval.runtime.runtime.execute_parallel", - new=AsyncMock(return_value=[]), + "execute_runtime", + new=AsyncMock(return_value=make_mock_execution_output()), ): await runtime.execute() - span = tracer.get_spans_by_type("eval_set_run")[0] - # configure_eval_set_run_span sets these via set_attribute - assert span.attributes.get("agentName") == "N/A" - assert "agentId" in span.attributes - assert "inputSchema" in span.attributes - assert "outputSchema" in span.attributes + # All four span types should be present + assert len(tracer.get_spans_by_type("eval_set_run")) == 1 + assert len(tracer.get_spans_by_type("evaluation")) == 1 + assert len(tracer.get_spans_by_type("evaluator")) == 1 + assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 1 class TestEvaluationSpan: From 388f2dc49a66ebd7cd47fc92bec2688a598e9aac Mon Sep 17 00:00:00 2001 From: Scott Florentino Date: Tue, 14 Apr 2026 09:32:57 -0700 Subject: [PATCH 3/7] test: all span tests go through execute() with real evaluations Every test now runs the full pipeline: execute() -> initiate_evaluation() -> _execute_eval() per item -> run_evaluator() per evaluator. Only execute_runtime (the agent invocation) is mocked. Each test class asserts on a different level of the span tree that the evaluation produces, rather than testing isolated methods disconnected from the pipeline they belong to. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../tests/cli/eval/test_eval_runtime_spans.py | 732 ++++++------------ 1 file changed, 257 insertions(+), 475 deletions(-) diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py index 35dcf89ef..7154f4ba3 100644 --- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py +++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py @@ -1,10 +1,14 @@ """Tests for eval runtime span creation. -Verifies that the eval runtime methods produce the correct OpenTelemetry spans: -1. "Evaluation Set Run" - span_type: "eval_set_run" (from execute()) -2. "Evaluation" - span_type: "evaluation" (from _execute_eval()) -3. "Evaluator: {name}" - span_type: "evaluator" (from run_evaluator()) -4. "Evaluation output" - span_type: "evalOutput" (from run_evaluator()) +Verifies that running evaluations produces the correct OpenTelemetry span tree: + + "Evaluation Set Run" (span_type: "eval_set_run") + └── "Evaluation" (span_type: "evaluation") — one per eval item + └── "Evaluator: {name}" (span_type: "evaluator") — one per evaluator + └── "Evaluation output" (span.type: "evalOutput") — the score + +Every test runs the full pipeline via execute(). Only execute_runtime (the +actual agent invocation) is mocked — everything else runs for real. 
""" import json @@ -78,35 +82,6 @@ def get_span_by_name(self, name: str) -> MockSpan | None: return None -def create_eval_context(**kwargs: Any) -> UiPathEvalContext: - """Create UiPathEvalContext with sensible defaults, overridable via kwargs.""" - context = UiPathEvalContext() - - if "execution_id" not in kwargs: - context.execution_id = str(uuid.uuid4()) - if "runtime_schema" not in kwargs: - context.runtime_schema = UiPathRuntimeSchema( - filePath="test.py", - uniqueId="test", - type="workflow", - input={"type": "object", "properties": {}}, - output={"type": "object", "properties": {}}, - ) - if "evaluation_set" not in kwargs: - context.evaluation_set = EvaluationSet( - id="test-eval-set", - name="Test Evaluation Set", - evaluations=[], - ) - if "evaluators" not in kwargs: - context.evaluators = [] - - for key, value in kwargs.items(): - setattr(context, key, value) - - return context - - def make_mock_execution_output( output: dict[str, Any] | None = None, error: Any = None, @@ -125,30 +100,6 @@ def make_mock_execution_output( return mock -def make_runtime( - capturing_tracer: SpanCapturingTracer, **context_kwargs: Any -) -> UiPathEvalRuntime: - """Create a UiPathEvalRuntime wired to a SpanCapturingTracer.""" - mock_trace_manager = MagicMock() - mock_trace_manager.tracer_provider.get_tracer.return_value = capturing_tracer - mock_trace_manager.tracer_span_processors = [] - - mock_factory = MagicMock() - mock_factory.new_runtime = AsyncMock(return_value=AsyncMock()) - - mock_event_bus = MagicMock() - mock_event_bus.publish = AsyncMock() - - context = create_eval_context(**context_kwargs) - - return UiPathEvalRuntime( - context=context, - factory=mock_factory, - trace_manager=mock_trace_manager, - event_bus=mock_event_bus, - ) - - def make_evaluator( name: str = "AccuracyEvaluator", evaluator_id: str = "accuracy-eval", @@ -184,99 +135,110 @@ def make_eval_item( ) -# --- Test classes --- +async def run_evaluation( + eval_items: list[EvaluationItem], + evaluators: list[MagicMock], + execution_output: MagicMock | None = None, + **context_kwargs: Any, +) -> tuple[SpanCapturingTracer, UiPathEvalRuntime]: + """Run execute() through the full pipeline and return the tracer + runtime. + Sets up the runtime with real eval items and evaluators, mocks only + execute_runtime, and runs execute() to completion. + """ + tracer = SpanCapturingTracer() -class TestEvalSetRunSpan: - """Tests that runtime.execute() creates the 'Evaluation Set Run' span. + mock_trace_manager = MagicMock() + mock_trace_manager.tracer_provider.get_tracer.return_value = tracer + mock_trace_manager.tracer_span_processors = [] - These tests run the full pipeline: execute() -> initiate_evaluation() -> - _execute_eval() (per item) -> run_evaluator() (per evaluator). Only - execute_runtime (the actual agent invocation) is mocked. 
- """ + mock_factory = MagicMock() + mock_factory.new_runtime = AsyncMock(return_value=AsyncMock()) - def _make_runtime_with_evaluations( - self, - tracer: SpanCapturingTracer, - eval_items: list[EvaluationItem], - evaluators: list[MagicMock], - **context_kwargs: Any, - ) -> UiPathEvalRuntime: - """Create a runtime whose context has real eval items and evaluators.""" - eval_set = EvaluationSet( - id="test-set", - name="Test Eval Set", - evaluations=eval_items, - ) - return make_runtime( - tracer, - evaluation_set=eval_set, - evaluators=evaluators, - **context_kwargs, - ) + mock_event_bus = MagicMock() + mock_event_bus.publish = AsyncMock() + + context = UiPathEvalContext() + context.execution_id = context_kwargs.pop( + "execution_id", str(uuid.uuid4()) + ) + context.runtime_schema = context_kwargs.pop( + "runtime_schema", + UiPathRuntimeSchema( + filePath="test.py", + uniqueId="test", + type="workflow", + input={"type": "object", "properties": {"x": {"type": "number"}}}, + output={"type": "object", "properties": {}}, + ), + ) + context.evaluation_set = EvaluationSet( + id="test-set", + name="Test Eval Set", + evaluations=eval_items, + ) + context.evaluators = evaluators + + for key, value in context_kwargs.items(): + setattr(context, key, value) + + runtime = UiPathEvalRuntime( + context=context, + factory=mock_factory, + trace_manager=mock_trace_manager, + event_bus=mock_event_bus, + ) + + with patch.object( + runtime, + "execute_runtime", + new=AsyncMock(return_value=execution_output or make_mock_execution_output()), + ): + await runtime.execute() + + return tracer, runtime + + +# --- Test classes --- + + +class TestEvalSetRunSpan: + """Tests for the top-level 'Evaluation Set Run' span produced by execute().""" @pytest.mark.asyncio - async def test_execute_creates_eval_set_run_span(self) -> None: - """execute() with one eval item produces an 'Evaluation Set Run' span.""" - tracer = SpanCapturingTracer() + async def test_span_created_with_correct_name_and_type(self) -> None: evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) - eval_item = make_eval_item(evaluation_criterias={"acc": {}}) - runtime = self._make_runtime_with_evaluations( - tracer, [eval_item], [evaluator] - ) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) spans = tracer.get_spans_by_type("eval_set_run") assert len(spans) == 1 assert spans[0].name == "Evaluation Set Run" + assert spans[0].attributes["uipath.custom_instrumentation"] is True @pytest.mark.asyncio - async def test_eval_set_run_span_has_aggregate_scores(self) -> None: - """After evaluations complete, the span output contains aggregate scores.""" - tracer = SpanCapturingTracer() + async def test_aggregate_scores_from_multiple_items(self) -> None: + """Scores are averaged across all eval items and written to the span.""" evaluator = make_evaluator(name="Accuracy", evaluator_id="acc", score=0.8) items = [ - make_eval_item(item_id="item-1", name="E1", evaluation_criterias={"acc": {}}), - make_eval_item(item_id="item-2", name="E2", evaluation_criterias={"acc": {}}), + make_eval_item( + item_id="i1", name="E1", evaluation_criterias={"acc": {}} + ), + make_eval_item( + item_id="i2", name="E2", evaluation_criterias={"acc": {}} + ), ] - runtime = self._make_runtime_with_evaluations(tracer, items, [evaluator]) - - with patch.object( - runtime, - 
"execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() + tracer, _ = await run_evaluation(items, [evaluator]) span = tracer.get_spans_by_type("eval_set_run")[0] output = json.loads(span.attributes["output"]) - assert "scores" in output - assert "Accuracy" in output["scores"] - # 0.8 normalized to 80.0 - assert output["scores"]["Accuracy"] == 80.0 + assert output["scores"]["Accuracy"] == 80.0 # 0.8 -> 80.0 @pytest.mark.asyncio - async def test_eval_set_run_span_has_metadata(self) -> None: - """After evaluations complete, span has agentId, agentName, schemas.""" - tracer = SpanCapturingTracer() + async def test_metadata_attributes(self) -> None: evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=1.0) - eval_item = make_eval_item(evaluation_criterias={"acc": {}}) - runtime = self._make_runtime_with_evaluations( - tracer, [eval_item], [evaluator] - ) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("eval_set_run")[0] assert span.attributes["agentName"] == "N/A" @@ -286,455 +248,275 @@ async def test_eval_set_run_span_has_metadata(self) -> None: @pytest.mark.asyncio async def test_eval_set_run_id_included_when_provided(self) -> None: - tracer = SpanCapturingTracer() evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) - eval_item = make_eval_item(evaluation_criterias={"acc": {}}) - runtime = self._make_runtime_with_evaluations( - tracer, [eval_item], [evaluator], eval_set_run_id="custom-run-abc" + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation( + [item], [evaluator], eval_set_run_id="custom-run-abc" ) - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() - span = tracer.get_spans_by_type("eval_set_run")[0] assert span.attributes["eval_set_run_id"] == "custom-run-abc" @pytest.mark.asyncio async def test_eval_set_run_id_excluded_when_not_provided(self) -> None: - tracer = SpanCapturingTracer() evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) - eval_item = make_eval_item(evaluation_criterias={"acc": {}}) - runtime = self._make_runtime_with_evaluations( - tracer, [eval_item], [evaluator] - ) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("eval_set_run")[0] assert "eval_set_run_id" not in span.attributes - @pytest.mark.asyncio - async def test_execute_produces_full_span_hierarchy(self) -> None: - """execute() with eval items produces the full span tree: - Evaluation Set Run -> Evaluation -> Evaluator -> Evaluation output. 
- """ - tracer = SpanCapturingTracer() - evaluator = make_evaluator( - name="Relevance", evaluator_id="rel", score=0.95 - ) - eval_item = make_eval_item( - item_id="item-1", name="E1", evaluation_criterias={"rel": {}} - ) - runtime = self._make_runtime_with_evaluations( - tracer, [eval_item], [evaluator] - ) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime.execute() - - # All four span types should be present - assert len(tracer.get_spans_by_type("eval_set_run")) == 1 - assert len(tracer.get_spans_by_type("evaluation")) == 1 - assert len(tracer.get_spans_by_type("evaluator")) == 1 - assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 1 - class TestEvaluationSpan: - """Tests that runtime._execute_eval() creates the 'Evaluation' span.""" + """Tests for the 'Evaluation' span — one per eval item.""" @pytest.mark.asyncio - async def test_execute_eval_creates_evaluation_span(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item() - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, []) + async def test_one_span_per_eval_item(self) -> None: + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + items = [ + make_eval_item( + item_id="i1", name="First", evaluation_criterias={"acc": {}} + ), + make_eval_item( + item_id="i2", name="Second", evaluation_criterias={"acc": {}} + ), + ] + tracer, _ = await run_evaluation(items, [evaluator]) spans = tracer.get_spans_by_type("evaluation") - assert len(spans) == 1 - assert spans[0].name == "Evaluation" + assert len(spans) == 2 @pytest.mark.asyncio - async def test_evaluation_span_has_eval_item_attributes(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item(item_id="my-item-99", name="My Special Eval") - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, []) + async def test_span_has_eval_item_attributes(self) -> None: + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + item = make_eval_item( + item_id="my-item-99", + name="My Special Eval", + evaluation_criterias={"acc": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("evaluation")[0] assert span.attributes["eval_item_id"] == "my-item-99" assert span.attributes["eval_item_name"] == "My Special Eval" - - @pytest.mark.asyncio - async def test_evaluation_span_has_execution_id_from_eval_item(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item(item_id="item-abc") - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, []) - - span = tracer.get_spans_by_type("evaluation")[0] - assert span.attributes["execution.id"] == "item-abc" - - @pytest.mark.asyncio - async def test_evaluation_span_has_custom_instrumentation_flag(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item() - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, []) - - span = tracer.get_spans_by_type("evaluation")[0] + 
assert span.attributes["execution.id"] == "my-item-99" assert span.attributes["uipath.custom_instrumentation"] is True @pytest.mark.asyncio - async def test_evaluation_span_configured_with_scores_after_evaluators( - self, - ) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - + async def test_span_configured_with_per_item_scores(self) -> None: evaluator = make_evaluator( - name="Accuracy", evaluator_id="acc-eval", score=0.85 + name="Accuracy", evaluator_id="acc", score=0.85 ) - eval_item = make_eval_item(evaluation_criterias={"acc-eval": {}}) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, [evaluator]) + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("evaluation")[0] - assert "output" in span.attributes output = json.loads(span.attributes["output"]) assert "scores" in output assert "Accuracy" in output["scores"] @pytest.mark.asyncio - async def test_evaluation_span_has_metadata_after_execution(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item(inputs={"query": "test"}) - - with patch.object( - runtime, - "execute_runtime", - new=AsyncMock(return_value=make_mock_execution_output()), - ): - await runtime._execute_eval(eval_item, []) + async def test_span_has_metadata(self) -> None: + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + item = make_eval_item( + inputs={"query": "test"}, evaluation_criterias={"acc": {}} + ) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("evaluation")[0] - assert span.attributes.get("agentName") == "N/A" + assert span.attributes["agentName"] == "N/A" assert "agentId" in span.attributes class TestEvaluatorSpan: - """Tests that runtime.run_evaluator() creates the 'Evaluator: {name}' span.""" + """Tests for the 'Evaluator: {name}' span — one per evaluator per item.""" @pytest.mark.asyncio - async def test_run_evaluator_creates_evaluator_span(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(name="AccuracyEvaluator", evaluator_id="acc-1") - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, - ) - - spans = tracer.get_spans_by_type("evaluator") - assert len(spans) == 1 - assert spans[0].name == "Evaluator: AccuracyEvaluator" - - @pytest.mark.asyncio - async def test_evaluator_span_has_correct_attributes(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + async def test_span_has_correct_name_and_attributes(self) -> None: evaluator = make_evaluator( - name="RelevanceEvaluator", evaluator_id="rel-eval-42" + name="RelevanceEvaluator", evaluator_id="rel-42", score=0.9 ) - eval_item = make_eval_item(item_id="eval-item-77") - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + item = make_eval_item( + item_id="eval-item-77", evaluation_criterias={"rel-42": {}} ) + tracer, _ = await run_evaluation([item], [evaluator]) - span = tracer.get_spans_by_type("evaluator")[0] - assert span.attributes["evaluator_id"] == "rel-eval-42" + 
spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 1 + span = spans[0] + assert span.name == "Evaluator: RelevanceEvaluator" + assert span.attributes["evaluator_id"] == "rel-42" assert span.attributes["evaluator_name"] == "RelevanceEvaluator" assert span.attributes["eval_item_id"] == "eval-item-77" assert span.attributes["uipath.custom_instrumentation"] is True @pytest.mark.asyncio - async def test_multiple_evaluators_create_separate_spans(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - names = ["Accuracy", "Relevance", "Fluency"] - for name in names: - evaluator = make_evaluator(name=name, evaluator_id=f"{name.lower()}-id") - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, - ) + async def test_multiple_evaluators_produce_multiple_spans(self) -> None: + evaluators = [ + make_evaluator(name="Accuracy", evaluator_id="acc", score=0.9), + make_evaluator(name="Relevance", evaluator_id="rel", score=0.8), + make_evaluator(name="Fluency", evaluator_id="flu", score=0.7), + ] + item = make_eval_item( + evaluation_criterias={"acc": {}, "rel": {}, "flu": {}} + ) + tracer, _ = await run_evaluation([item], evaluators) spans = tracer.get_spans_by_type("evaluator") assert len(spans) == 3 - span_names = [s.name for s in spans] - assert "Evaluator: Accuracy" in span_names - assert "Evaluator: Relevance" in span_names - assert "Evaluator: Fluency" in span_names + span_names = {s.name for s in spans} + assert span_names == { + "Evaluator: Accuracy", + "Evaluator: Relevance", + "Evaluator: Fluency", + } + @pytest.mark.asyncio + async def test_multiple_items_each_get_evaluator_spans(self) -> None: + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9) + items = [ + make_eval_item( + item_id="i1", name="E1", evaluation_criterias={"acc": {}} + ), + make_eval_item( + item_id="i2", name="E2", evaluation_criterias={"acc": {}} + ), + ] + tracer, _ = await run_evaluation(items, [evaluator]) -class TestEvaluationOutputSpan: - """Tests that run_evaluator() creates the child 'Evaluation output' span.""" + spans = tracer.get_spans_by_type("evaluator") + assert len(spans) == 2 + item_ids = {s.attributes["eval_item_id"] for s in spans} + assert item_ids == {"i1", "i2"} - @pytest.mark.asyncio - async def test_run_evaluator_creates_eval_output_span(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=0.9) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, - ) - output_spans = tracer.get_spans_by_attr("span.type", "evalOutput") - assert len(output_spans) == 1 - assert output_spans[0].name == "Evaluation output" +class TestEvaluationOutputSpan: + """Tests for the 'Evaluation output' span — the evaluator's score.""" @pytest.mark.asyncio - async def test_eval_output_span_has_score_and_evaluator_id(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) + async def test_span_created_with_correct_attributes(self) -> None: evaluator = make_evaluator( - evaluator_id="my-eval-id", score=0.75 - ) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - 
execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + name="Acc", evaluator_id="my-eval-id", score=0.75 ) + item = make_eval_item(evaluation_criterias={"my-eval-id": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) - span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] + output_spans = tracer.get_spans_by_attr("span.type", "evalOutput") + assert len(output_spans) == 1 + span = output_spans[0] + assert span.name == "Evaluation output" assert span.attributes["value"] == 0.75 assert span.attributes["evaluatorId"] == "my-eval-id" + assert span.attributes["openinference.span.kind"] == "CHAIN" + assert span.attributes["uipath.custom_instrumentation"] is True @pytest.mark.asyncio - async def test_eval_output_span_has_openinference_kind(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator() - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, - ) + async def test_output_json_has_normalized_score_and_type(self) -> None: + evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.85) + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] - assert span.attributes["openinference.span.kind"] == "CHAIN" + output = json.loads(span.attributes["output"]) + assert output["type"] == 1 + assert output["score"] == 85.0 # 0.85 normalized to 0-100 @pytest.mark.asyncio - async def test_eval_output_span_justification_from_pydantic_details(self) -> None: + async def test_justification_from_pydantic_details(self) -> None: class EvalDetails(BaseModel): justification: str extra: str = "ignored" details = EvalDetails(justification="Semantically equivalent output") - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=0.92, details=details) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + evaluator = make_evaluator( + name="Acc", evaluator_id="acc", score=0.92, details=details ) + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] assert span.attributes["justification"] == "Semantically equivalent output" @pytest.mark.asyncio - async def test_eval_output_span_justification_from_string_details(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=0.8, details="Good accuracy overall") - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + async def test_justification_from_string_details(self) -> None: + evaluator = make_evaluator( + name="Acc", evaluator_id="acc", score=0.8, details="Good accuracy" ) + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] - assert span.attributes["justification"] == "Good accuracy overall" + 
assert span.attributes["justification"] == "Good accuracy" @pytest.mark.asyncio - async def test_eval_output_span_no_justification_when_no_details(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=1.0, details=None) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + async def test_no_justification_when_no_details(self) -> None: + evaluator = make_evaluator( + name="Acc", evaluator_id="acc", score=1.0, details=None ) + item = make_eval_item(evaluation_criterias={"acc": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] assert "justification" not in span.attributes - @pytest.mark.asyncio - async def test_eval_output_span_output_has_normalized_score(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=0.85) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, - ) - span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] - output = json.loads(span.attributes["output"]) - # 0.85 normalized to 0-100 range - assert output["score"] == 85.0 +class TestSpanHierarchy: + """Tests that the full span tree is produced in the correct structure.""" @pytest.mark.asyncio - async def test_eval_output_span_output_type_always_one(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(score=0.5) - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + async def test_full_span_tree(self) -> None: + """One item + one evaluator produces all four span types.""" + evaluator = make_evaluator( + name="Relevance", evaluator_id="rel", score=0.95 ) + item = make_eval_item( + item_id="item-1", name="E1", evaluation_criterias={"rel": {}} + ) + tracer, _ = await run_evaluation([item], [evaluator]) - span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] - output = json.loads(span.attributes["output"]) - assert output["type"] == 1 - - -class TestSpanHierarchy: - """Tests that spans are created in the correct order/nesting.""" + assert len(tracer.get_spans_by_type("eval_set_run")) == 1 + assert len(tracer.get_spans_by_type("evaluation")) == 1 + assert len(tracer.get_spans_by_type("evaluator")) == 1 + assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 1 @pytest.mark.asyncio - async def test_run_evaluator_creates_both_evaluator_and_output_spans(self) -> None: - tracer = SpanCapturingTracer() - runtime = make_runtime(tracer) - evaluator = make_evaluator(name="TestEval") - eval_item = make_eval_item() - execution_output = make_mock_execution_output() - - await runtime.run_evaluator( - evaluator=evaluator, - execution_output=execution_output, - eval_item=eval_item, - evaluation_criteria=None, + async def test_span_ordering(self) -> None: + """Spans are created in the correct order: parent before child.""" + evaluator = make_evaluator( + name="OrderTest", evaluator_id="ord", score=0.9 + ) + item = make_eval_item(evaluation_criterias={"ord": {}}) 
+        tracer, _ = await run_evaluation([item], [evaluator])
+
+        names = [s.name for s in tracer.captured_spans]
+        assert names.index("Evaluation Set Run") < names.index("Evaluation")
+        assert names.index("Evaluation") < names.index("Evaluator: OrderTest")
+        assert names.index("Evaluator: OrderTest") < names.index(
+            "Evaluation output"
         )
-
-        # Should have both an evaluator span and an eval output span
-        evaluator_spans = tracer.get_spans_by_type("evaluator")
-        output_spans = tracer.get_spans_by_attr("span.type", "evalOutput")
-        assert len(evaluator_spans) == 1
-        assert len(output_spans) == 1

     @pytest.mark.asyncio
-    async def test_eval_output_span_created_after_evaluator_span(self) -> None:
-        tracer = SpanCapturingTracer()
-        runtime = make_runtime(tracer)
-        evaluator = make_evaluator(name="OrderTest")
-        eval_item = make_eval_item()
-        execution_output = make_mock_execution_output()
-
-        await runtime.run_evaluator(
-            evaluator=evaluator,
-            execution_output=execution_output,
-            eval_item=eval_item,
-            evaluation_criteria=None,
-        )
+    async def test_multiple_items_and_evaluators(self) -> None:
+        """Two items x two evaluators produce the expected span counts."""
+        evaluators = [
+            make_evaluator(name="Acc", evaluator_id="acc", score=0.9),
+            make_evaluator(name="Rel", evaluator_id="rel", score=0.8),
+        ]
+        items = [
+            make_eval_item(
+                item_id="i1",
+                name="E1",
+                evaluation_criterias={"acc": {}, "rel": {}},
+            ),
+            make_eval_item(
+                item_id="i2",
+                name="E2",
+                evaluation_criterias={"acc": {}, "rel": {}},
+            ),
+        ]
+        tracer, _ = await run_evaluation(items, evaluators)

-        # In the captured list, the evaluator span should appear before the output span
-        span_names = [s.name for s in tracer.captured_spans]
-        evaluator_idx = span_names.index("Evaluator: OrderTest")
-        output_idx = span_names.index("Evaluation output")
-        assert evaluator_idx < output_idx
+        assert len(tracer.get_spans_by_type("eval_set_run")) == 1
+        assert len(tracer.get_spans_by_type("evaluation")) == 2
+        assert len(tracer.get_spans_by_type("evaluator")) == 4  # 2 items x 2 evaluators
+        assert len(tracer.get_spans_by_attr("span.type", "evalOutput")) == 4

From 3149fd41157d5a8ba3fcd5b8e630f115ad81f071 Mon Sep 17 00:00:00 2001
From: Scott Florentino
Date: Tue, 14 Apr 2026 10:23:57 -0700
Subject: [PATCH 4/7] test: use descriptive names for evaluators, items, and IDs

Replace terse abbreviations (Acc, rel, i1, E1) with meaningful names
that convey what is being evaluated (ExactMatchEvaluator,
calculator-addition, sentiment-positive-review, etc.).
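One representative call site, before and after (both forms are taken
from the diff below, shown here only for illustration):

    make_eval_item(item_id="i1", name="E1", evaluation_criterias={"acc": {}})

    make_eval_item(
        item_id="calculator-addition",
        name="Calculator addition test",
        evaluation_criterias={"json-similarity-evaluator": {}},
    )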
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../tests/cli/eval/test_eval_runtime_spans.py | 299 +++++++++++++-----
 1 file changed, 219 insertions(+), 80 deletions(-)

diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
index 7154f4ba3..bc74adb0e 100644
--- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
+++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
@@ -101,8 +101,8 @@ def make_mock_execution_output(


 def make_evaluator(
-    name: str = "AccuracyEvaluator",
-    evaluator_id: str = "accuracy-eval",
+    name: str = "ExactMatchEvaluator",
+    evaluator_id: str = "exact-match-evaluator",
     score: float = 0.95,
     details: Any = None,
 ) -> MagicMock:
@@ -121,8 +121,8 @@ def make_evaluator(


 def make_eval_item(
-    item_id: str = "item-123",
-    name: str = "Test Evaluation",
+    item_id: str = "eval-item-default",
+    name: str = "Default Test Case",
     inputs: dict[str, Any] | None = None,
     evaluation_criterias: dict[str, Any] | None = None,
 ) -> EvaluationItem:
@@ -207,8 +207,14 @@ class TestEvalSetRunSpan:

     @pytest.mark.asyncio
     async def test_span_created_with_correct_name_and_type(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
-        item = make_eval_item(evaluation_criterias={"acc": {}})
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=0.9,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
+        )
         tracer, _ = await run_evaluation([item], [evaluator])

         spans = tracer.get_spans_by_type("eval_set_run")
@@ -219,25 +225,39 @@ async def test_span_created_with_correct_name_and_type(self) -> None:
     @pytest.mark.asyncio
     async def test_aggregate_scores_from_multiple_items(self) -> None:
         """Scores are averaged across all eval items and written to the span."""
-        evaluator = make_evaluator(name="Accuracy", evaluator_id="acc", score=0.8)
+        evaluator = make_evaluator(
+            name="JsonSimilarityEvaluator",
+            evaluator_id="json-similarity-evaluator",
+            score=0.8,
+        )
         items = [
             make_eval_item(
-                item_id="i1", name="E1", evaluation_criterias={"acc": {}}
+                item_id="calculator-addition",
+                name="Calculator addition test",
+                evaluation_criterias={"json-similarity-evaluator": {}},
             ),
             make_eval_item(
-                item_id="i2", name="E2", evaluation_criterias={"acc": {}}
+                item_id="calculator-subtraction",
+                name="Calculator subtraction test",
+                evaluation_criterias={"json-similarity-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
         output = json.loads(span.attributes["output"])
-        assert output["scores"]["Accuracy"] == 80.0  # 0.8 -> 80.0
+        assert output["scores"]["JsonSimilarityEvaluator"] == 80.0  # 0.8 -> 80.0

     @pytest.mark.asyncio
     async def test_metadata_attributes(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=1.0)
-        item = make_eval_item(evaluation_criterias={"acc": {}})
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=1.0,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
+        )
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
@@ -248,19 +268,31 @@ async def test_metadata_attributes(self) -> None:

     @pytest.mark.asyncio
     async def test_eval_set_run_id_included_when_provided(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
-        item = make_eval_item(evaluation_criterias={"acc": {}})
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=0.9,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
+        )
         tracer, _ = await run_evaluation(
-            [item], [evaluator], eval_set_run_id="custom-run-abc"
+            [item], [evaluator], eval_set_run_id="run-2024-04-14-001"
         )

         span = tracer.get_spans_by_type("eval_set_run")[0]
-        assert span.attributes["eval_set_run_id"] == "custom-run-abc"
+        assert span.attributes["eval_set_run_id"] == "run-2024-04-14-001"

     @pytest.mark.asyncio
     async def test_eval_set_run_id_excluded_when_not_provided(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
-        item = make_eval_item(evaluation_criterias={"acc": {}})
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=0.9,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
+        )
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
@@ -272,13 +304,21 @@ class TestEvaluationSpan:

     @pytest.mark.asyncio
     async def test_one_span_per_eval_item(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
+        evaluator = make_evaluator(
+            name="ContainsEvaluator",
+            evaluator_id="contains-evaluator",
+            score=0.9,
+        )
         items = [
             make_eval_item(
-                item_id="i1", name="First", evaluation_criterias={"acc": {}}
+                item_id="greeting-english",
+                name="English greeting test",
+                evaluation_criterias={"contains-evaluator": {}},
             ),
             make_eval_item(
-                item_id="i2", name="Second", evaluation_criterias={"acc": {}}
+                item_id="greeting-spanish",
+                name="Spanish greeting test",
+                evaluation_criterias={"contains-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])
@@ -288,38 +328,51 @@ async def test_one_span_per_eval_item(self) -> None:

     @pytest.mark.asyncio
     async def test_span_has_eval_item_attributes(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
+        evaluator = make_evaluator(
+            name="ContainsEvaluator",
+            evaluator_id="contains-evaluator",
+            score=0.9,
+        )
         item = make_eval_item(
-            item_id="my-item-99",
-            name="My Special Eval",
-            evaluation_criterias={"acc": {}},
+            item_id="sentiment-positive-review",
+            name="Positive review sentiment check",
+            evaluation_criterias={"contains-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("evaluation")[0]
-        assert span.attributes["eval_item_id"] == "my-item-99"
-        assert span.attributes["eval_item_name"] == "My Special Eval"
-        assert span.attributes["execution.id"] == "my-item-99"
+        assert span.attributes["eval_item_id"] == "sentiment-positive-review"
+        assert span.attributes["eval_item_name"] == "Positive review sentiment check"
+        assert span.attributes["execution.id"] == "sentiment-positive-review"
         assert span.attributes["uipath.custom_instrumentation"] is True

     @pytest.mark.asyncio
     async def test_span_configured_with_per_item_scores(self) -> None:
         evaluator = make_evaluator(
-            name="Accuracy", evaluator_id="acc", score=0.85
+            name="JsonSimilarityEvaluator",
+            evaluator_id="json-similarity-evaluator",
+            score=0.85,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"json-similarity-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"acc": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("evaluation")[0]
         output = json.loads(span.attributes["output"])
         assert "scores" in output
-        assert "Accuracy" in output["scores"]
+        assert "JsonSimilarityEvaluator" in output["scores"]

     @pytest.mark.asyncio
     async def test_span_has_metadata(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=0.9,
+        )
         item = make_eval_item(
-            inputs={"query": "test"}, evaluation_criterias={"acc": {}}
+            inputs={"query": "What is the capital of France?"},
+            evaluation_criterias={"exact-match-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])

@@ -334,31 +387,51 @@ class TestEvaluatorSpan:

     @pytest.mark.asyncio
     async def test_span_has_correct_name_and_attributes(self) -> None:
         evaluator = make_evaluator(
-            name="RelevanceEvaluator", evaluator_id="rel-42", score=0.9
+            name="LLMJudgeTrajectoryEvaluator",
+            evaluator_id="llm-judge-trajectory",
+            score=0.9,
         )
         item = make_eval_item(
-            item_id="eval-item-77", evaluation_criterias={"rel-42": {}}
+            item_id="multi-step-tool-use",
+            name="Multi-step tool use trajectory",
+            evaluation_criterias={"llm-judge-trajectory": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])

         spans = tracer.get_spans_by_type("evaluator")
         assert len(spans) == 1
         span = spans[0]
-        assert span.name == "Evaluator: RelevanceEvaluator"
-        assert span.attributes["evaluator_id"] == "rel-42"
-        assert span.attributes["evaluator_name"] == "RelevanceEvaluator"
-        assert span.attributes["eval_item_id"] == "eval-item-77"
+        assert span.name == "Evaluator: LLMJudgeTrajectoryEvaluator"
+        assert span.attributes["evaluator_id"] == "llm-judge-trajectory"
+        assert span.attributes["evaluator_name"] == "LLMJudgeTrajectoryEvaluator"
+        assert span.attributes["eval_item_id"] == "multi-step-tool-use"
         assert span.attributes["uipath.custom_instrumentation"] is True

     @pytest.mark.asyncio
     async def test_multiple_evaluators_produce_multiple_spans(self) -> None:
         evaluators = [
-            make_evaluator(name="Accuracy", evaluator_id="acc", score=0.9),
-            make_evaluator(name="Relevance", evaluator_id="rel", score=0.8),
-            make_evaluator(name="Fluency", evaluator_id="flu", score=0.7),
+            make_evaluator(
+                name="ExactMatchEvaluator",
+                evaluator_id="exact-match-evaluator",
+                score=0.9,
+            ),
+            make_evaluator(
+                name="JsonSimilarityEvaluator",
+                evaluator_id="json-similarity-evaluator",
+                score=0.8,
+            ),
+            make_evaluator(
+                name="LLMJudgeOutputEvaluator",
+                evaluator_id="llm-judge-output-evaluator",
+                score=0.7,
+            ),
         ]
         item = make_eval_item(
-            evaluation_criterias={"acc": {}, "rel": {}, "flu": {}}
+            evaluation_criterias={
+                "exact-match-evaluator": {},
+                "json-similarity-evaluator": {},
+                "llm-judge-output-evaluator": {},
+            }
         )
         tracer, _ = await run_evaluation([item], evaluators)
@@ -366,20 +439,28 @@ async def test_multiple_evaluators_produce_multiple_spans(self) -> None:
         assert len(spans) == 3
         span_names = {s.name for s in spans}
         assert span_names == {
-            "Evaluator: Accuracy",
-            "Evaluator: Relevance",
-            "Evaluator: Fluency",
+            "Evaluator: ExactMatchEvaluator",
+            "Evaluator: JsonSimilarityEvaluator",
+            "Evaluator: LLMJudgeOutputEvaluator",
         }

     @pytest.mark.asyncio
     async def test_multiple_items_each_get_evaluator_spans(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.9)
+        evaluator = make_evaluator(
+            name="ContainsEvaluator",
+            evaluator_id="contains-evaluator",
+            score=0.9,
+        )
         items = [
             make_eval_item(
-                item_id="i1", name="E1", evaluation_criterias={"acc": {}}
+                item_id="invoice-extraction",
+                name="Invoice data extraction",
+                evaluation_criterias={"contains-evaluator": {}},
             ),
             make_eval_item(
-                item_id="i2", name="E2", evaluation_criterias={"acc": {}}
+                item_id="receipt-extraction",
+                name="Receipt data extraction",
+                evaluation_criterias={"contains-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])
@@ -387,7 +468,7 @@ async def test_multiple_items_each_get_evaluator_spans(self) -> None:
         spans = tracer.get_spans_by_type("evaluator")
         assert len(spans) == 2
         item_ids = {s.attributes["eval_item_id"] for s in spans}
-        assert item_ids == {"i1", "i2"}
+        assert item_ids == {"invoice-extraction", "receipt-extraction"}


 class TestEvaluationOutputSpan:
@@ -396,9 +477,13 @@ class TestEvaluationOutputSpan:
     @pytest.mark.asyncio
     async def test_span_created_with_correct_attributes(self) -> None:
         evaluator = make_evaluator(
-            name="Acc", evaluator_id="my-eval-id", score=0.75
+            name="ToolCallArgsEvaluator",
+            evaluator_id="tool-call-args-evaluator",
+            score=0.75,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"tool-call-args-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"my-eval-id": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         output_spans = tracer.get_spans_by_attr("span.type", "evalOutput")
@@ -406,14 +491,20 @@ async def test_span_created_with_correct_attributes(self) -> None:
         span = output_spans[0]
         assert span.name == "Evaluation output"
         assert span.attributes["value"] == 0.75
-        assert span.attributes["evaluatorId"] == "my-eval-id"
+        assert span.attributes["evaluatorId"] == "tool-call-args-evaluator"
         assert span.attributes["openinference.span.kind"] == "CHAIN"
         assert span.attributes["uipath.custom_instrumentation"] is True

     @pytest.mark.asyncio
     async def test_output_json_has_normalized_score_and_type(self) -> None:
-        evaluator = make_evaluator(name="Acc", evaluator_id="acc", score=0.85)
-        item = make_eval_item(evaluation_criterias={"acc": {}})
+        evaluator = make_evaluator(
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=0.85,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
+        )
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
@@ -423,37 +514,61 @@ async def test_output_json_has_normalized_score_and_type(self) -> None:

     @pytest.mark.asyncio
     async def test_justification_from_pydantic_details(self) -> None:
-        class EvalDetails(BaseModel):
+        class SemanticSimilarityDetails(BaseModel):
             justification: str
-            extra: str = "ignored"
+            similarity_score: float = 0.0

-        details = EvalDetails(justification="Semantically equivalent output")
+        details = SemanticSimilarityDetails(
+            justification="Agent output is semantically equivalent to expected output",
+            similarity_score=0.92,
+        )
         evaluator = make_evaluator(
-            name="Acc", evaluator_id="acc", score=0.92, details=details
+            name="LLMJudgeOutputEvaluator",
+            evaluator_id="llm-judge-output-evaluator",
+            score=0.92,
+            details=details,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"llm-judge-output-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"acc": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
-        assert span.attributes["justification"] == "Semantically equivalent output"
+        assert (
+            span.attributes["justification"]
+            == "Agent output is semantically equivalent to expected output"
+        )

     @pytest.mark.asyncio
     async def test_justification_from_string_details(self) -> None:
         evaluator = make_evaluator(
-            name="Acc", evaluator_id="acc", score=0.8, details="Good accuracy"
+            name="BinaryClassificationEvaluator",
+            evaluator_id="binary-classification-evaluator",
+            score=0.8,
+            details="Output correctly classified as positive sentiment",
+        )
+        item = make_eval_item(
+            evaluation_criterias={"binary-classification-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"acc": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
-        assert span.attributes["justification"] == "Good accuracy"
+        assert (
+            span.attributes["justification"]
+            == "Output correctly classified as positive sentiment"
+        )

     @pytest.mark.asyncio
     async def test_no_justification_when_no_details(self) -> None:
         evaluator = make_evaluator(
-            name="Acc", evaluator_id="acc", score=1.0, details=None
+            name="ExactMatchEvaluator",
+            evaluator_id="exact-match-evaluator",
+            score=1.0,
+            details=None,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"exact-match-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"acc": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
@@ -467,10 +582,14 @@ class TestSpanHierarchy:
     async def test_full_span_tree(self) -> None:
         """One item + one evaluator produces all four span types."""
         evaluator = make_evaluator(
-            name="Relevance", evaluator_id="rel", score=0.95
+            name="ToolCallOrderEvaluator",
+            evaluator_id="tool-call-order-evaluator",
+            score=0.95,
         )
         item = make_eval_item(
-            item_id="item-1", name="E1", evaluation_criterias={"rel": {}}
+            item_id="booking-flow-happy-path",
+            name="Booking flow happy path",
+            evaluation_criterias={"tool-call-order-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])
@@ -483,15 +602,21 @@ async def test_full_span_tree(self) -> None:
     async def test_span_ordering(self) -> None:
         """Spans are created in the correct order: parent before child."""
         evaluator = make_evaluator(
-            name="OrderTest", evaluator_id="ord", score=0.9
+            name="ContainsEvaluator",
+            evaluator_id="contains-evaluator",
+            score=0.9,
+        )
+        item = make_eval_item(
+            evaluation_criterias={"contains-evaluator": {}}
         )
-        item = make_eval_item(evaluation_criterias={"ord": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         names = [s.name for s in tracer.captured_spans]
         assert names.index("Evaluation Set Run") < names.index("Evaluation")
-        assert names.index("Evaluation") < names.index("Evaluator: OrderTest")
-        assert names.index("Evaluator: OrderTest") < names.index(
+        assert names.index("Evaluation") < names.index(
+            "Evaluator: ContainsEvaluator"
+        )
+        assert names.index("Evaluator: ContainsEvaluator") < names.index(
             "Evaluation output"
         )
@@ -499,19 +624,33 @@ async def test_span_ordering(self) -> None:
     async def test_multiple_items_and_evaluators(self) -> None:
         """Two items x two evaluators produces the expected span counts."""
         evaluators = [
-            make_evaluator(name="Acc", evaluator_id="acc", score=0.9),
-            make_evaluator(name="Rel", evaluator_id="rel", score=0.8),
+            make_evaluator(
+                name="ExactMatchEvaluator",
+                evaluator_id="exact-match-evaluator",
+                score=0.9,
+            ),
+            make_evaluator(
+                name="JsonSimilarityEvaluator",
+                evaluator_id="json-similarity-evaluator",
+                score=0.8,
+            ),
         ]
         items = [
             make_eval_item(
-                item_id="i1",
-                name="E1",
-                evaluation_criterias={"acc": {}, "rel": {}},
+                item_id="api-response-format",
+                name="API response format validation",
+                evaluation_criterias={
+                    "exact-match-evaluator": {},
+                    "json-similarity-evaluator": {},
+                },
             ),
             make_eval_item(
-                item_id="i2",
-                name="E2",
-                evaluation_criterias={"acc": {}, "rel": {}},
+                item_id="error-handling-graceful",
+                name="Graceful error handling check",
+                evaluation_criterias={
+                    "exact-match-evaluator": {},
+                    "json-similarity-evaluator": {},
+                },
             ),
         ]
         tracer, _ = await run_evaluation(items, evaluators)

From 882e4bbb954f1e4f0c83b65e5c4cd877124c472a Mon Sep 17 00:00:00 2001
From: Scott Florentino
Date: Tue, 14 Apr 2026 10:32:00 -0700
Subject: [PATCH 5/7] test: use MockEvaluator naming for mock evaluators
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are mocks, not real evaluator implementations — name them
accordingly. Tests that need one evaluator use the default MockEvaluator;
tests that need multiple use MockEvaluatorA/B/C.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .../tests/cli/eval/test_eval_runtime_spans.py | 236 +++++------------
 1 file changed, 66 insertions(+), 170 deletions(-)

diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
index bc74adb0e..29c805389 100644
--- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
+++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py
@@ -101,8 +101,8 @@ def make_mock_execution_output(


 def make_evaluator(
-    name: str = "ExactMatchEvaluator",
-    evaluator_id: str = "exact-match-evaluator",
+    name: str = "MockEvaluator",
+    evaluator_id: str = "mock-evaluator",
     score: float = 0.95,
     details: Any = None,
 ) -> MagicMock:
@@ -207,14 +207,8 @@ class TestEvalSetRunSpan:

     @pytest.mark.asyncio
     async def test_span_created_with_correct_name_and_type(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=0.9,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.9)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         spans = tracer.get_spans_by_type("eval_set_run")
@@ -219,25 +219,29 @@ async def test_span_created_with_correct_name_and_type(self) -> None:
     @pytest.mark.asyncio
     async def test_aggregate_scores_from_multiple_items(self) -> None:
         """Scores are averaged across all eval items and written to the span."""
-        evaluator = make_evaluator(
-            name="JsonSimilarityEvaluator",
-            evaluator_id="json-similarity-evaluator",
-            score=0.8,
-        )
+        evaluator = make_evaluator(score=0.8)
         items = [
             make_eval_item(
                 item_id="calculator-addition",
                 name="Calculator addition test",
-                evaluation_criterias={"json-similarity-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
             make_eval_item(
                 item_id="calculator-subtraction",
                 name="Calculator subtraction test",
-                evaluation_criterias={"json-similarity-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
         output = json.loads(span.attributes["output"])
-        assert output["scores"]["JsonSimilarityEvaluator"] == 80.0  # 0.8 -> 80.0
+        assert output["scores"]["MockEvaluator"] == 80.0  # 0.8 -> 80.0

     @pytest.mark.asyncio
     async def test_metadata_attributes(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=1.0,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=1.0)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
@@ -248,19 +252,13 @@ async def test_metadata_attributes(self) -> None:

     @pytest.mark.asyncio
     async def test_eval_set_run_id_included_when_provided(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=0.9,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.9)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation(
             [item], [evaluator], eval_set_run_id="run-2024-04-14-001"
         )
@@ -285,14 +263,8 @@ async def test_eval_set_run_id_included_when_provided(self) -> None:
     @pytest.mark.asyncio
     async def test_eval_set_run_id_excluded_when_not_provided(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=0.9,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.9)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("eval_set_run")[0]
@@ -304,21 +276,17 @@ class TestEvaluationSpan:

     @pytest.mark.asyncio
     async def test_one_span_per_eval_item(self) -> None:
-        evaluator = make_evaluator(
-            name="ContainsEvaluator",
-            evaluator_id="contains-evaluator",
-            score=0.9,
-        )
+        evaluator = make_evaluator(score=0.9)
         items = [
             make_eval_item(
                 item_id="greeting-english",
                 name="English greeting test",
-                evaluation_criterias={"contains-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
             make_eval_item(
                 item_id="greeting-spanish",
                 name="Spanish greeting test",
-                evaluation_criterias={"contains-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])
@@ -328,15 +296,11 @@ async def test_one_span_per_eval_item(self) -> None:

     @pytest.mark.asyncio
     async def test_span_has_eval_item_attributes(self) -> None:
-        evaluator = make_evaluator(
-            name="ContainsEvaluator",
-            evaluator_id="contains-evaluator",
-            score=0.9,
-        )
+        evaluator = make_evaluator(score=0.9)
         item = make_eval_item(
             item_id="sentiment-positive-review",
             name="Positive review sentiment check",
-            evaluation_criterias={"contains-evaluator": {}},
+            evaluation_criterias={"mock-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])
@@ -348,31 +312,21 @@ async def test_span_has_eval_item_attributes(self) -> None:

     @pytest.mark.asyncio
     async def test_span_configured_with_per_item_scores(self) -> None:
-        evaluator = make_evaluator(
-            name="JsonSimilarityEvaluator",
-            evaluator_id="json-similarity-evaluator",
-            score=0.85,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"json-similarity-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.85)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_type("evaluation")[0]
         output = json.loads(span.attributes["output"])
         assert "scores" in output
-        assert "JsonSimilarityEvaluator" in output["scores"]
+        assert "MockEvaluator" in output["scores"]

     @pytest.mark.asyncio
     async def test_span_has_metadata(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=0.9,
-        )
+        evaluator = make_evaluator(score=0.9)
         item = make_eval_item(
             inputs={"query": "What is the capital of France?"},
-            evaluation_criterias={"exact-match-evaluator": {}},
+            evaluation_criterias={"mock-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])
@@ -386,24 +340,20 @@ class TestEvaluatorSpan:

     @pytest.mark.asyncio
     async def test_span_has_correct_name_and_attributes(self) -> None:
-        evaluator = make_evaluator(
-            name="LLMJudgeTrajectoryEvaluator",
-            evaluator_id="llm-judge-trajectory",
-            score=0.9,
-        )
+        evaluator = make_evaluator(score=0.9)
         item = make_eval_item(
             item_id="multi-step-tool-use",
-            name="Multi-step tool use trajectory",
-            evaluation_criterias={"llm-judge-trajectory": {}},
+            name="Multi-step tool use test case",
+            evaluation_criterias={"mock-evaluator": {}},
         )
         tracer, _ = await run_evaluation([item], [evaluator])

         spans = tracer.get_spans_by_type("evaluator")
         assert len(spans) == 1
         span = spans[0]
-        assert span.name == "Evaluator: LLMJudgeTrajectoryEvaluator"
-        assert span.attributes["evaluator_id"] == "llm-judge-trajectory"
-        assert span.attributes["evaluator_name"] == "LLMJudgeTrajectoryEvaluator"
+        assert span.name == "Evaluator: MockEvaluator"
+        assert span.attributes["evaluator_id"] == "mock-evaluator"
+        assert span.attributes["evaluator_name"] == "MockEvaluator"
         assert span.attributes["eval_item_id"] == "multi-step-tool-use"
         assert span.attributes["uipath.custom_instrumentation"] is True
@@ -411,26 +361,20 @@ async def test_span_has_correct_name_and_attributes(self) -> None:
     async def test_multiple_evaluators_produce_multiple_spans(self) -> None:
         evaluators = [
             make_evaluator(
-                name="ExactMatchEvaluator",
-                evaluator_id="exact-match-evaluator",
-                score=0.9,
+                name="MockEvaluatorA", evaluator_id="mock-evaluator-a", score=0.9
             ),
             make_evaluator(
-                name="JsonSimilarityEvaluator",
-                evaluator_id="json-similarity-evaluator",
-                score=0.8,
+                name="MockEvaluatorB", evaluator_id="mock-evaluator-b", score=0.8
             ),
             make_evaluator(
-                name="LLMJudgeOutputEvaluator",
-                evaluator_id="llm-judge-output-evaluator",
-                score=0.7,
+                name="MockEvaluatorC", evaluator_id="mock-evaluator-c", score=0.7
             ),
         ]
         item = make_eval_item(
             evaluation_criterias={
-                "exact-match-evaluator": {},
-                "json-similarity-evaluator": {},
-                "llm-judge-output-evaluator": {},
+                "mock-evaluator-a": {},
+                "mock-evaluator-b": {},
+                "mock-evaluator-c": {},
             }
         )
         tracer, _ = await run_evaluation([item], evaluators)
@@ -439,28 +383,24 @@ async def test_multiple_evaluators_produce_multiple_spans(self) -> None:
         assert len(spans) == 3
         span_names = {s.name for s in spans}
         assert span_names == {
-            "Evaluator: ExactMatchEvaluator",
-            "Evaluator: JsonSimilarityEvaluator",
-            "Evaluator: LLMJudgeOutputEvaluator",
+            "Evaluator: MockEvaluatorA",
+            "Evaluator: MockEvaluatorB",
+            "Evaluator: MockEvaluatorC",
         }

     @pytest.mark.asyncio
     async def test_multiple_items_each_get_evaluator_spans(self) -> None:
-        evaluator = make_evaluator(
-            name="ContainsEvaluator",
-            evaluator_id="contains-evaluator",
-            score=0.9,
-        )
+        evaluator = make_evaluator(score=0.9)
         items = [
             make_eval_item(
                 item_id="invoice-extraction",
                 name="Invoice data extraction",
-                evaluation_criterias={"contains-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
             make_eval_item(
                 item_id="receipt-extraction",
                 name="Receipt data extraction",
-                evaluation_criterias={"contains-evaluator": {}},
+                evaluation_criterias={"mock-evaluator": {}},
             ),
         ]
         tracer, _ = await run_evaluation(items, [evaluator])
@@ -476,14 +416,8 @@ class TestEvaluationOutputSpan:

     @pytest.mark.asyncio
     async def test_span_created_with_correct_attributes(self) -> None:
-        evaluator = make_evaluator(
-            name="ToolCallArgsEvaluator",
-            evaluator_id="tool-call-args-evaluator",
-            score=0.75,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"tool-call-args-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.75)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         output_spans = tracer.get_spans_by_attr("span.type", "evalOutput")
@@ -491,20 +425,14 @@ async def test_span_created_with_correct_attributes(self) -> None:
         span = output_spans[0]
         assert span.name == "Evaluation output"
         assert span.attributes["value"] == 0.75
-        assert span.attributes["evaluatorId"] == "tool-call-args-evaluator"
+        assert span.attributes["evaluatorId"] == "mock-evaluator"
         assert span.attributes["openinference.span.kind"] == "CHAIN"
         assert span.attributes["uipath.custom_instrumentation"] is True

     @pytest.mark.asyncio
     async def test_output_json_has_normalized_score_and_type(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=0.85,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.85)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
@@ -514,23 +442,16 @@ async def test_output_json_has_normalized_score_and_type(self) -> None:
     @pytest.mark.asyncio
     async def test_justification_from_pydantic_details(self) -> None:
-        class SemanticSimilarityDetails(BaseModel):
+        class MockEvaluatorDetails(BaseModel):
             justification: str
             similarity_score: float = 0.0

-        details = SemanticSimilarityDetails(
+        details = MockEvaluatorDetails(
             justification="Agent output is semantically equivalent to expected output",
             similarity_score=0.92,
         )
-        evaluator = make_evaluator(
-            name="LLMJudgeOutputEvaluator",
-            evaluator_id="llm-judge-output-evaluator",
-            score=0.92,
-            details=details,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"llm-judge-output-evaluator": {}}
-        )
+        evaluator = make_evaluator(score=0.92, details=details)
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
@@ -542,14 +463,10 @@ class SemanticSimilarityDetails(BaseModel):
     @pytest.mark.asyncio
     async def test_justification_from_string_details(self) -> None:
         evaluator = make_evaluator(
-            name="BinaryClassificationEvaluator",
-            evaluator_id="binary-classification-evaluator",
             score=0.8,
             details="Output correctly classified as positive sentiment",
         )
-        item = make_eval_item(
-            evaluation_criterias={"binary-classification-evaluator": {}}
-        )
+        item = make_eval_item(evaluation_criterias={"mock-evaluator": {}})
         tracer, _ = await run_evaluation([item], [evaluator])

         span = tracer.get_spans_by_attr("span.type", "evalOutput")[0]
@@ -560,15 +477,8 @@ async def test_justification_from_string_details(self) -> None:
     @pytest.mark.asyncio
     async def test_no_justification_when_no_details(self) -> None:
-        evaluator = make_evaluator(
-            name="ExactMatchEvaluator",
-            evaluator_id="exact-match-evaluator",
-            score=1.0,
-            details=None,
-        )
-        item = make_eval_item(
-            evaluation_criterias={"exact-match-evaluator": {}}
-        )
+
evaluator = make_evaluator(score=1.0, details=None) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] @@ -581,15 +491,11 @@ class TestSpanHierarchy: @pytest.mark.asyncio async def test_full_span_tree(self) -> None: """One item + one evaluator produces all four span types.""" - evaluator = make_evaluator( - name="ToolCallOrderEvaluator", - evaluator_id="tool-call-order-evaluator", - score=0.95, - ) + evaluator = make_evaluator(score=0.95) item = make_eval_item( item_id="booking-flow-happy-path", name="Booking flow happy path", - evaluation_criterias={"tool-call-order-evaluator": {}}, + evaluation_criterias={"mock-evaluator": {}}, ) tracer, _ = await run_evaluation([item], [evaluator]) @@ -601,22 +507,16 @@ async def test_full_span_tree(self) -> None: @pytest.mark.asyncio async def test_span_ordering(self) -> None: """Spans are created in the correct order: parent before child.""" - evaluator = make_evaluator( - name="ContainsEvaluator", - evaluator_id="contains-evaluator", - score=0.9, - ) - item = make_eval_item( - evaluation_criterias={"contains-evaluator": {}} - ) + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) tracer, _ = await run_evaluation([item], [evaluator]) names = [s.name for s in tracer.captured_spans] assert names.index("Evaluation Set Run") < names.index("Evaluation") assert names.index("Evaluation") < names.index( - "Evaluator: ContainsEvaluator" + "Evaluator: MockEvaluator" ) - assert names.index("Evaluator: ContainsEvaluator") < names.index( + assert names.index("Evaluator: MockEvaluator") < names.index( "Evaluation output" ) @@ -625,14 +525,10 @@ async def test_multiple_items_and_evaluators(self) -> None: """Two items x two evaluators produces the expected span counts.""" evaluators = [ make_evaluator( - name="ExactMatchEvaluator", - evaluator_id="exact-match-evaluator", - score=0.9, + name="MockEvaluatorA", evaluator_id="mock-evaluator-a", score=0.9 ), make_evaluator( - name="JsonSimilarityEvaluator", - evaluator_id="json-similarity-evaluator", - score=0.8, + name="MockEvaluatorB", evaluator_id="mock-evaluator-b", score=0.8 ), ] items = [ @@ -640,16 +536,16 @@ async def test_multiple_items_and_evaluators(self) -> None: item_id="api-response-format", name="API response format validation", evaluation_criterias={ - "exact-match-evaluator": {}, - "json-similarity-evaluator": {}, + "mock-evaluator-a": {}, + "mock-evaluator-b": {}, }, ), make_eval_item( item_id="error-handling-graceful", name="Graceful error handling check", evaluation_criterias={ - "exact-match-evaluator": {}, - "json-similarity-evaluator": {}, + "mock-evaluator-a": {}, + "mock-evaluator-b": {}, }, ), ] From e2521d19a0d713718e03a0b176030cf1ddb9db48 Mon Sep 17 00:00:00 2001 From: Scott Florentino Date: Tue, 14 Apr 2026 13:18:49 -0700 Subject: [PATCH 6/7] test: add coverage for error status and input data on evaluation spans - Test that the evaluation span gets StatusCode.ERROR when the agent returns an error, and StatusCode.OK on success - Test that the evaluation span carries the eval item's inputs as serialized JSON in the input attribute Co-Authored-By: Claude Opus 4.6 (1M context) --- .../tests/cli/eval/test_eval_runtime_spans.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py 
index 29c805389..5ec97644e 100644 --- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py +++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py @@ -334,6 +334,53 @@ async def test_span_has_metadata(self) -> None: assert span.attributes["agentName"] == "N/A" assert "agentId" in span.attributes + @pytest.mark.asyncio + async def test_span_has_input_data(self) -> None: + """The evaluation span carries the eval item's inputs as serialized JSON.""" + evaluator = make_evaluator(score=0.9) + item = make_eval_item( + inputs={"query": "Summarize this document", "max_tokens": 100}, + evaluation_criterias={"mock-evaluator": {}}, + ) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert "input" in span.attributes + input_data = json.loads(span.attributes["input"]) + assert input_data["query"] == "Summarize this document" + assert input_data["max_tokens"] == 100 + + @pytest.mark.asyncio + async def test_span_has_error_status_when_agent_fails(self) -> None: + """When the agent returns an error, the evaluation span gets ERROR status.""" + from opentelemetry.trace import StatusCode + + evaluator = make_evaluator(score=0.0) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + error_output = make_mock_execution_output( + error="Agent failed: connection timeout", + ) + tracer, _ = await run_evaluation( + [item], [evaluator], execution_output=error_output + ) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.ERROR + + @pytest.mark.asyncio + async def test_span_has_ok_status_on_success(self) -> None: + """When the agent succeeds, the evaluation span gets OK status.""" + from opentelemetry.trace import StatusCode + + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("evaluation")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.OK + class TestEvaluatorSpan: """Tests for the 'Evaluator: {name}' span — one per evaluator per item.""" From 5b76e34173bf567a6685a0f2ab5dab414ff1196d Mon Sep 17 00:00:00 2001 From: Scott Florentino Date: Tue, 14 Apr 2026 13:29:41 -0700 Subject: [PATCH 7/7] test: tighten span attribute assertions - Assert agentId value (not just existence) on both eval set run and evaluation spans - Assert inputSchema/outputSchema content matches the runtime schema - Assert StatusCode.OK on the eval set run span - Assert evaluatorId and justification in the evaluation output span's output JSON (not just as direct span attributes) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../tests/cli/eval/test_eval_runtime_spans.py | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py index 5ec97644e..82e54006a 100644 --- a/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py +++ b/packages/uipath/tests/cli/eval/test_eval_runtime_spans.py @@ -242,13 +242,32 @@ async def test_aggregate_scores_from_multiple_items(self) -> None: async def test_metadata_attributes(self) -> None: evaluator = make_evaluator(score=1.0) item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) - tracer, _ = await run_evaluation([item], [evaluator]) + tracer, _ = await run_evaluation( + [item], [evaluator], 
execution_id="eval-set-run-exec-123" + ) span = tracer.get_spans_by_type("eval_set_run")[0] + assert span.attributes["agentId"] == "eval-set-run-exec-123" assert span.attributes["agentName"] == "N/A" - assert "agentId" in span.attributes - assert "inputSchema" in span.attributes - assert "outputSchema" in span.attributes + + input_schema = json.loads(span.attributes["inputSchema"]) + assert input_schema["type"] == "object" + assert "x" in input_schema["properties"] + + output_schema = json.loads(span.attributes["outputSchema"]) + assert output_schema["type"] == "object" + + @pytest.mark.asyncio + async def test_span_has_ok_status_on_success(self) -> None: + from opentelemetry.trace import StatusCode + + evaluator = make_evaluator(score=0.9) + item = make_eval_item(evaluation_criterias={"mock-evaluator": {}}) + tracer, _ = await run_evaluation([item], [evaluator]) + + span = tracer.get_spans_by_type("eval_set_run")[0] + assert span._status is not None + assert span._status.status_code == StatusCode.OK @pytest.mark.asyncio async def test_eval_set_run_id_included_when_provided(self) -> None: @@ -325,14 +344,16 @@ async def test_span_configured_with_per_item_scores(self) -> None: async def test_span_has_metadata(self) -> None: evaluator = make_evaluator(score=0.9) item = make_eval_item( + item_id="capital-city-query", inputs={"query": "What is the capital of France?"}, evaluation_criterias={"mock-evaluator": {}}, ) tracer, _ = await run_evaluation([item], [evaluator]) span = tracer.get_spans_by_type("evaluation")[0] + # agentId is set to the eval item's execution_id (== str(eval_item.id)) + assert span.attributes["agentId"] == "capital-city-query" assert span.attributes["agentName"] == "N/A" - assert "agentId" in span.attributes @pytest.mark.asyncio async def test_span_has_input_data(self) -> None: @@ -486,6 +507,7 @@ async def test_output_json_has_normalized_score_and_type(self) -> None: output = json.loads(span.attributes["output"]) assert output["type"] == 1 assert output["score"] == 85.0 # 0.85 normalized to 0-100 + assert output["evaluatorId"] == "mock-evaluator" @pytest.mark.asyncio async def test_justification_from_pydantic_details(self) -> None: @@ -506,6 +528,12 @@ class MockEvaluatorDetails(BaseModel): span.attributes["justification"] == "Agent output is semantically equivalent to expected output" ) + # justification also appears in the output JSON + output = json.loads(span.attributes["output"]) + assert ( + output["justification"] + == "Agent output is semantically equivalent to expected output" + ) @pytest.mark.asyncio async def test_justification_from_string_details(self) -> None: @@ -530,6 +558,9 @@ async def test_no_justification_when_no_details(self) -> None: span = tracer.get_spans_by_attr("span.type", "evalOutput")[0] assert "justification" not in span.attributes + # justification also absent from output JSON (exclude_none=True) + output = json.loads(span.attributes["output"]) + assert "justification" not in output class TestSpanHierarchy: