diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..a14ebd1a5eb2 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,13 @@ # Release History +## 1.17.1 (Unreleased) + +### Features Added + +- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `_ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. +- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..f32ade1e90e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -34,6 +34,7 @@ from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -135,6 +136,7 @@ def lazy_import(): "ToolCallAccuracyEvaluator", "_ToolOutputUtilizationEvaluator", "_ToolCallSuccessEvaluator", + "_ToolInputAccuracyEvaluator", "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 44e0876bad68..f3ec39e3843e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( @@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Short-circuit: if the agent runtime already reported a failed tool + # execution via a known-failure ``status`` (e.g. "failed", "error", + # "incomplete"), deterministically return ``fail`` without calling the + # LLM. The evaluator's scoring contract is binary -- "FALSE: at least + # one tool call failed" -- and the prompty rubric doesn't see the + # ``status`` field, so it would otherwise grade only the (typically + # empty) result body and frequently mis-score the conversation as a + # pass. ``status`` is only populated by upstream converters that + # preserve it; absent ``status``, behavior is unchanged. + if isinstance(eval_input.get("response"), list): + failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) + if failed_statuses: + reason = ( + "Detected failed tool execution(s) with status " + + ", ".join(sorted(set(failed_statuses))) + + ". Marked as fail without LLM grading." 
+ ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": { + "short_circuit": "tool_status", + "failed_statuses": sorted(set(failed_statuses)), + }, + } + if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions +_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) + + +def _collect_failed_tool_statuses(agent_response_msgs): + """Return the list of failure statuses seen on any ``tool_call`` or + ``tool_result`` content block in ``agent_response_msgs``. + + Inputs are intentionally tolerated -- malformed messages / non-dict + content blocks are skipped rather than raised on, so this helper is safe + to call on freshly-deserialized agent traces. + + :param agent_response_msgs: The agent's response message list (already + validated to be a list by the caller). + :type agent_response_msgs: list + :return: A list (with duplicates preserved) of lowercased failure status + strings. Empty list means no failure signal was found. 
+ :rtype: list[str] + """ + found = [] + if not isinstance(agent_response_msgs, list): + return found + for msg in agent_response_msgs: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") in ("tool_call", "tool_result"): + status = block.get("status") + if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: + found.append(status.lower()) + return found + + def _get_tool_calls_results(agent_response_msgs): """Extract formatted agent tool calls and results from response.""" agent_response_text = [] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py index bae6c9895046..9dc1249dff60 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py @@ -3,4 +3,4 @@ # --------------------------------------------------------- # represents upcoming version -VERSION = "1.17.0" +VERSION = "1.17.1" diff --git 
a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py new file mode 100644 index 000000000000..a4bfeb2e3e1a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -0,0 +1,172 @@ +from unittest.mock import MagicMock + +import pytest + +from azure.ai.evaluation import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( + _collect_failed_tool_statuses, +) + + +# Default prompty mock that always grades as PASS. Tests that exercise the +# deterministic short-circuit path rely on this mock NOT being called. +async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument + return { + "llm_output": { + "reason": "All tool calls completed successfully.", + "score": 1, + "properties": {}, + } + } + + +def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None): + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments or {"location": "NYC"}, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id="call_1", result="72F sunny", status=None): + block = {"type": "tool_result", "tool_result": result} + if status is not None: + block["status"] = status + return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]} + + +# --------------------------------------------------------------------------- +# _collect_failed_tool_statuses +# --------------------------------------------------------------------------- + + +class TestCollectFailedToolStatuses: + def test_no_status_returns_empty(self): + msgs = [_assistant_tool_call(), _tool_result()] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_completed_status_returns_empty(self): + msgs = [ + 
@pytest.mark.usefixtures("mock_model_config")
@pytest.mark.unittest
class TestToolCallSuccessShortCircuit:
    """Checks the status-based short-circuit of ``_ToolCallSuccessEvaluator``.

    Every test swaps the evaluator's ``_flow`` for a mock that would grade
    PASS, so a ``fail`` result can only come from the short-circuit path.
    """

    @staticmethod
    def _run(model_config, response):
        """Evaluate ``response`` with a mocked flow; return ``(evaluator, result)``."""
        evaluator = _ToolCallSuccessEvaluator(model_config=model_config)
        evaluator._flow = MagicMock(side_effect=_flow_pass)
        return evaluator, evaluator(response=response)

    def test_short_circuits_on_failed_tool_call_status(self, mock_model_config):
        evaluator, outcome = self._run(
            mock_model_config,
            [_assistant_tool_call(status="failed"), _tool_result()],
        )

        evaluator._flow.assert_not_called()
        assert outcome["tool_call_success"] == 0.0
        assert outcome["tool_call_success_score"] == 0.0
        assert outcome["tool_call_success_passed"] is False
        assert outcome["tool_call_success_result"] == "fail"
        assert outcome["tool_call_success_status"] == "completed"
        assert "failed" in outcome["tool_call_success_reason"]
        properties = outcome["tool_call_success_properties"]
        assert properties["short_circuit"] == "tool_status"
        assert properties["failed_statuses"] == ["failed"]

    def test_short_circuits_on_failed_tool_result_status(self, mock_model_config):
        evaluator, outcome = self._run(
            mock_model_config,
            [_assistant_tool_call(), _tool_result(status="error")],
        )

        evaluator._flow.assert_not_called()
        assert outcome["tool_call_success_result"] == "fail"
        assert outcome["tool_call_success_properties"]["failed_statuses"] == ["error"]

    def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config):
        conversation = [
            _assistant_tool_call(tool_call_id="a", status="failed"),
            _tool_result(tool_call_id="a", status="failed"),
            _assistant_tool_call(tool_call_id="b", status="error"),
        ]
        evaluator, outcome = self._run(mock_model_config, conversation)

        evaluator._flow.assert_not_called()
        # The reason lists each distinct status once, in sorted order.
        assert "error, failed" in outcome["tool_call_success_reason"]
        assert outcome["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"]

    def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config):
        conversation = [
            _assistant_tool_call(status="completed"),
            _tool_result(status="completed"),
        ]
        evaluator, outcome = self._run(mock_model_config, conversation)

        # "completed" is not a failure status, so grading is delegated to the LLM flow.
        evaluator._flow.assert_called_once()
        assert outcome["tool_call_success_passed"] is True

    def test_no_short_circuit_when_status_absent(self, mock_model_config):
        """Back-compat: traces from converters that drop ``status`` are still
        graded by the LLM flow, exactly as before."""
        evaluator, outcome = self._run(
            mock_model_config,
            [_assistant_tool_call(), _tool_result()],
        )

        evaluator._flow.assert_called_once()
        assert outcome["tool_call_success_passed"] is True
Because none of the three grades require the +(redacted) tool output body, the rejection has been lifted by setting +``check_for_unsupported_tools=False`` on each evaluator's input validator. + +The tests below exercise the validator directly so they do not need the +prompty flow or a real model deployment. They also confirm that the +underlying validator class still rejects restricted tools when +``check_for_unsupported_tools=True``, so the behavior change is limited +to the evaluator wiring. +""" + +import pytest + +from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator +from azure.ai.evaluation._evaluators._common._validators import ( + ToolCallsValidator, + ToolDefinitionsValidator, +) +from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException + + +RESTRICTED_TOOL_NAMES = [ + "bing_grounding", + "bing_custom_search", + "azure_ai_search", + "azure_fabric", + "sharepoint_grounding", +] + + +def _restricted_response(tool_name: str): + return [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": tool_name, + "arguments": {"query": "anything"}, + } + ], + } + ] + + +def _restricted_tool_definition(tool_name: str): + return { + "name": tool_name, + "description": f"Built-in {tool_name} tool.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + } + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestRestrictedToolValidationLifted: + """Validator should no longer reject restricted tools for these three evaluators.""" + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = 
ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + # Should not raise EvaluationException; flag flip made this path legal. + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): + """Conversation containing both a function call and a restricted tool call validates cleanly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Find stock price and weather.", + "response": [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_func", + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": "bing_grounding", + "arguments": {"query": "MSFT stock price"}, + }, + ], + } + ], + "tool_definitions": [ + { + "name": "get_weather", + "type": "function", + "description": 
@pytest.mark.unittest
class TestUnderlyingValidatorUnchanged:
    """With ``check_for_unsupported_tools=True`` the validators still reject
    restricted tools.

    This pins the behavior change to the per-evaluator wiring: the validator
    classes keep the ability to enforce the restricted-tool block for other
    consumers (e.g. GroundednessEvaluator).
    """

    @staticmethod
    def _assert_rejected(validator, payload):
        """Assert that validating ``payload`` raises the "not supported" error."""
        with pytest.raises(EvaluationException) as raised:
            validator.validate_eval_input(payload)
        assert "currently not supported" in str(raised.value)

    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
    def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name):
        strict_validator = ToolCallsValidator(
            error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
            check_for_unsupported_tools=True,
        )
        payload = {
            "query": "Look it up.",
            "response": _restricted_response(tool_name),
            "tool_definitions": [_restricted_tool_definition(tool_name)],
        }
        self._assert_rejected(strict_validator, payload)

    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
    def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name):
        strict_validator = ToolDefinitionsValidator(
            error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
            requires_query=False,
            check_for_unsupported_tools=True,
        )
        payload = {
            "response": _restricted_response(tool_name),
            "tool_definitions": [_restricted_tool_definition(tool_name)],
        }
        self._assert_rejected(strict_validator, payload)