From b85ce989169c02ee9f428900970685cfe6352bcc Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Fri, 5 Jun 2026 08:16:27 -0700 Subject: [PATCH 1/7] Enable ToolCallAccuracy/Input/Success on restricted-tool conversations These three evaluators grade the agent's tool selection, input arguments, and call status -- none consume the (redacted) tool output body -- so the previous unconditional rejection of conversations containing built-in restricted tools (bing_grounding, bing_custom_search, azure_ai_search, azure_fabric, sharepoint_grounding) is now lifted. Implementation: - Set check_for_unsupported_tools=False on each evaluator's input validator in _tool_call_accuracy.py, _tool_input_accuracy.py, _tool_call_success.py. - The underlying ToolDefinitionsValidator / ToolCallsValidator classes are unchanged; GroundednessEvaluator and ToolOutputUtilizationEvaluator still reject restricted tools because they require the tool output body. Tests: - New test_unsupported_tools_validation.py (26 tests) covers: * 15 parametrized cases: each of the 3 evaluators x 5 restricted tools, asserting validate_eval_input returns True for response= payloads. * 1 mixed-tools case. * 10 regression cases asserting the underlying validators still reject restricted tools when check_for_unsupported_tools=True. Versioning: - Bumped _version.py 1.17.0 -> 1.17.1. - Added 1.17.1 (Unreleased) section to CHANGELOG.md under Features Added. 
--- .../azure-ai-evaluation/CHANGELOG.md | 6 + .../_tool_call_accuracy.py | 2 +- .../_tool_call_success/_tool_call_success.py | 2 +- .../_tool_input_accuracy.py | 2 +- .../azure/ai/evaluation/_version.py | 2 +- .../test_unsupported_tools_validation.py | 184 ++++++++++++++++++ 6 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..da63da61684b 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,11 @@ # Release History +## 1.17.1 (Unreleased) + +### Features Added + +- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 44e0876bad68..60a818b83274 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py index bae6c9895046..9dc1249dff60 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py @@ -3,4 +3,4 @@ # --------------------------------------------------------- # represents upcoming version -VERSION = "1.17.0" +VERSION = "1.17.1" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py new file mode 100644 index 000000000000..7c87bc34ee06 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -0,0 +1,184 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +""" +Regression tests for the change that lets the three tool evaluators +(ToolCallAccuracy, _ToolInputAccuracy, _ToolCallSuccess) accept +conversations containing restricted built-in tools. + +These evaluators previously rejected any conversation containing tools in +``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, +``azure_ai_search``). 
Because none of the three grades require the +(redacted) tool output body, the rejection has been lifted by setting +``check_for_unsupported_tools=False`` on each evaluator's input validator. + +The tests below exercise the validator directly so they do not need the +prompty flow or a real model deployment. They also confirm that the +underlying validator class still rejects restricted tools when +``check_for_unsupported_tools=True``, so the behavior change is limited +to the evaluator wiring. +""" + +import pytest + +from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator +from azure.ai.evaluation._evaluators._common._validators import ( + ToolCallsValidator, + ToolDefinitionsValidator, +) +from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException + + +RESTRICTED_TOOL_NAMES = [ + "bing_grounding", + "bing_custom_search", + "azure_ai_search", + "azure_fabric", + "sharepoint_grounding", +] + + +def _restricted_response(tool_name: str): + return [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": tool_name, + "arguments": {"query": "anything"}, + } + ], + } + ] + + +def _restricted_tool_definition(tool_name: str): + return { + "name": tool_name, + "description": f"Built-in {tool_name} tool.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + } + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestRestrictedToolValidationLifted: + """Validator should no longer reject restricted tools for these three evaluators.""" + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = 
ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + # Should not raise EvaluationException; flag flip made this path legal. + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): + """Conversation containing both a function call and a restricted tool call validates cleanly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Find stock price and weather.", + "response": [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_func", + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": "bing_grounding", + "arguments": {"query": "MSFT stock price"}, + }, + ], + } + ], + "tool_definitions": [ + { + "name": "get_weather", + "type": "function", + "description": 
"Weather lookup.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + _restricted_tool_definition("bing_grounding"), + ], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + +@pytest.mark.unittest +class TestUnderlyingValidatorUnchanged: + """The validator class itself still rejects restricted tools when the flag is on. + + Ensures the behavior change is limited to per-evaluator wiring; the validator + keeps its option to enforce the restricted-tool block for other consumers + (e.g. GroundednessEvaluator). + """ + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolCallsValidator( + error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + check_for_unsupported_tools=True, + ) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolDefinitionsValidator( + error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, + requires_query=False, + check_for_unsupported_tools=True, + ) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + From af7b07ab3c2198a2610618fc3798407ecd246bae Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Fri, 5 Jun 2026 10:52:43 -0700 Subject: [PATCH 2/7] ToolCallSuccess: deterministic 
fail on runtime-reported failure status When any tool_call or tool_result in the response carries a known-failure status (failed, error, incomplete, cancelled/canceled), short-circuit _do_eval to return a deterministic fail result (score=0, _passed=False, _result='fail') without invoking the LLM. The evaluator's scoring contract is explicitly binary -- 'FALSE: at least one tool call failed' -- and the prompty rubric does not consider the status field, so it would otherwise grade only the (typically empty) result body and frequently mis-score failed conversations as passes. Reuses the existing pre-flow short-circuit pattern (_is_intermediate_response / _return_not_applicable_result) for consistency. Status is only populated by upstream converters that preserve it; absent status, behavior is unchanged. Bumps to 1.17.1, adds CHANGELOG entry, and adds 19 focused unit tests. --- .../azure-ai-evaluation/CHANGELOG.md | 1 + .../_tool_call_success/_tool_call_success.py | 68 +++++++ .../test_tool_call_success_evaluator.py | 172 ++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index da63da61684b..698d05033a9a 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features Added - Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). 
These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. ## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 60a818b83274..f3ec39e3843e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Short-circuit: if the agent runtime already reported a failed tool + # execution via a known-failure ``status`` (e.g. "failed", "error", + # "incomplete"), deterministically return ``fail`` without calling the + # LLM. 
The evaluator's scoring contract is binary -- "FALSE: at least + # one tool call failed" -- and the prompty rubric doesn't see the + # ``status`` field, so it would otherwise grade only the (typically + # empty) result body and frequently mis-score the conversation as a + # pass. ``status`` is only populated by upstream converters that + # preserve it; absent ``status``, behavior is unchanged. + if isinstance(eval_input.get("response"), list): + failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) + if failed_statuses: + reason = ( + "Detected failed tool execution(s) with status " + + ", ".join(sorted(set(failed_statuses))) + + ". Marked as fail without LLM grading." + ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": { + "short_circuit": "tool_status", + "failed_statuses": sorted(set(failed_statuses)), + }, + } + if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions +_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) + + +def _collect_failed_tool_statuses(agent_response_msgs): + """Return the list of failure statuses seen on any ``tool_call`` or + ``tool_result`` content block in ``agent_response_msgs``. + + Inputs are intentionally tolerated -- malformed messages / non-dict + content blocks are skipped rather than raised on, so this helper is safe + to call on freshly-deserialized agent traces. 
+ + :param agent_response_msgs: The agent's response message list (already + validated to be a list by the caller). + :type agent_response_msgs: list + :return: A list (with duplicates preserved) of lowercased failure status + strings. Empty list means no failure signal was found. + :rtype: list[str] + """ + found = [] + if not isinstance(agent_response_msgs, list): + return found + for msg in agent_response_msgs: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") in ("tool_call", "tool_result"): + status = block.get("status") + if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: + found.append(status.lower()) + return found + + def _get_tool_calls_results(agent_response_msgs): """Extract formatted agent tool calls and results from response.""" agent_response_text = [] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py new file mode 100644 index 000000000000..a4bfeb2e3e1a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -0,0 +1,172 @@ +from unittest.mock import MagicMock + +import pytest + +from azure.ai.evaluation import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( + _collect_failed_tool_statuses, +) + + +# Default prompty mock that always grades as PASS. Tests that exercise the +# deterministic short-circuit path rely on this mock NOT being called. 
+async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument + return { + "llm_output": { + "reason": "All tool calls completed successfully.", + "score": 1, + "properties": {}, + } + } + + +def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None): + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments or {"location": "NYC"}, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id="call_1", result="72F sunny", status=None): + block = {"type": "tool_result", "tool_result": result} + if status is not None: + block["status"] = status + return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]} + + +# --------------------------------------------------------------------------- +# _collect_failed_tool_statuses +# --------------------------------------------------------------------------- + + +class TestCollectFailedToolStatuses: + def test_no_status_returns_empty(self): + msgs = [_assistant_tool_call(), _tool_result()] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_completed_status_returns_empty(self): + msgs = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + assert _collect_failed_tool_statuses(msgs) == [] + + @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"]) + def test_known_failure_status_on_tool_call_is_collected(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status] + + @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"]) + def test_failure_status_is_case_insensitive(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status.lower()] + + def test_failure_status_on_tool_result_is_collected(self): + msgs = 
[_assistant_tool_call(), _tool_result(status="failed")] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + def test_unknown_status_string_is_ignored(self): + msgs = [_assistant_tool_call(status="something_weird")] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_non_string_status_is_ignored(self): + msgs = [_assistant_tool_call(status=500)] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_malformed_inputs_are_tolerated(self): + # Non-list input + assert _collect_failed_tool_statuses(None) == [] + assert _collect_failed_tool_statuses("not a list") == [] + # List with non-dict items + dicts with non-list content + msgs = [ + "string entry", + 42, + {"role": "assistant"}, # no content + {"role": "assistant", "content": "not a list"}, + {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]}, + ] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + +# --------------------------------------------------------------------------- +# _do_eval short-circuit +# --------------------------------------------------------------------------- + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestToolCallSuccessShortCircuit: + def test_short_circuits_on_failed_tool_call_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(status="failed"), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success"] == 0.0 + assert result["tool_call_success_score"] == 0.0 + assert result["tool_call_success_passed"] is False + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_status"] == "completed" + assert "failed" in result["tool_call_success_reason"] + props = result["tool_call_success_properties"] + assert 
props["short_circuit"] == "tool_status" + assert props["failed_statuses"] == ["failed"] + + def test_short_circuits_on_failed_tool_result_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result(status="error")] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_properties"]["failed_statuses"] == ["error"] + + def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(tool_call_id="a", status="failed"), + _tool_result(tool_call_id="a", status="failed"), + _assistant_tool_call(tool_call_id="b", status="error"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + # Reason joins deduped, sorted statuses + assert "error, failed" in result["tool_call_success_reason"] + assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"] + + def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() # Goes to LLM + assert result["tool_call_success_passed"] is True + + def test_no_short_circuit_when_status_absent(self, mock_model_config): + """Back-compat: traces produced by converters that do not preserve + ``status`` continue to be graded by the LLM as before.""" + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + 
evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() + assert result["tool_call_success_passed"] is True From 89f41295a064533914561c0ecc1f1b7d6e11f984 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 8 Jun 2026 10:41:15 -0700 Subject: [PATCH 3/7] Export _ToolInputAccuracyEvaluator from azure.ai.evaluation top-level namespace Brings _ToolInputAccuracyEvaluator in line with its three sibling tool evaluators (ToolCallAccuracyEvaluator, _ToolCallSuccessEvaluator, _ToolOutputUtilizationEvaluator) which are already exposed on the top-level package. Consumers (notably the Foundry evaluations service catalog) can now import it from azure.ai.evaluation directly instead of reaching into the private _evaluators._tool_input_accuracy submodule. --- sdk/evaluation/azure-ai-evaluation/CHANGELOG.md | 1 + .../azure-ai-evaluation/azure/ai/evaluation/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 698d05033a9a..a14ebd1a5eb2 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features Added - Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. 
`GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. - `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. 
## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..f32ade1e90e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -34,6 +34,7 @@ from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -135,6 +136,7 @@ def lazy_import(): "ToolCallAccuracyEvaluator", "_ToolOutputUtilizationEvaluator", "_ToolCallSuccessEvaluator", + "_ToolInputAccuracyEvaluator", "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", From 24198a3944c2290b1bfdbae4f5214a7601664304 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 8 Jun 2026 12:02:21 -0700 Subject: [PATCH 4/7] Black: drop trailing blank line in test_unsupported_tools_validation.py --- .../tests/unittests/test_unsupported_tools_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py index 7c87bc34ee06..74bc352448a3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -181,4 +181,3 @@ def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_n with pytest.raises(EvaluationException) as exc_info: validator.validate_eval_input(eval_input) assert "currently not supported" in str(exc_info.value) - From 
dabe4e9a8a9c5c3c631854964dcee9017015e9db Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Tue, 9 Jun 2026 15:17:37 -0700 Subject: [PATCH 5/7] ToolCallSuccess: pivot from Python short-circuit to [STATUS] pass-through Mirrors azureml-assets PR #5126 design pivot. Source (_tool_call_success.py): - Reverted check_for_unsupported_tools True->False flip; TCS again rejects restricted-tool conversations (its rubric depends on the tool output body). - Removed _FAILED_TOOL_STATUSES + _collect_failed_tool_statuses helper and the pre-LLM deterministic-fail short-circuit. Status interpretation is now an LLM-only concern. - Added _format_status_suffix helper and wired it into _get_tool_calls_results so every [TOOL_CALL] / [TOOL_RESULT] line carries a [STATUS] suffix when the source content block has a status field. Back-compat preserved: empty/None/non-string status emits ''; output is byte-identical to the prior format when status is absent. Prompty (tool_call_success.prompty): - Added a [STATUS] failed|error|incomplete|cancelled|canceled bullet to ERROR-CASES marking it an authoritative failure signal that overrides bland payload appearance. - Added an explicit clause that [STATUS] is optional and that [STATUS] completed does not by itself imply success (payload rules still apply). - Added 3 illustrative examples: bland-payload+failed-status, completed-status+error-payload, and a parallel-call topology with one failed. Tests: - Replaced test_tool_call_success_evaluator.py with status-passthrough coverage (12 tests on _format_status_suffix + _get_tool_calls_results topologies). - Flipped test_tool_call_success_accepts_restricted_tool to test_tool_call_success_still_rejects_restricted_tool in test_unsupported_tools_validation.py and updated module docstring scope to TCA/TIA only. Changelog: rewrote 1.17.1 entry to reflect TCA/TIA enablement + TCS [STATUS] pass-through (validator flip deferred to a later phase). All 38 impacted unit tests pass. 
--- .../azure-ai-evaluation/CHANGELOG.md | 4 +- .../_tool_call_success/_tool_call_success.py | 95 ++---- .../tool_call_success.prompty | 57 ++++ .../test_tool_call_success_evaluator.py | 296 ++++++++++-------- .../test_unsupported_tools_validation.py | 31 +- 5 files changed, 270 insertions(+), 213 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index a14ebd1a5eb2..2783023835c7 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -4,9 +4,9 @@ ### Features Added -- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- Enabled `ToolCallAccuracyEvaluator` and `_ToolInputAccuracyEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). Both evaluators grade the agent's tool selection and input arguments — neither requires the (often redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. 
`_ToolCallSuccessEvaluator`, `GroundednessEvaluator`, and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because their rubrics consume the tool output body. - Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. -- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. +- `_ToolCallSuccessEvaluator` now forwards the per-call runtime `status` (e.g. `failed`, `error`, `incomplete`, `cancelled`, `canceled`, `completed`) to the LLM rubric as a `[STATUS] <status>` annotation appended to each emitted `[TOOL_CALL]` / `[TOOL_RESULT]` line. The prompty rubric is updated to treat the failure annotations as a strong, authoritative failure signal that overrides a bland or otherwise-passing-looking payload, while still falling back to payload-only judgment when `status` is absent. Output is byte-identical to the previous wire format when no `status` field is populated, so existing recorded test fixtures and customers whose converters do not emit `status` are unaffected. 
## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index f3ec39e3843e..a66af1b48141 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=False, + check_for_unsupported_tools=True, ) super().__init__( @@ -179,37 +179,6 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) - # Short-circuit: if the agent runtime already reported a failed tool - # execution via a known-failure ``status`` (e.g. "failed", "error", - # "incomplete"), deterministically return ``fail`` without calling the - # LLM. The evaluator's scoring contract is binary -- "FALSE: at least - # one tool call failed" -- and the prompty rubric doesn't see the - # ``status`` field, so it would otherwise grade only the (typically - # empty) result body and frequently mis-score the conversation as a - # pass. ``status`` is only populated by upstream converters that - # preserve it; absent ``status``, behavior is unchanged. - if isinstance(eval_input.get("response"), list): - failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) - if failed_statuses: - reason = ( - "Detected failed tool execution(s) with status " - + ", ".join(sorted(set(failed_statuses))) - + ". Marked as fail without LLM grading." 
- ) - return { - self._result_key: 0.0, - f"{self._result_key}_score": 0.0, - f"{self._result_key}_passed": False, - f"{self._result_key}_result": "fail", - f"{self._result_key}_reason": reason, - f"{self._result_key}_status": "completed", - f"{self._result_key}_threshold": self._threshold, - f"{self._result_key}_properties": { - "short_circuit": "tool_status", - "failed_statuses": sorted(set(failed_statuses)), - }, - } - if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -302,45 +271,35 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions -_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) - +def _format_status_suffix(status): + """Build the trailing ``[STATUS] `` annotation for a content block. -def _collect_failed_tool_statuses(agent_response_msgs): - """Return the list of failure statuses seen on any ``tool_call`` or - ``tool_result`` content block in ``agent_response_msgs``. + Returns the empty string when ``status`` is absent or not a non-empty + string, so callers can unconditionally concatenate the return value + without affecting back-compat output. - Inputs are intentionally tolerated -- malformed messages / non-dict - content blocks are skipped rather than raised on, so this helper is safe - to call on freshly-deserialized agent traces. - - :param agent_response_msgs: The agent's response message list (already - validated to be a list by the caller). - :type agent_response_msgs: list - :return: A list (with duplicates preserved) of lowercased failure status - strings. Empty list means no failure signal was found. - :rtype: list[str] + :param status: The raw ``status`` field from a ``tool_call`` or + ``tool_result`` content block. 
+ :type status: Any + :return: ``" [STATUS] "`` when ``status`` is a non-empty string, + otherwise ``""``. + :rtype: str """ - found = [] - if not isinstance(agent_response_msgs, list): - return found - for msg in agent_response_msgs: - if not isinstance(msg, dict): - continue - content = msg.get("content") - if not isinstance(content, list): - continue - for block in content: - if not isinstance(block, dict): - continue - if block.get("type") in ("tool_call", "tool_result"): - status = block.get("status") - if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: - found.append(status.lower()) - return found + if isinstance(status, str) and status: + return f" [STATUS] {status}" + return "" def _get_tool_calls_results(agent_response_msgs): - """Extract formatted agent tool calls and results from response.""" + """Extract formatted agent tool calls and results from response. + + Each emitted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line is suffixed with + ``[STATUS] `` when the source content block carries a ``status`` + field. The prompty rubric uses this annotation as a strong failure signal + (see ``tool_call_success.prompty``). When ``status`` is absent the suffix + is omitted and the rubric falls back to payload-only judgment, so the + formatted output is byte-identical to the pre-pass-through wire format. 
+ """ agent_response_text = [] tool_results = {} @@ -351,7 +310,8 @@ def _get_tool_calls_results(agent_response_msgs): for content in msg.get("content", []): if content.get("type") == "tool_result": result = content.get("tool_result") - tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}" + status_suffix = _format_status_suffix(content.get("status")) + tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}{status_suffix}" # Second pass: parse assistant messages and tool calls for msg in agent_response_msgs: @@ -370,7 +330,8 @@ def _get_tool_calls_results(agent_response_msgs): func_name = content.get("name", "") args = content.get("arguments", {}) args_str = ", ".join(f'{k}="{v}"' for k, v in args.items()) - call_line = f"[TOOL_CALL] {func_name}({args_str})" + status_suffix = _format_status_suffix(content.get("status")) + call_line = f"[TOOL_CALL] {func_name}({args_str}){status_suffix}" agent_response_text.append(call_line) if tool_call_id in tool_results: agent_response_text.append(tool_results[tool_call_id]) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index d7df87a2004d..1c9b5264d270 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -53,6 +53,7 @@ B. Examine tool result and definition for the tool being called to check whether 1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it: ERROR-CASES: =========== + - The tool call or tool result line is annotated with **`[STATUS] failed`**, **`[STATUS] error`**, **`[STATUS] incomplete`**, **`[STATUS] cancelled`**, or **`[STATUS] canceled`**. 
This annotation is emitted by the runtime when it caught a technical failure (e.g. timeout, exception, host cancellation). It is a strong, authoritative failure signal and overrides any contradictory appearance of the result payload. The annotation is case-insensitive. - The tool call resulted in an error or exception - The tool call failed to run or failed to return - The tool call returned a result that indicates an error or failure @@ -60,6 +61,7 @@ B. Examine tool result and definition for the tool being called to check whether - The tool timed-out or returned a result that indicate a time-out - The tool result does not make sense, from technical perspective, not business perspective, given the definition of that tool, if the definition is present 2. If none of the error cases apply to the tool result , it is considered **succeeded** even if the tool result itself indicates a business mistake + 3. The `[STATUS]` annotation is **optional**. When it is absent on a tool call, judge that call by the payload-based rules above (back-compat with runtimes that do not emit a status field). When it is present and indicates success (e.g. `[STATUS] completed`), it does not by itself make a call succeed -- still apply the payload-based rules, because a runtime can report `completed` while the tool itself returned an error payload. C. If one or more tool result are **failed** , then you the **evaluation process** has **failed**, otherwise , the **evaluation process** has **succeeded** D. 
You are required to return your **output** in the following format: { @@ -335,6 +337,61 @@ EXPECTED OUTPUT } +### Example - Failed (status annotation overrides bland payload) + +[TOOL_CALLS] +[TOOL_CALL] send_email(to:"alice@example.com" , body:"hi") [STATUS] failed +[TOOL_RESULT] {} [STATUS] failed + +EXPECTED OUTPUT +{ + "reason": "send_email is annotated with [STATUS] failed on both the call and the result, which is an authoritative failure signal from the runtime even though the result body {} is otherwise inconclusive", + "properties": { + "failed_tools": "send_email" + }, + "score": 0, + "status": "completed" +} + + +### Example - Failed (status completed but payload still indicates an error) + +[TOOL_CALLS] +[TOOL_CALL] get_current_user_info() [STATUS] completed +[TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"} [STATUS] completed + +EXPECTED OUTPUT +{ + "reason": "The runtime reported [STATUS] completed but the result payload still indicates failure with empty fields and an explicit error message. 
Payload-based rules still apply when [STATUS] is completed -- this call is failed", + "properties": { + "failed_tools": "get_current_user_info" + }, + "score": 0, + "status": "completed" +} + + +### Example - Failed (parallel calls in one turn, one annotated failed) + +[TOOL_CALLS] +[TOOL_CALL] fetch_weather(city:"Seattle") [STATUS] completed +[TOOL_RESULT] {"temp": 62} [STATUS] completed +[TOOL_CALL] send_email(to:"x@example.com") [STATUS] failed +[TOOL_RESULT] {} [STATUS] failed +[TOOL_CALL] lookup_user(id:"u42") [STATUS] completed +[TOOL_RESULT] {"user_id": "u42"} [STATUS] completed + +EXPECTED OUTPUT +{ + "reason": "send_email is annotated with [STATUS] failed; the other two parallel calls succeeded but a single failed call is sufficient to fail the overall evaluation", + "properties": { + "failed_tools": "send_email" + }, + "score": 0, + "status": "completed" +} + + Now given the **INPUT** you received generate the output # Output diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py index a4bfeb2e3e1a..6d0a558921f3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -1,172 +1,202 @@ -from unittest.mock import MagicMock +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""Tests for ToolCallSuccess runtime status pass-through to the LLM rubric. + +The evaluator's source-side preprocessing emits ``[STATUS] `` annotations +on each formatted ``[TOOL_CALL]`` / ``[TOOL_RESULT]`` line whenever the source +content block carries a ``status`` field. 
The prompty rubric is taught to treat +these annotations as a strong (authoritative) failure signal when the status is +in {failed, error, incomplete, cancelled, canceled}, and to fall back to +payload-only judgment when ``status`` is absent. + +These tests cover the source-side preprocessing only (the ``[STATUS]`` string +emission). End-to-end rubric behavior is covered by the existing behavior +suites that exercise the full evaluator with a mocked LLM. +""" import pytest -from azure.ai.evaluation import _ToolCallSuccessEvaluator from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( - _collect_failed_tool_statuses, + _format_status_suffix, + _get_tool_calls_results, ) -# Default prompty mock that always grades as PASS. Tests that exercise the -# deterministic short-circuit path rely on this mock NOT being called. -async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument - return { - "llm_output": { - "reason": "All tool calls completed successfully.", - "score": 1, - "properties": {}, - } - } +# region helpers -def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None): +def _assistant_tool_call(tool_call_id, name, arguments, status=None): + """Build an assistant message carrying a single tool_call content block.""" block = { "type": "tool_call", "tool_call_id": tool_call_id, "name": name, - "arguments": arguments or {"location": "NYC"}, + "arguments": arguments, } if status is not None: block["status"] = status return {"role": "assistant", "content": [block]} -def _tool_result(tool_call_id="call_1", result="72F sunny", status=None): - block = {"type": "tool_result", "tool_result": result} +def _tool_result(tool_call_id, result, status=None): + """Build a tool message carrying a single tool_result content block.""" + block = { + "type": "tool_result", + "tool_call_id": tool_call_id, + "tool_result": result, + } if status is not None: block["status"] = status - return {"role": 
"tool", "tool_call_id": tool_call_id, "content": [block]} + return { + "role": "tool", + "tool_call_id": tool_call_id, + "content": [block], + } -# --------------------------------------------------------------------------- -# _collect_failed_tool_statuses -# --------------------------------------------------------------------------- +def _assistant_parallel_tool_calls(blocks): + """Build a single assistant message that emits multiple tool_call blocks in one turn. + + ``blocks`` is a list of ``(tool_call_id, name, arguments, status)`` tuples. + This is the modern Responses-API topology for parallel function-call + invocation: multiple ``tool_call`` content blocks under one assistant + message, in contrast to one assistant message per call. + """ + content = [] + for tool_call_id, name, arguments, status in blocks: + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments, + } + if status is not None: + block["status"] = status + content.append(block) + return {"role": "assistant", "content": content} -class TestCollectFailedToolStatuses: - def test_no_status_returns_empty(self): - msgs = [_assistant_tool_call(), _tool_result()] - assert _collect_failed_tool_statuses(msgs) == [] +# endregion - def test_completed_status_returns_empty(self): - msgs = [ - _assistant_tool_call(status="completed"), - _tool_result(status="completed"), - ] - assert _collect_failed_tool_statuses(msgs) == [] - - @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"]) - def test_known_failure_status_on_tool_call_is_collected(self, status): - msgs = [_assistant_tool_call(status=status)] - assert _collect_failed_tool_statuses(msgs) == [status] - - @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"]) - def test_failure_status_is_case_insensitive(self, status): - msgs = [_assistant_tool_call(status=status)] - assert _collect_failed_tool_statuses(msgs) == [status.lower()] - - def 
test_failure_status_on_tool_result_is_collected(self): - msgs = [_assistant_tool_call(), _tool_result(status="failed")] - assert _collect_failed_tool_statuses(msgs) == ["failed"] - - def test_unknown_status_string_is_ignored(self): - msgs = [_assistant_tool_call(status="something_weird")] - assert _collect_failed_tool_statuses(msgs) == [] - - def test_non_string_status_is_ignored(self): - msgs = [_assistant_tool_call(status=500)] - assert _collect_failed_tool_statuses(msgs) == [] - - def test_malformed_inputs_are_tolerated(self): - # Non-list input - assert _collect_failed_tool_statuses(None) == [] - assert _collect_failed_tool_statuses("not a list") == [] - # List with non-dict items + dicts with non-list content - msgs = [ - "string entry", - 42, - {"role": "assistant"}, # no content - {"role": "assistant", "content": "not a list"}, - {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]}, - ] - assert _collect_failed_tool_statuses(msgs) == ["failed"] +@pytest.mark.unittest +class TestFormatStatusSuffix: + """Unit tests for the ``_format_status_suffix`` helper.""" + + def test_known_failure_status_emits_suffix(self): + """A known-failure status string produces a ``[STATUS] `` suffix.""" + assert _format_status_suffix("failed") == " [STATUS] failed" + + def test_completed_status_emits_suffix(self): + """A success status string also emits a suffix (the rubric distinguishes the two).""" + assert _format_status_suffix("completed") == " [STATUS] completed" + + def test_arbitrary_status_string_emits_suffix(self): + """Any non-empty string status emits a suffix; the rubric judges semantics, not Python.""" + assert _format_status_suffix("rate_limited") == " [STATUS] rate_limited" -# --------------------------------------------------------------------------- -# _do_eval short-circuit -# --------------------------------------------------------------------------- + def test_none_status_emits_empty(self): + """Absent status 
(``None``) emits the empty string for back-compat.""" + assert _format_status_suffix(None) == "" + + def test_empty_string_status_emits_empty(self): + """Empty string status emits the empty string (treated same as absent).""" + assert _format_status_suffix("") == "" + + def test_non_string_status_emits_empty(self): + """Non-string statuses (int, dict, list) are ignored rather than raised on.""" + assert _format_status_suffix(42) == "" + assert _format_status_suffix({"x": 1}) == "" + assert _format_status_suffix(["failed"]) == "" -@pytest.mark.usefixtures("mock_model_config") @pytest.mark.unittest -class TestToolCallSuccessShortCircuit: - def test_short_circuits_on_failed_tool_call_status(self, mock_model_config): - evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) - evaluator._flow = MagicMock(side_effect=_flow_pass) - - response = [_assistant_tool_call(status="failed"), _tool_result()] - result = evaluator(response=response) - - evaluator._flow.assert_not_called() - assert result["tool_call_success"] == 0.0 - assert result["tool_call_success_score"] == 0.0 - assert result["tool_call_success_passed"] is False - assert result["tool_call_success_result"] == "fail" - assert result["tool_call_success_status"] == "completed" - assert "failed" in result["tool_call_success_reason"] - props = result["tool_call_success_properties"] - assert props["short_circuit"] == "tool_status" - assert props["failed_statuses"] == ["failed"] - - def test_short_circuits_on_failed_tool_result_status(self, mock_model_config): - evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) - evaluator._flow = MagicMock(side_effect=_flow_pass) - - response = [_assistant_tool_call(), _tool_result(status="error")] - result = evaluator(response=response) - - evaluator._flow.assert_not_called() - assert result["tool_call_success_result"] == "fail" - assert result["tool_call_success_properties"]["failed_statuses"] == ["error"] - - def 
test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config): - evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) - evaluator._flow = MagicMock(side_effect=_flow_pass) - - response = [ - _assistant_tool_call(tool_call_id="a", status="failed"), - _tool_result(tool_call_id="a", status="failed"), - _assistant_tool_call(tool_call_id="b", status="error"), - ] - result = evaluator(response=response) +class TestGetToolCallsResultsStatusPassthrough: + """Integration tests for ``[STATUS]`` annotation emission via ``_get_tool_calls_results``.""" - evaluator._flow.assert_not_called() - # Reason joins deduped, sorted statuses - assert "error, failed" in result["tool_call_success_reason"] - assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"] + def test_status_on_tool_call_is_appended_to_tool_call_line(self): + """When ``status`` is set on a tool_call block, the ``[TOOL_CALL]`` line carries the annotation.""" + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}, status="failed"), + _tool_result("c1", ""), + ] + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed' + # Tool result has no status -> no suffix. 
+ assert lines[1] == "[TOOL_RESULT] " - def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config): - evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) - evaluator._flow = MagicMock(side_effect=_flow_pass) + def test_status_on_tool_result_is_appended_to_tool_result_line(self): + """When ``status`` is set on a tool_result block, the ``[TOOL_RESULT]`` line carries the annotation.""" + msgs = [ + _assistant_tool_call("c1", "send_email", {"to": "x@example.com"}), + _tool_result("c1", "", status="error"), + ] + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] send_email(to="x@example.com")' + assert lines[1] == "[TOOL_RESULT] [STATUS] error" - response = [ - _assistant_tool_call(status="completed"), - _tool_result(status="completed"), + def test_completed_status_is_passed_through_too(self): + """``[STATUS] completed`` is emitted alongside failure statuses; the rubric decides semantics.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F.", status="completed"), ] - result = evaluator(response=response) + lines = _get_tool_calls_results(msgs) + assert lines[0] == '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed' + assert lines[1] == "[TOOL_RESULT] Sunny, 72F. 
[STATUS] completed" - evaluator._flow.assert_called_once() # Goes to LLM - assert result["tool_call_success_passed"] is True + def test_absent_status_produces_no_suffix_back_compat(self): + """When ``status`` is absent on every block, output matches the pre-status-pass-through format exactly.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}), + _tool_result("c1", "Sunny, 72F."), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle")', + "[TOOL_RESULT] Sunny, 72F.", + ] - def test_no_short_circuit_when_status_absent(self, mock_model_config): - """Back-compat: traces produced by converters that do not preserve - ``status`` continue to be graded by the LLM as before.""" - evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) - evaluator._flow = MagicMock(side_effect=_flow_pass) + def test_parallel_tool_calls_in_one_assistant_message_each_get_their_own_status(self): + """Multiple ``tool_call`` blocks in one assistant message each emit their own ``[STATUS]`` annotation. - response = [_assistant_tool_call(), _tool_result()] - result = evaluator(response=response) + This is the modern Responses-API topology and exercises that the + formatter walks into the content list rather than only processing the + first block per message. + """ + msgs = [ + _assistant_parallel_tool_calls([ + ("c1", "fetch_weather", {"city": "Seattle"}, "completed"), + ("c2", "send_email", {"to": "x@example.com"}, "failed"), + ("c3", "lookup_user", {"id": "u42"}, "completed"), + ]), + _tool_result("c1", "Sunny, 72F.", status="completed"), + _tool_result("c2", "", status="failed"), + _tool_result("c3", {"user_id": "u42"}, status="completed"), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', + "[TOOL_RESULT] Sunny, 72F. 
[STATUS] completed", + '[TOOL_CALL] send_email(to="x@example.com") [STATUS] failed', + "[TOOL_RESULT] [STATUS] failed", + '[TOOL_CALL] lookup_user(id="u42") [STATUS] completed', + "[TOOL_RESULT] {'user_id': 'u42'} [STATUS] completed", + ] - evaluator._flow.assert_called_once() - assert result["tool_call_success_passed"] is True + def test_mixed_status_present_and_absent_across_calls(self): + """A response with status on some calls and not others produces a mixed-suffix output.""" + msgs = [ + _assistant_tool_call("c1", "fetch_weather", {"city": "Seattle"}, status="completed"), + _tool_result("c1", "Sunny, 72F."), + _assistant_tool_call("c2", "send_email", {"to": "x@example.com"}), + _tool_result("c2", "", status="failed"), + ] + lines = _get_tool_calls_results(msgs) + assert lines == [ + '[TOOL_CALL] fetch_weather(city="Seattle") [STATUS] completed', + "[TOOL_RESULT] Sunny, 72F.", + '[TOOL_CALL] send_email(to="x@example.com")', + "[TOOL_RESULT] [STATUS] failed", + ] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py index 74bc352448a3..2250e343a3d2 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -2,21 +2,27 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- """ -Regression tests for the change that lets the three tool evaluators -(ToolCallAccuracy, _ToolInputAccuracy, _ToolCallSuccess) accept -conversations containing restricted built-in tools. - -These evaluators previously rejected any conversation containing tools in -``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, -``azure_ai_search``). 
Because none of the three grades require the -(redacted) tool output body, the rejection has been lifted by setting +Regression tests for the change that lets two tool evaluators +(``ToolCallAccuracy`` and ``_ToolInputAccuracy``) accept conversations +containing restricted built-in tools. + +These two evaluators previously rejected any conversation containing tools +in ``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, +``azure_ai_search``). Because neither grade requires the (often redacted) +tool output body, the rejection has been lifted by setting ``check_for_unsupported_tools=False`` on each evaluator's input validator. +``_ToolCallSuccess`` is intentionally **not** part of this enablement: its +rubric still depends on the tool output body to judge success, so it keeps +``check_for_unsupported_tools=True`` and continues to reject restricted +tools. Coverage for that contract lives in this file alongside the TCA/TIA +acceptance tests so the two stay in lockstep. + The tests below exercise the validator directly so they do not need the prompty flow or a real model deployment. They also confirm that the underlying validator class still rejects restricted tools when ``check_for_unsupported_tools=True``, so the behavior change is limited -to the evaluator wiring. +to the per-evaluator wiring. 
""" import pytest @@ -94,13 +100,16 @@ def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, to assert evaluator._validator.validate_eval_input(eval_input) is True @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) - def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name): + def test_tool_call_success_still_rejects_restricted_tool(self, mock_model_config, tool_name): + """TCS keeps the restricted-tool block (its rubric depends on the tool output body).""" evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) eval_input = { "response": _restricted_response(tool_name), "tool_definitions": [_restricted_tool_definition(tool_name)], } - assert evaluator._validator.validate_eval_input(eval_input) is True + with pytest.raises(EvaluationException) as exc_info: + evaluator._validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): """Conversation containing both a function call and a restricted tool call validates cleanly.""" From 0b5acca2a3ab3b0045a5eac8b81776eb011cc1ff Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Tue, 9 Jun 2026 15:36:46 -0700 Subject: [PATCH 6/7] TCS prompty: tighten [STATUS] bullet to match Responses-API enum reality The previous wording listed all five failure values (failed, error, incomplete, cancelled, canceled) as if any runtime emitted them, and claimed the annotation is case-insensitive. Per the Responses-API tool-call status enum (in_progress | completed | incomplete | failed), only 'failed' and 'incomplete' are ever emitted by the platform; the other three are reserved for non-Responses-API runtimes. Case-insensitivity was never enforced by _format_status_suffix (status is forwarded verbatim) and the API contract is lowercase regardless. 
New wording: foregrounds 'failed' and 'incomplete' as the primary values, parenthesizes the other three as non-Responses-API future-proofing, separates the two failure causes (runtime caught a technical failure vs. call interrupted before completion -> incomplete), and drops the case-insensitivity claim. No behavior change in the helper; rubric language only. --- .../_evaluators/_tool_call_success/tool_call_success.prompty | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index 1c9b5264d270..00133f70ef31 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -53,7 +53,7 @@ B. Examine tool result and definition for the tool being called to check whether 1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it: ERROR-CASES: =========== - - The tool call or tool result line is annotated with **`[STATUS] failed`**, **`[STATUS] error`**, **`[STATUS] incomplete`**, **`[STATUS] cancelled`**, or **`[STATUS] canceled`**. This annotation is emitted by the runtime when it caught a technical failure (e.g. timeout, exception, host cancellation). It is a strong, authoritative failure signal and overrides any contradictory appearance of the result payload. The annotation is case-insensitive. + - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`** (or, for non-Responses-API runtimes, with `[STATUS] error`, `[STATUS] cancelled`, or `[STATUS] canceled`). 
These annotations indicate the tool call did not produce a usable result -- either because the runtime caught a technical failure (e.g. an exception in the tool, the API surface explicitly marked the call `failed`) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload. - The tool call resulted in an error or exception - The tool call failed to run or failed to return - The tool call returned a result that indicates an error or failure From 62c8f449b086fe72a15fa90ceb7eb5c1f03e8e5f Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Tue, 9 Jun 2026 15:43:41 -0700 Subject: [PATCH 7/7] TCS prompty: drop speculative [STATUS] vocabulary; keep failed + incomplete only Walked the runtime surface area: Responses API enum is in_progress | completed | incomplete | failed; Threads/v1 Agents API has 'cancelled' on runs but no SDK converter lifts run-status onto individual tool_call blocks; ACA trace converter maps OTel status_code to the Responses-API vocabulary (Ok -> completed, Error -> failed) rather than preserving 'cancelled'/'error' verbatim; tool-server gRPC StatusCodes are server-side only and never reach the eval row. No emitter today produces error | cancelled | canceled on a tool_call block, so listing them as recognized [STATUS] values overstates the spec and adds rubric noise for vocabulary the LLM will never see. The _format_status_suffix helper stays permissive (still accepts any non-empty string for forward-compat); only the rubric wording is narrowed. Keeps 'incomplete' as authoritative failure: it explicitly means the tool call did not produce a usable result (host timeout, parent-response cancellation, max_tokens cut-off mid-call), which matches the binary 'did the tool call succeed' contract. 
'in_progress' is intentionally not addressed: it shouldn't appear in a completed eval row, and if it does the typically-empty payload will get judged correctly by the existing rules -- documented as a follow-up spec question. --- .../_evaluators/_tool_call_success/tool_call_success.prompty | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty index 00133f70ef31..59b7361f4287 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty @@ -53,7 +53,7 @@ B. Examine tool result and definition for the tool being called to check whether 1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it: ERROR-CASES: =========== - - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`** (or, for non-Responses-API runtimes, with `[STATUS] error`, `[STATUS] cancelled`, or `[STATUS] canceled`). These annotations indicate the tool call did not produce a usable result -- either because the runtime caught a technical failure (e.g. an exception in the tool, the API surface explicitly marked the call `failed`) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload. + - The tool call or tool result line is annotated with **`[STATUS] failed`** or **`[STATUS] incomplete`**. 
These annotations indicate the tool call did not produce a usable result -- either because the runtime explicitly marked the call `failed` (e.g. an exception in the tool, or an error response returned by the API surface) or because the call was interrupted before completion (e.g. host timeout, parent-response cancellation surfaced as `incomplete`). They are strong, authoritative failure signals and override any contradictory appearance of the result payload. - The tool call resulted in an error or exception - The tool call failed to run or failed to return - The tool call returned a result that indicates an error or failure