From b85ce989169c02ee9f428900970685cfe6352bcc Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Fri, 5 Jun 2026 08:16:27 -0700 Subject: [PATCH 1/5] Enable ToolCallAccuracy/Input/Success on restricted-tool conversations These three evaluators grade the agent's tool selection, input arguments, and call status -- none consume the (redacted) tool output body -- so the previous unconditional rejection of conversations containing built-in restricted tools (bing_grounding, bing_custom_search, azure_ai_search, azure_fabric, sharepoint_grounding) is now lifted. Implementation: - Set check_for_unsupported_tools=False on each evaluator's input validator in _tool_call_accuracy.py, _tool_input_accuracy.py, _tool_call_success.py. - The underlying ToolDefinitionsValidator / ToolCallsValidator classes are unchanged; GroundednessEvaluator and ToolOutputUtilizationEvaluator still reject restricted tools because they require the tool output body. Tests: - New test_unsupported_tools_validation.py (26 tests) covers: * 15 parametrized cases: each of the 3 evaluators x 5 restricted tools, asserting validate_eval_input returns True for response= payloads. * 1 mixed-tools case. * 10 regression cases asserting the underlying validators still reject restricted tools when check_for_unsupported_tools=True. Versioning: - Bumped _version.py 1.17.0 -> 1.17.1. - Added 1.17.1 (Unreleased) section to CHANGELOG.md under Features Added. 
--- .../azure-ai-evaluation/CHANGELOG.md | 6 + .../_tool_call_accuracy.py | 2 +- .../_tool_call_success/_tool_call_success.py | 2 +- .../_tool_input_accuracy.py | 2 +- .../azure/ai/evaluation/_version.py | 2 +- .../test_unsupported_tools_validation.py | 184 ++++++++++++++++++ 6 files changed, 194 insertions(+), 4 deletions(-) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..da63da61684b 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,11 @@ # Release History +## 1.17.1 (Unreleased) + +### Features Added + +- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 44e0876bad68..60a818b83274 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- 
a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py index bae6c9895046..9dc1249dff60 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py @@ -3,4 +3,4 @@ # --------------------------------------------------------- # represents upcoming version -VERSION = "1.17.0" +VERSION = "1.17.1" diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py new file mode 100644 index 000000000000..7c87bc34ee06 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -0,0 +1,184 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +""" +Regression tests for the change that lets the three tool evaluators +(ToolCallAccuracy, _ToolInputAccuracy, _ToolCallSuccess) accept +conversations containing restricted built-in tools. + +These evaluators previously rejected any conversation containing tools in +``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, +``azure_ai_search``). 
Because none of the three grades require the +(redacted) tool output body, the rejection has been lifted by setting +``check_for_unsupported_tools=False`` on each evaluator's input validator. + +The tests below exercise the validator directly so they do not need the +prompty flow or a real model deployment. They also confirm that the +underlying validator class still rejects restricted tools when +``check_for_unsupported_tools=True``, so the behavior change is limited +to the evaluator wiring. +""" + +import pytest + +from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator +from azure.ai.evaluation._evaluators._common._validators import ( + ToolCallsValidator, + ToolDefinitionsValidator, +) +from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException + + +RESTRICTED_TOOL_NAMES = [ + "bing_grounding", + "bing_custom_search", + "azure_ai_search", + "azure_fabric", + "sharepoint_grounding", +] + + +def _restricted_response(tool_name: str): + return [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": tool_name, + "arguments": {"query": "anything"}, + } + ], + } + ] + + +def _restricted_tool_definition(tool_name: str): + return { + "name": tool_name, + "description": f"Built-in {tool_name} tool.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + } + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestRestrictedToolValidationLifted: + """Validator should no longer reject restricted tools for these three evaluators.""" + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = 
ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + # Should not raise EvaluationException; flag flip made this path legal. + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): + """Conversation containing both a function call and a restricted tool call validates cleanly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Find stock price and weather.", + "response": [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_func", + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": "bing_grounding", + "arguments": {"query": "MSFT stock price"}, + }, + ], + } + ], + "tool_definitions": [ + { + "name": "get_weather", + "type": "function", + "description": 
"Weather lookup.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + _restricted_tool_definition("bing_grounding"), + ], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + +@pytest.mark.unittest +class TestUnderlyingValidatorUnchanged: + """The validator class itself still rejects restricted tools when the flag is on. + + Ensures the behavior change is limited to per-evaluator wiring; the validator + keeps its option to enforce the restricted-tool block for other consumers + (e.g. GroundednessEvaluator). + """ + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolCallsValidator( + error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + check_for_unsupported_tools=True, + ) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolDefinitionsValidator( + error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, + requires_query=False, + check_for_unsupported_tools=True, + ) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + From af7b07ab3c2198a2610618fc3798407ecd246bae Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Fri, 5 Jun 2026 10:52:43 -0700 Subject: [PATCH 2/5] ToolCallSuccess: deterministic 
fail on runtime-reported failure status When any tool_call or tool_result in the response carries a known-failure status (failed, error, incomplete, cancelled/canceled), short-circuit _do_eval to return a deterministic fail result (score=0, _passed=False, _result='fail') without invoking the LLM. The evaluator's scoring contract is explicitly binary -- 'FALSE: at least one tool call failed' -- and the prompty rubric does not consider the status field, so it would otherwise grade only the (typically empty) result body and frequently mis-score failed conversations as passes. Reuses the existing pre-flow short-circuit pattern (_is_intermediate_response / _return_not_applicable_result) for consistency. Status is only populated by upstream converters that preserve it; absent status, behavior is unchanged. Adds a CHANGELOG entry under the existing 1.17.1 section (the version bump itself landed in PATCH 1/5) and adds 19 focused unit tests. --- .../azure-ai-evaluation/CHANGELOG.md | 1 + .../_tool_call_success/_tool_call_success.py | 68 +++++++ .../test_tool_call_success_evaluator.py | 172 ++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index da63da61684b..698d05033a9a 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features Added - Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). 
These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. ## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 60a818b83274..f3ec39e3843e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Short-circuit: if the agent runtime already reported a failed tool + # execution via a known-failure ``status`` (e.g. "failed", "error", + # "incomplete"), deterministically return ``fail`` without calling the + # LLM. 
The evaluator's scoring contract is binary -- "FALSE: at least + # one tool call failed" -- and the prompty rubric doesn't see the + # ``status`` field, so it would otherwise grade only the (typically + # empty) result body and frequently mis-score the conversation as a + # pass. ``status`` is only populated by upstream converters that + # preserve it; absent ``status``, behavior is unchanged. + if isinstance(eval_input.get("response"), list): + failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) + if failed_statuses: + reason = ( + "Detected failed tool execution(s) with status " + + ", ".join(sorted(set(failed_statuses))) + + ". Marked as fail without LLM grading." + ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": { + "short_circuit": "tool_status", + "failed_statuses": sorted(set(failed_statuses)), + }, + } + if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions +_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) + + +def _collect_failed_tool_statuses(agent_response_msgs): + """Return the list of failure statuses seen on any ``tool_call`` or + ``tool_result`` content block in ``agent_response_msgs``. + + Inputs are intentionally tolerated -- malformed messages / non-dict + content blocks are skipped rather than raised on, so this helper is safe + to call on freshly-deserialized agent traces. 
+ + :param agent_response_msgs: The agent's response message list (already + validated to be a list by the caller). + :type agent_response_msgs: list + :return: A list (with duplicates preserved) of lowercased failure status + strings. Empty list means no failure signal was found. + :rtype: list[str] + """ + found = [] + if not isinstance(agent_response_msgs, list): + return found + for msg in agent_response_msgs: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") in ("tool_call", "tool_result"): + status = block.get("status") + if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: + found.append(status.lower()) + return found + + def _get_tool_calls_results(agent_response_msgs): """Extract formatted agent tool calls and results from response.""" agent_response_text = [] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py new file mode 100644 index 000000000000..a4bfeb2e3e1a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -0,0 +1,172 @@ +from unittest.mock import MagicMock + +import pytest + +from azure.ai.evaluation import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( + _collect_failed_tool_statuses, +) + + +# Default prompty mock that always grades as PASS. Tests that exercise the +# deterministic short-circuit path rely on this mock NOT being called. 
+async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument + return { + "llm_output": { + "reason": "All tool calls completed successfully.", + "score": 1, + "properties": {}, + } + } + + +def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None): + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments or {"location": "NYC"}, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id="call_1", result="72F sunny", status=None): + block = {"type": "tool_result", "tool_result": result} + if status is not None: + block["status"] = status + return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]} + + +# --------------------------------------------------------------------------- +# _collect_failed_tool_statuses +# --------------------------------------------------------------------------- + + +class TestCollectFailedToolStatuses: + def test_no_status_returns_empty(self): + msgs = [_assistant_tool_call(), _tool_result()] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_completed_status_returns_empty(self): + msgs = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + assert _collect_failed_tool_statuses(msgs) == [] + + @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"]) + def test_known_failure_status_on_tool_call_is_collected(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status] + + @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"]) + def test_failure_status_is_case_insensitive(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status.lower()] + + def test_failure_status_on_tool_result_is_collected(self): + msgs = 
[_assistant_tool_call(), _tool_result(status="failed")] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + def test_unknown_status_string_is_ignored(self): + msgs = [_assistant_tool_call(status="something_weird")] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_non_string_status_is_ignored(self): + msgs = [_assistant_tool_call(status=500)] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_malformed_inputs_are_tolerated(self): + # Non-list input + assert _collect_failed_tool_statuses(None) == [] + assert _collect_failed_tool_statuses("not a list") == [] + # List with non-dict items + dicts with non-list content + msgs = [ + "string entry", + 42, + {"role": "assistant"}, # no content + {"role": "assistant", "content": "not a list"}, + {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]}, + ] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + +# --------------------------------------------------------------------------- +# _do_eval short-circuit +# --------------------------------------------------------------------------- + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestToolCallSuccessShortCircuit: + def test_short_circuits_on_failed_tool_call_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(status="failed"), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success"] == 0.0 + assert result["tool_call_success_score"] == 0.0 + assert result["tool_call_success_passed"] is False + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_status"] == "completed" + assert "failed" in result["tool_call_success_reason"] + props = result["tool_call_success_properties"] + assert 
props["short_circuit"] == "tool_status" + assert props["failed_statuses"] == ["failed"] + + def test_short_circuits_on_failed_tool_result_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result(status="error")] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_properties"]["failed_statuses"] == ["error"] + + def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(tool_call_id="a", status="failed"), + _tool_result(tool_call_id="a", status="failed"), + _assistant_tool_call(tool_call_id="b", status="error"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + # Reason joins deduped, sorted statuses + assert "error, failed" in result["tool_call_success_reason"] + assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"] + + def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() # Goes to LLM + assert result["tool_call_success_passed"] is True + + def test_no_short_circuit_when_status_absent(self, mock_model_config): + """Back-compat: traces produced by converters that do not preserve + ``status`` continue to be graded by the LLM as before.""" + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + 
evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() + assert result["tool_call_success_passed"] is True From 89f41295a064533914561c0ecc1f1b7d6e11f984 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 8 Jun 2026 10:41:15 -0700 Subject: [PATCH 3/5] Export _ToolInputAccuracyEvaluator from azure.ai.evaluation top-level namespace Brings _ToolInputAccuracyEvaluator in line with its three sibling tool evaluators (ToolCallAccuracyEvaluator, _ToolCallSuccessEvaluator, _ToolOutputUtilizationEvaluator) which are already exposed on the top-level package. Consumers (notably the Foundry evaluations service catalog) can now import it from azure.ai.evaluation directly instead of reaching into the private _evaluators._tool_input_accuracy submodule. --- sdk/evaluation/azure-ai-evaluation/CHANGELOG.md | 1 + .../azure-ai-evaluation/azure/ai/evaluation/__init__.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 698d05033a9a..a14ebd1a5eb2 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -5,6 +5,7 @@ ### Features Added - Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. 
`GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. - `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. 
## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..f32ade1e90e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -34,6 +34,7 @@ from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -135,6 +136,7 @@ def lazy_import(): "ToolCallAccuracyEvaluator", "_ToolOutputUtilizationEvaluator", "_ToolCallSuccessEvaluator", + "_ToolInputAccuracyEvaluator", "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", From 24198a3944c2290b1bfdbae4f5214a7601664304 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 8 Jun 2026 12:02:21 -0700 Subject: [PATCH 4/5] Black: drop trailing blank line in test_unsupported_tools_validation.py --- .../tests/unittests/test_unsupported_tools_validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py index 7c87bc34ee06..74bc352448a3 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -181,4 +181,3 @@ def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_n with pytest.raises(EvaluationException) as exc_info: validator.validate_eval_input(eval_input) assert "currently not supported" in str(exc_info.value) - From 
5445c226de33ae58bd26df1e367a6511b25b5633 Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 8 Jun 2026 11:31:58 -0700 Subject: [PATCH 5/5] Converter: add bing_custom_search + sharepoint_grounding branches; query/input fallback for AIS, SP, Fabric break_tool_call_into_messages previously had no elif branch for bing_custom_search or sharepoint_grounding, so calls touching either tool were silently dropped before any evaluator could see them. The three status-only tool evaluators (ToolCallAccuracyEvaluator, _ToolInputAccuracyEvaluator, _ToolCallSuccessEvaluator) therefore returned NOT_APPLICABLE on those conversations even after the validator was loosened in PR #47369. Changes: - bing_custom_search: arguments-only branch mirroring bing_grounding (emits a tool_call with the requesturl; no tool_result, since Bing-family results are redacted upstream for compliance). - sharepoint_grounding: arguments + dumped output, mirroring azure_ai_search. Phase 2 will extend the Groundedness extractor to walk the documents structure already present on the tool_result. - azure_ai_search, sharepoint_grounding, fabric_dataagent input branches: switched from a direct details[tool_key]["input"] dereference to details[tool_key].get("input") or details[tool_key].get("query") or "" (empty-string fallback), where tool_key is the branch's own tool name. Live agent traces emit the search term under 'query' for all three, which made the existing AIS and Fabric branches surface empty arguments to evaluators (a live bug, not just a Phase 1 prerequisite). - Refreshed the stale March-2025 top-of-function comment to reflect the current set of supported built-ins. Tests: Added 5 new tests in tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py covering bing_custom_search, sharepoint_grounding (input key and output dump), and the query-key fallback for AIS, SP, and Fabric. 
The new tests construct ToolCall via a small _HybridDict helper instead of going through ToolDecoder, so they do not depend on the agents SDK RunStep* models that have moved between azure.ai.projects.models and azure.ai.agents.models packages. --- .../azure-ai-evaluation/CHANGELOG.md | 2 + .../ai/evaluation/_converters/_models.py | 28 +++- .../test_ai_agent_converter_internals.py | 140 ++++++++++++++++++ 3 files changed, 162 insertions(+), 8 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index a14ebd1a5eb2..40e19710337b 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -7,6 +7,8 @@ - Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. - Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. 
- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. +- Extended `break_tool_call_into_messages` in `_converters/_models.py` with explicit branches for `bing_custom_search` (arguments-only, mirroring `bing_grounding` — Bing-family results stay redacted upstream) and `sharepoint_grounding` (arguments + dumped output, mirroring `azure_ai_search`). Both were silently dropped before because the converter had no `elif` branch for them, which meant the three status-only tool evaluators returned `NOT_APPLICABLE` on conversations that touched either tool. The `bing_grounding` and `bing_custom_search` request-side payloads continue to emit only the `requesturl`; the `sharepoint_grounding` result is dumped onto the `tool_result` so a future Groundedness / Tool Output Utilization extractor can read it. +- Made the per-tool argument extraction in `break_tool_call_into_messages` resilient to the `query` vs `input` runtime drift observed on `azure_ai_search`, `sharepoint_grounding`, and `fabric_dataagent`. Each branch now reads `details["<tool>"].get("input") or details["<tool>"].get("query") or ""` instead of dereferencing `["input"]` directly, so live agent traces (which emit the search term under `query`) no longer surface as empty `arguments` to the evaluators. Behavior is unchanged when the runtime emits `input`. 
## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py index 443c712a9eac..ce5135fee66c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py @@ -327,11 +327,12 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess # We will use this as our accumulator. messages: List[Message] = [] - # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding - # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at - # all in most of the cases, and bing would only show the API URL, without arguments or results. - # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query. - # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter. + # In addition to custom functions, we support a handful of built-in tools whose runtime payload + # we have explicit branches for below (code_interpreter, file_search, bing_grounding, + # bing_custom_search, azure_ai_search, sharepoint_grounding, fabric_dataagent). Bing variants + # only carry the `requesturl` request side (results are redacted upstream for compliance), so + # they emit just the tool_call message; the others emit both call and result. + # Unknown built-in types are silently skipped by the trailing `return messages`. if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"): # This is the internals of the content object that will be included with the tool call. 
tool_call_id = tool_call.details.id @@ -351,15 +352,22 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess arguments = {"input": tool_call.details.code_interpreter.input} elif tool_call.details["type"] == "bing_grounding": arguments = {"requesturl": tool_call.details["bing_grounding"]["requesturl"]} + elif tool_call.details["type"] == "bing_custom_search": + arguments = {"requesturl": tool_call.details["bing_custom_search"]["requesturl"]} elif tool_call.details["type"] == "file_search": options = tool_call.details["file_search"]["ranking_options"] arguments = { "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]} } elif tool_call.details["type"] == "azure_ai_search": - arguments = {"input": tool_call.details["azure_ai_search"]["input"]} + ais = tool_call.details["azure_ai_search"] + arguments = {"input": ais.get("input") or ais.get("query") or ""} + elif tool_call.details["type"] == "sharepoint_grounding": + sp = tool_call.details["sharepoint_grounding"] + arguments = {"input": sp.get("input") or sp.get("query") or ""} elif tool_call.details["type"] == "fabric_dataagent": - arguments = {"input": tool_call.details["fabric_dataagent"]["input"]} + fab = tool_call.details["fabric_dataagent"] + arguments = {"input": fab.get("input") or fab.get("query") or ""} else: # unsupported tool type, skip return messages @@ -389,11 +397,15 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess if tool_call.details.type == _CODE_INTERPRETER: output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs] elif tool_call.details.type == _BING_GROUNDING: - return messages # not supported yet from bing grounding tool + return messages # results are redacted upstream for Bing; no tool_result to emit + elif tool_call.details.type == _BING_CUSTOM_SEARCH: + return messages # results are redacted upstream for Bing; no tool_result to emit elif tool_call.details.type == 
_FILE_SEARCH: output = [result.as_dict() for result in tool_call.details.file_search.results] elif tool_call.details.type == _AZURE_AI_SEARCH: output = tool_call.details.azure_ai_search["output"] + elif tool_call.details.type == _SHAREPOINT_GROUNDING: + output = tool_call.details.sharepoint_grounding["output"] elif tool_call.details.type == _FABRIC_DATAAGENT: output = tool_call.details.fabric_dataagent["output"] except: diff --git a/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py b/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py index 9c0c4df125d1..c496976ea328 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py @@ -37,6 +37,42 @@ from serialization_helper import ToolDecoder, ThreadRunDecoder +class _HybridDict(dict): + """Dict subclass that also exposes its keys as attributes. + + The converter (`break_tool_call_into_messages`) mixes subscript access on the request side + (`tool_call.details["type"]`, `tool_call.details["bing_grounding"]["requesturl"]`) with attribute + access on the result side (`tool_call.details.type`, `tool_call.details.azure_ai_search["output"]`). + The production code path uses typed runtime models (`RunStep*ToolCall`) that satisfy both shapes; + `_HybridDict` mimics that surface in unit tests without depending on the agents SDK models, which + have moved between packages and are not guaranteed to be importable in every test environment. + """ + + def __getattr__(self, name): + try: + return self[name] + except KeyError as e: + raise AttributeError(name) from e + + +def _build_builtin_tool_call(call_id: str, tool_type: str, payload: dict) -> ToolCall: + """Construct a `ToolCall` for a built-in tool without going through `ToolDecoder`. 
+ + `payload` is the per-tool sub-object (e.g. `{"requesturl": "..."}` for Bing or + `{"input": "...", "output": {...}}` for SharePoint). The returned `ToolCall.details` is a + nested `_HybridDict` so both subscript and attribute access work. + """ + details = _HybridDict( + { + "id": call_id, + "type": tool_type, + tool_type: _HybridDict(payload), + } + ) + now = datetime.now() + return ToolCall(created=now, completed=now, details=details) + + class TestAIAgentConverter(unittest.TestCase): def test_is_agent_tool_call(self): # Test case where message is an agent tool call @@ -200,6 +236,110 @@ def test_bing_grounding_tool_calls(self): tool_call_content["arguments"] == {"requesturl": "https://api.bing.microsoft.com/v7.0/search?q="} ) + def test_bing_custom_search_tool_calls(self): + # bing_custom_search mirrors bing_grounding: arguments-only tool_call, no tool_result + # (results are redacted upstream for Bing-family tools). + # Built directly rather than via ToolDecoder so the test does not depend on the + # RunStepBingCustomSearchToolCall model being present in the installed agents SDK. 
+ tool_call = _build_builtin_tool_call( + call_id="call_BCS123", + tool_type="bing_custom_search", + payload={"requesturl": "https://api.bing.microsoft.com/v7.0/custom/search?customconfig=abc&q=foo"}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 1) # Bing variants emit no tool_result + self.assertTrue(isinstance(messages[0], AssistantMessage)) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["type"] == "tool_call") + self.assertTrue(tool_call_content["tool_call_id"] == "call_BCS123") + self.assertTrue(tool_call_content["name"] == "bing_custom_search") + self.assertTrue( + tool_call_content["arguments"] + == {"requesturl": "https://api.bing.microsoft.com/v7.0/custom/search?customconfig=abc&q=foo"} + ) + + def test_sharepoint_grounding_tool_calls(self): + # sharepoint_grounding mirrors azure_ai_search: arguments + dumped output. + # Exercises the `input` argument key on the request side. + tool_call = _build_builtin_tool_call( + call_id="call_SP123", + tool_type="sharepoint_grounding", + payload={ + "input": "quarterly sales report", + "output": { + "documents": [ + { + "title": "Q3 Sales", + "url": "https://contoso.sharepoint.com/Q3.docx", + "content": "Q3 was up 12%", + } + ] + }, + }, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + self.assertTrue(isinstance(messages[0], AssistantMessage)) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["type"] == "tool_call") + self.assertTrue(tool_call_content["tool_call_id"] == "call_SP123") + self.assertTrue(tool_call_content["name"] == "sharepoint_grounding") + self.assertTrue(tool_call_content["arguments"] == {"input": "quarterly sales report"}) + self.assertTrue(isinstance(messages[1], ToolMessage)) + self.assertTrue(messages[1].content[0]["type"] == "tool_result") + self.assertTrue( + messages[1].content[0]["tool_result"] + == { + "documents": 
[ + { + "title": "Q3 Sales", + "url": "https://contoso.sharepoint.com/Q3.docx", + "content": "Q3 was up 12%", + } + ] + } + ) + + def test_sharepoint_grounding_tool_calls_query_key_fallback(self): + # Live agent traces emit the search term under `query` instead of `input` for SharePoint. + # The converter must fall back to `query` so downstream evaluators see a non-empty argument. + tool_call = _build_builtin_tool_call( + call_id="call_SP456", + tool_type="sharepoint_grounding", + payload={"query": "vacation policy", "output": {"documents": []}}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["arguments"] == {"input": "vacation policy"}) + + def test_azure_ai_search_tool_calls_query_key_fallback(self): + # Live agent traces emit the search term under `query` instead of `input` for Azure AI Search. + # The converter must fall back to `query` so downstream evaluators see a non-empty argument. + tool_call = _build_builtin_tool_call( + call_id="call_AIS789", + tool_type="azure_ai_search", + payload={"query": "refund policy", "output": []}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["name"] == "azure_ai_search") + self.assertTrue(tool_call_content["arguments"] == {"input": "refund policy"}) + + def test_fabric_dataagent_tool_calls_query_key_fallback(self): + # Same `query` vs `input` drift for fabric_dataagent. 
+ tool_call = _build_builtin_tool_call( + call_id="call_FAB012", + tool_type="fabric_dataagent", + payload={"query": "top customers by revenue", "output": {}}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["name"] == "fabric_dataagent") + self.assertTrue(tool_call_content["arguments"] == {"input": "top customers by revenue"}) + def test_extract_tool_definitions(self): thread_run_data = """{ "id": "run_zs3USbTw61ZpRk8bwBPP8Ue7",