diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..40e19710337b 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,15 @@ # Release History +## 1.17.1 (Unreleased) + +### Features Added + +- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. Achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body. +- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line. +- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric -- which doesn't see the `status` field -- from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated. 
+- Extended `break_tool_call_into_messages` in `_converters/_models.py` with explicit branches for `bing_custom_search` (arguments-only, mirroring `bing_grounding` — Bing-family results stay redacted upstream) and `sharepoint_grounding` (arguments + dumped output, mirroring `azure_ai_search`). Both were silently dropped before because the converter had no `elif` branch for them, which meant the three status-only tool evaluators returned `NOT_APPLICABLE` on conversations that touched either tool. The `bing_grounding` and `bing_custom_search` request-side payloads continue to emit only the `requesturl`; the `sharepoint_grounding` result is dumped onto the `tool_result` so a future Groundedness / Tool Output Utilization extractor can read it. +- Made the per-tool argument extraction in `break_tool_call_into_messages` resilient to the `query` vs `input` runtime drift observed on `azure_ai_search`, `sharepoint_grounding`, and `fabric_dataagent`. Each branch now reads `details["<tool_type>"].get("input") or details["<tool_type>"].get("query") or ""` instead of dereferencing `["input"]` directly, so live agent traces (which emit the search term under `query`) no longer surface as empty `arguments` to the evaluators. Behavior is unchanged when the runtime emits `input`. 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index 6703b2ca111f..f32ade1e90e3 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -34,6 +34,7 @@ from ._evaluators._document_retrieval import DocumentRetrievalEvaluator from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator from ._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -135,6 +136,7 @@ def lazy_import(): "ToolCallAccuracyEvaluator", "_ToolOutputUtilizationEvaluator", "_ToolCallSuccessEvaluator", + "_ToolInputAccuracyEvaluator", "AzureOpenAIGrader", "AzureOpenAILabelGrader", "AzureOpenAIStringCheckGrader", diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py index 443c712a9eac..ce5135fee66c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_converters/_models.py @@ -327,11 +327,12 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess # We will use this as our accumulator. messages: List[Message] = [] - # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding - # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at - # all in most of the cases, and bing would only show the API URL, without arguments or results. 
- # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query. - # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter. + # In addition to custom functions, we support a handful of built-in tools whose runtime payload + # we have explicit branches for below (code_interpreter, file_search, bing_grounding, + # bing_custom_search, azure_ai_search, sharepoint_grounding, fabric_dataagent). Bing variants + # only carry the `requesturl` request side (results are redacted upstream for compliance), so + # they emit just the tool_call message; the others emit both call and result. + # Unknown built-in types are silently skipped by the trailing `return messages`. if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"): # This is the internals of the content object that will be included with the tool call. tool_call_id = tool_call.details.id @@ -351,15 +352,22 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess arguments = {"input": tool_call.details.code_interpreter.input} elif tool_call.details["type"] == "bing_grounding": arguments = {"requesturl": tool_call.details["bing_grounding"]["requesturl"]} + elif tool_call.details["type"] == "bing_custom_search": + arguments = {"requesturl": tool_call.details["bing_custom_search"]["requesturl"]} elif tool_call.details["type"] == "file_search": options = tool_call.details["file_search"]["ranking_options"] arguments = { "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]} } elif tool_call.details["type"] == "azure_ai_search": - arguments = {"input": tool_call.details["azure_ai_search"]["input"]} + ais = tool_call.details["azure_ai_search"] + arguments = {"input": ais.get("input") or ais.get("query") or ""} + elif tool_call.details["type"] == "sharepoint_grounding": + sp = tool_call.details["sharepoint_grounding"] + arguments = {"input": 
sp.get("input") or sp.get("query") or ""} elif tool_call.details["type"] == "fabric_dataagent": - arguments = {"input": tool_call.details["fabric_dataagent"]["input"]} + fab = tool_call.details["fabric_dataagent"] + arguments = {"input": fab.get("input") or fab.get("query") or ""} else: # unsupported tool type, skip return messages @@ -389,11 +397,15 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess if tool_call.details.type == _CODE_INTERPRETER: output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs] elif tool_call.details.type == _BING_GROUNDING: - return messages # not supported yet from bing grounding tool + return messages # results are redacted upstream for Bing; no tool_result to emit + elif tool_call.details.type == _BING_CUSTOM_SEARCH: + return messages # results are redacted upstream for Bing; no tool_result to emit elif tool_call.details.type == _FILE_SEARCH: output = [result.as_dict() for result in tool_call.details.file_search.results] elif tool_call.details.type == _AZURE_AI_SEARCH: output = tool_call.details.azure_ai_search["output"] + elif tool_call.details.type == _SHAREPOINT_GROUNDING: + output = tool_call.details.sharepoint_grounding["output"] elif tool_call.details.type == _FABRIC_DATAAGENT: output = tool_call.details.fabric_dataagent["output"] except: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 3a2ccb1ace85..f5057f09e947 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, # Initialize input validator 
self._validator = ToolCallsValidator( error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py index 44e0876bad68..f3ec39e3843e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py @@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs): self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, requires_query=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( @@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, ) + # Short-circuit: if the agent runtime already reported a failed tool + # execution via a known-failure ``status`` (e.g. "failed", "error", + # "incomplete"), deterministically return ``fail`` without calling the + # LLM. The evaluator's scoring contract is binary -- "FALSE: at least + # one tool call failed" -- and the prompty rubric doesn't see the + # ``status`` field, so it would otherwise grade only the (typically + # empty) result body and frequently mis-score the conversation as a + # pass. ``status`` is only populated by upstream converters that + # preserve it; absent ``status``, behavior is unchanged. 
+ if isinstance(eval_input.get("response"), list): + failed_statuses = _collect_failed_tool_statuses(eval_input["response"]) + if failed_statuses: + reason = ( + "Detected failed tool execution(s) with status " + + ", ".join(sorted(set(failed_statuses))) + + ". Marked as fail without LLM grading." + ) + return { + self._result_key: 0.0, + f"{self._result_key}_score": 0.0, + f"{self._result_key}_passed": False, + f"{self._result_key}_result": "fail", + f"{self._result_key}_reason": reason, + f"{self._result_key}_status": "completed", + f"{self._result_key}_threshold": self._threshold, + f"{self._result_key}_properties": { + "short_circuit": "tool_status", + "failed_statuses": sorted(set(failed_statuses)), + }, + } + if isinstance(eval_input.get("response"), list): eval_input["response"] = _preprocess_messages(eval_input["response"]) eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger) @@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None): return tool_definitions +_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"}) + + +def _collect_failed_tool_statuses(agent_response_msgs): + """Return the list of failure statuses seen on any ``tool_call`` or + ``tool_result`` content block in ``agent_response_msgs``. + + Inputs are intentionally tolerated -- malformed messages / non-dict + content blocks are skipped rather than raised on, so this helper is safe + to call on freshly-deserialized agent traces. + + :param agent_response_msgs: The agent's response message list (already + validated to be a list by the caller). + :type agent_response_msgs: list + :return: A list (with duplicates preserved) of lowercased failure status + strings. Empty list means no failure signal was found. 
+ :rtype: list[str] + """ + found = [] + if not isinstance(agent_response_msgs, list): + return found + for msg in agent_response_msgs: + if not isinstance(msg, dict): + continue + content = msg.get("content") + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") in ("tool_call", "tool_result"): + status = block.get("status") + if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES: + found.append(status.lower()) + return found + + def _get_tool_calls_results(agent_response_msgs): """Extract formatted agent tool calls and results from response.""" agent_response_text = [] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 7ebc20c7e130..198fefde02d1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -92,7 +92,7 @@ def __init__( self._validator = ToolDefinitionsValidator( error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, optional_tool_definitions=False, - check_for_unsupported_tools=True, + check_for_unsupported_tools=False, ) super().__init__( diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py index bae6c9895046..9dc1249dff60 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py @@ -3,4 +3,4 @@ # --------------------------------------------------------- # represents upcoming version -VERSION = "1.17.0" +VERSION = "1.17.1" diff --git 
a/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py b/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py index 9c0c4df125d1..c496976ea328 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/converters/ai_agent_converter/test_ai_agent_converter_internals.py @@ -37,6 +37,42 @@ from serialization_helper import ToolDecoder, ThreadRunDecoder +class _HybridDict(dict): + """Dict subclass that also exposes its keys as attributes. + + The converter (`break_tool_call_into_messages`) mixes subscript access on the request side + (`tool_call.details["type"]`, `tool_call.details["bing_grounding"]["requesturl"]`) with attribute + access on the result side (`tool_call.details.type`, `tool_call.details.azure_ai_search["output"]`). + The production code path uses typed runtime models (`RunStep*ToolCall`) that satisfy both shapes; + `_HybridDict` mimics that surface in unit tests without depending on the agents SDK models, which + have moved between packages and are not guaranteed to be importable in every test environment. + """ + + def __getattr__(self, name): + try: + return self[name] + except KeyError as e: + raise AttributeError(name) from e + + +def _build_builtin_tool_call(call_id: str, tool_type: str, payload: dict) -> ToolCall: + """Construct a `ToolCall` for a built-in tool without going through `ToolDecoder`. + + `payload` is the per-tool sub-object (e.g. `{"requesturl": "..."}` for Bing or + `{"input": "...", "output": {...}}` for SharePoint). The returned `ToolCall.details` is a + nested `_HybridDict` so both subscript and attribute access work. 
+ """ + details = _HybridDict( + { + "id": call_id, + "type": tool_type, + tool_type: _HybridDict(payload), + } + ) + now = datetime.now() + return ToolCall(created=now, completed=now, details=details) + + class TestAIAgentConverter(unittest.TestCase): def test_is_agent_tool_call(self): # Test case where message is an agent tool call @@ -200,6 +236,110 @@ def test_bing_grounding_tool_calls(self): tool_call_content["arguments"] == {"requesturl": "https://api.bing.microsoft.com/v7.0/search?q="} ) + def test_bing_custom_search_tool_calls(self): + # bing_custom_search mirrors bing_grounding: arguments-only tool_call, no tool_result + # (results are redacted upstream for Bing-family tools). + # Built directly rather than via ToolDecoder so the test does not depend on the + # RunStepBingCustomSearchToolCall model being present in the installed agents SDK. + tool_call = _build_builtin_tool_call( + call_id="call_BCS123", + tool_type="bing_custom_search", + payload={"requesturl": "https://api.bing.microsoft.com/v7.0/custom/search?customconfig=abc&q=foo"}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 1) # Bing variants emit no tool_result + self.assertTrue(isinstance(messages[0], AssistantMessage)) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["type"] == "tool_call") + self.assertTrue(tool_call_content["tool_call_id"] == "call_BCS123") + self.assertTrue(tool_call_content["name"] == "bing_custom_search") + self.assertTrue( + tool_call_content["arguments"] + == {"requesturl": "https://api.bing.microsoft.com/v7.0/custom/search?customconfig=abc&q=foo"} + ) + + def test_sharepoint_grounding_tool_calls(self): + # sharepoint_grounding mirrors azure_ai_search: arguments + dumped output. + # Exercises the `input` argument key on the request side. 
+ tool_call = _build_builtin_tool_call( + call_id="call_SP123", + tool_type="sharepoint_grounding", + payload={ + "input": "quarterly sales report", + "output": { + "documents": [ + { + "title": "Q3 Sales", + "url": "https://contoso.sharepoint.com/Q3.docx", + "content": "Q3 was up 12%", + } + ] + }, + }, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + self.assertTrue(isinstance(messages[0], AssistantMessage)) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["type"] == "tool_call") + self.assertTrue(tool_call_content["tool_call_id"] == "call_SP123") + self.assertTrue(tool_call_content["name"] == "sharepoint_grounding") + self.assertTrue(tool_call_content["arguments"] == {"input": "quarterly sales report"}) + self.assertTrue(isinstance(messages[1], ToolMessage)) + self.assertTrue(messages[1].content[0]["type"] == "tool_result") + self.assertTrue( + messages[1].content[0]["tool_result"] + == { + "documents": [ + { + "title": "Q3 Sales", + "url": "https://contoso.sharepoint.com/Q3.docx", + "content": "Q3 was up 12%", + } + ] + } + ) + + def test_sharepoint_grounding_tool_calls_query_key_fallback(self): + # Live agent traces emit the search term under `query` instead of `input` for SharePoint. + # The converter must fall back to `query` so downstream evaluators see a non-empty argument. + tool_call = _build_builtin_tool_call( + call_id="call_SP456", + tool_type="sharepoint_grounding", + payload={"query": "vacation policy", "output": {"documents": []}}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["arguments"] == {"input": "vacation policy"}) + + def test_azure_ai_search_tool_calls_query_key_fallback(self): + # Live agent traces emit the search term under `query` instead of `input` for Azure AI Search. 
+ # The converter must fall back to `query` so downstream evaluators see a non-empty argument. + tool_call = _build_builtin_tool_call( + call_id="call_AIS789", + tool_type="azure_ai_search", + payload={"query": "refund policy", "output": []}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["name"] == "azure_ai_search") + self.assertTrue(tool_call_content["arguments"] == {"input": "refund policy"}) + + def test_fabric_dataagent_tool_calls_query_key_fallback(self): + # Same `query` vs `input` drift for fabric_dataagent. + tool_call = _build_builtin_tool_call( + call_id="call_FAB012", + tool_type="fabric_dataagent", + payload={"query": "top customers by revenue", "output": {}}, + ) + messages = break_tool_call_into_messages(tool_call, "abc123") + self.assertTrue(len(messages) == 2) + tool_call_content = messages[0].content[0] + self.assertTrue(tool_call_content["name"] == "fabric_dataagent") + self.assertTrue(tool_call_content["arguments"] == {"input": "top customers by revenue"}) + def test_extract_tool_definitions(self): thread_run_data = """{ "id": "run_zs3USbTw61ZpRk8bwBPP8Ue7", diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py new file mode 100644 index 000000000000..a4bfeb2e3e1a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py @@ -0,0 +1,172 @@ +from unittest.mock import MagicMock + +import pytest + +from azure.ai.evaluation import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import ( + _collect_failed_tool_statuses, +) + + +# Default prompty mock that always grades as PASS. Tests that exercise the +# deterministic short-circuit path rely on this mock NOT being called. 
+async def _flow_pass(timeout, **kwargs): # pylint: disable=unused-argument + return { + "llm_output": { + "reason": "All tool calls completed successfully.", + "score": 1, + "properties": {}, + } + } + + +def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None): + block = { + "type": "tool_call", + "tool_call_id": tool_call_id, + "name": name, + "arguments": arguments or {"location": "NYC"}, + } + if status is not None: + block["status"] = status + return {"role": "assistant", "content": [block]} + + +def _tool_result(tool_call_id="call_1", result="72F sunny", status=None): + block = {"type": "tool_result", "tool_result": result} + if status is not None: + block["status"] = status + return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]} + + +# --------------------------------------------------------------------------- +# _collect_failed_tool_statuses +# --------------------------------------------------------------------------- + + +class TestCollectFailedToolStatuses: + def test_no_status_returns_empty(self): + msgs = [_assistant_tool_call(), _tool_result()] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_completed_status_returns_empty(self): + msgs = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + assert _collect_failed_tool_statuses(msgs) == [] + + @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"]) + def test_known_failure_status_on_tool_call_is_collected(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status] + + @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"]) + def test_failure_status_is_case_insensitive(self, status): + msgs = [_assistant_tool_call(status=status)] + assert _collect_failed_tool_statuses(msgs) == [status.lower()] + + def test_failure_status_on_tool_result_is_collected(self): + msgs = 
[_assistant_tool_call(), _tool_result(status="failed")] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + def test_unknown_status_string_is_ignored(self): + msgs = [_assistant_tool_call(status="something_weird")] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_non_string_status_is_ignored(self): + msgs = [_assistant_tool_call(status=500)] + assert _collect_failed_tool_statuses(msgs) == [] + + def test_malformed_inputs_are_tolerated(self): + # Non-list input + assert _collect_failed_tool_statuses(None) == [] + assert _collect_failed_tool_statuses("not a list") == [] + # List with non-dict items + dicts with non-list content + msgs = [ + "string entry", + 42, + {"role": "assistant"}, # no content + {"role": "assistant", "content": "not a list"}, + {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]}, + ] + assert _collect_failed_tool_statuses(msgs) == ["failed"] + + +# --------------------------------------------------------------------------- +# _do_eval short-circuit +# --------------------------------------------------------------------------- + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestToolCallSuccessShortCircuit: + def test_short_circuits_on_failed_tool_call_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(status="failed"), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success"] == 0.0 + assert result["tool_call_success_score"] == 0.0 + assert result["tool_call_success_passed"] is False + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_status"] == "completed" + assert "failed" in result["tool_call_success_reason"] + props = result["tool_call_success_properties"] + assert 
props["short_circuit"] == "tool_status" + assert props["failed_statuses"] == ["failed"] + + def test_short_circuits_on_failed_tool_result_status(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result(status="error")] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + assert result["tool_call_success_result"] == "fail" + assert result["tool_call_success_properties"]["failed_statuses"] == ["error"] + + def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(tool_call_id="a", status="failed"), + _tool_result(tool_call_id="a", status="failed"), + _assistant_tool_call(tool_call_id="b", status="error"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_not_called() + # Reason joins deduped, sorted statuses + assert "error, failed" in result["tool_call_success_reason"] + assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"] + + def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [ + _assistant_tool_call(status="completed"), + _tool_result(status="completed"), + ] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() # Goes to LLM + assert result["tool_call_success_passed"] is True + + def test_no_short_circuit_when_status_absent(self, mock_model_config): + """Back-compat: traces produced by converters that do not preserve + ``status`` continue to be graded by the LLM as before.""" + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + 
evaluator._flow = MagicMock(side_effect=_flow_pass) + + response = [_assistant_tool_call(), _tool_result()] + result = evaluator(response=response) + + evaluator._flow.assert_called_once() + assert result["tool_call_success_passed"] is True diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py new file mode 100644 index 000000000000..74bc352448a3 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py @@ -0,0 +1,183 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +""" +Regression tests for the change that lets the three tool evaluators +(ToolCallAccuracy, _ToolInputAccuracy, _ToolCallSuccess) accept +conversations containing restricted built-in tools. + +These evaluators previously rejected any conversation containing tools in +``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``, +``azure_ai_search``). Because none of the three grades require the +(redacted) tool output body, the rejection has been lifted by setting +``check_for_unsupported_tools=False`` on each evaluator's input validator. + +The tests below exercise the validator directly so they do not need the +prompty flow or a real model deployment. They also confirm that the +underlying validator class still rejects restricted tools when +``check_for_unsupported_tools=True``, so the behavior change is limited +to the evaluator wiring. 
+""" + +import pytest + +from azure.ai.evaluation import ToolCallAccuracyEvaluator +from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator +from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator +from azure.ai.evaluation._evaluators._common._validators import ( + ToolCallsValidator, + ToolDefinitionsValidator, +) +from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException + + +RESTRICTED_TOOL_NAMES = [ + "bing_grounding", + "bing_custom_search", + "azure_ai_search", + "azure_fabric", + "sharepoint_grounding", +] + + +def _restricted_response(tool_name: str): + return [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": tool_name, + "arguments": {"query": "anything"}, + } + ], + } + ] + + +def _restricted_tool_definition(tool_name: str): + return { + "name": tool_name, + "description": f"Built-in {tool_name} tool.", + "parameters": { + "type": "object", + "properties": {"query": {"type": "string"}}, + }, + } + + +@pytest.mark.usefixtures("mock_model_config") +@pytest.mark.unittest +class TestRestrictedToolValidationLifted: + """Validator should no longer reject restricted tools for these three evaluators.""" + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + # Should not raise EvaluationException; flag flip made this path legal. 
+ assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name): + evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config): + """Conversation containing both a function call and a restricted tool call validates cleanly.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + eval_input = { + "query": "Find stock price and weather.", + "response": [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_func", + "name": "get_weather", + "arguments": {"location": "Paris"}, + }, + { + "type": "tool_call", + "tool_call_id": "call_restricted", + "name": "bing_grounding", + "arguments": {"query": "MSFT stock price"}, + }, + ], + } + ], + "tool_definitions": [ + { + "name": "get_weather", + "type": "function", + "description": "Weather lookup.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, + }, + _restricted_tool_definition("bing_grounding"), + ], + } + assert evaluator._validator.validate_eval_input(eval_input) is True + + 
+@pytest.mark.unittest +class TestUnderlyingValidatorUnchanged: + """The validator class itself still rejects restricted tools when the flag is on. + + Ensures the behavior change is limited to per-evaluator wiring; the validator + keeps its option to enforce the restricted-tool block for other consumers + (e.g. GroundednessEvaluator). + """ + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolCallsValidator( + error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + check_for_unsupported_tools=True, + ) + eval_input = { + "query": "Look it up.", + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value) + + @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES) + def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name): + validator = ToolDefinitionsValidator( + error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR, + requires_query=False, + check_for_unsupported_tools=True, + ) + eval_input = { + "response": _restricted_response(tool_name), + "tool_definitions": [_restricted_tool_definition(tool_name)], + } + with pytest.raises(EvaluationException) as exc_info: + validator.validate_eval_input(eval_input) + assert "currently not supported" in str(exc_info.value)