Azure · mmkawale · Jun 5, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 8, 2026
@@ -1,5 +1,15 @@
 # Release History
 
+## 1.17.1 (Unreleased)
+
+### Features Added
+
+- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `_ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body.
+- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line.
+- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric — which doesn't see the `status` field — from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated.
+- Extended `break_tool_call_into_messages` in `_converters/_models.py` with explicit branches for `bing_custom_search` (arguments-only, mirroring `bing_grounding` — Bing-family results stay redacted upstream) and `sharepoint_grounding` (arguments + dumped output, mirroring `azure_ai_search`). Both were previously dropped silently because the converter had no `elif` branch for them, which meant the three status-only tool evaluators returned `NOT_APPLICABLE` on conversations that touched either tool. The `bing_grounding` and `bing_custom_search` request-side payloads emit only the `requesturl`; the `sharepoint_grounding` result is dumped onto the `tool_result` so a future Groundedness / Tool Output Utilization extractor can read it.
+- Made the per-tool argument extraction in `break_tool_call_into_messages` resilient to the `query` vs `input` runtime drift observed on `azure_ai_search`, `sharepoint_grounding`, and `fabric_dataagent`. Each branch now reads `details["<tool>"].get("input") or details["<tool>"].get("query") or ""` instead of dereferencing `["input"]` directly, so live agent traces (which emit the search term under `query`) no longer surface as empty `arguments` to the evaluators. Behavior is unchanged when the runtime emits `input`.
+
 ## 1.17.0 (2026-06-03)
 
 ### Breaking Changes

@@ -34,6 +34,7 @@
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
 from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -135,6 +136,7 @@ def lazy_import():
     "ToolCallAccuracyEvaluator",
     "_ToolOutputUtilizationEvaluator",
     "_ToolCallSuccessEvaluator",
+    "_ToolInputAccuracyEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",

@@ -327,11 +327,12 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
     # We will use this as our accumulator.
     messages: List[Message] = []
 
-    # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding
-    # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at
-    # all in most of the cases, and bing would only show the API URL, without arguments or results.
-    # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
-    # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
+    # In addition to custom functions, we support a handful of built-in tools whose runtime payload
+    # we have explicit branches for below (code_interpreter, file_search, bing_grounding,
+    # bing_custom_search, azure_ai_search, sharepoint_grounding, fabric_dataagent). Bing variants
+    # only carry the `requesturl` request side (results are redacted upstream for compliance), so
+    # they emit just the tool_call message; the others emit both call and result.
+    # Unknown built-in types are silently skipped by the `else` branch below, which returns `messages` early.
     if hasattr(tool_call.details, _FUNCTION) or tool_call.details.get("function"):
         # This is the internals of the content object that will be included with the tool call.
         tool_call_id = tool_call.details.id
@@ -351,15 +352,22 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
             arguments = {"input": tool_call.details.code_interpreter.input}
         elif tool_call.details["type"] == "bing_grounding":
             arguments = {"requesturl": tool_call.details["bing_grounding"]["requesturl"]}
+        elif tool_call.details["type"] == "bing_custom_search":
+            arguments = {"requesturl": tool_call.details["bing_custom_search"]["requesturl"]}
         elif tool_call.details["type"] == "file_search":
             options = tool_call.details["file_search"]["ranking_options"]
             arguments = {
                 "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
             }
         elif tool_call.details["type"] == "azure_ai_search":
-            arguments = {"input": tool_call.details["azure_ai_search"]["input"]}
+            ais = tool_call.details["azure_ai_search"]
+            arguments = {"input": ais.get("input") or ais.get("query") or ""}
+        elif tool_call.details["type"] == "sharepoint_grounding":
+            sp = tool_call.details["sharepoint_grounding"]
+            arguments = {"input": sp.get("input") or sp.get("query") or ""}
         elif tool_call.details["type"] == "fabric_dataagent":
-            arguments = {"input": tool_call.details["fabric_dataagent"]["input"]}
+            fab = tool_call.details["fabric_dataagent"]
+            arguments = {"input": fab.get("input") or fab.get("query") or ""}
         else:
             # unsupported tool type, skip
             return messages
@@ -389,11 +397,15 @@ def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Mess
             if tool_call.details.type == _CODE_INTERPRETER:
                 output = [result.as_dict() for result in tool_call.details.code_interpreter.outputs]
             elif tool_call.details.type == _BING_GROUNDING:
-                return messages  # not supported yet from bing grounding tool
+                return messages  # results are redacted upstream for Bing; no tool_result to emit
+            elif tool_call.details.type == _BING_CUSTOM_SEARCH:
+                return messages  # results are redacted upstream for Bing; no tool_result to emit
             elif tool_call.details.type == _FILE_SEARCH:
                 output = [result.as_dict() for result in tool_call.details.file_search.results]
             elif tool_call.details.type == _AZURE_AI_SEARCH:
                 output = tool_call.details.azure_ai_search["output"]
+            elif tool_call.details.type == _SHAREPOINT_GROUNDING:
+                output = tool_call.details.sharepoint_grounding["output"]
             elif tool_call.details.type == _FABRIC_DATAAGENT:
                 output = tool_call.details.fabric_dataagent["output"]
         except:

@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs):
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             requires_query=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
@@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
+        # Short-circuit: if the agent runtime already reported a failed tool
+        # execution via a known-failure ``status`` (e.g. "failed", "error",
+        # "incomplete"), deterministically return ``fail`` without calling the
+        # LLM. The evaluator's scoring contract is binary -- "FALSE: at least
+        # one tool call failed" -- and the prompty rubric doesn't see the
+        # ``status`` field, so it would otherwise grade only the (typically
+        # empty) result body and frequently mis-score the conversation as a
+        # pass. ``status`` is only populated by upstream converters that
+        # preserve it; absent ``status``, behavior is unchanged.
+        if isinstance(eval_input.get("response"), list):
+            failed_statuses = _collect_failed_tool_statuses(eval_input["response"])
+            if failed_statuses:
+                reason = (
+                    "Detected failed tool execution(s) with status "
+                    + ", ".join(sorted(set(failed_statuses)))
+                    + ". Marked as fail without LLM grading."
+                )
+                return {
+                    self._result_key: 0.0,
+                    f"{self._result_key}_score": 0.0,
+                    f"{self._result_key}_passed": False,
+                    f"{self._result_key}_result": "fail",
+                    f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_status": "completed",
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_properties": {
+                        "short_circuit": "tool_status",
+                        "failed_statuses": sorted(set(failed_statuses)),
+                    },
+                }
+
         if isinstance(eval_input.get("response"), list):
             eval_input["response"] = _preprocess_messages(eval_input["response"])
             eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
@@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
         return tool_definitions
 
 
+_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"})
+
+
+def _collect_failed_tool_statuses(agent_response_msgs):
+    """Gather every known-failure ``status`` reported on a ``tool_call`` or
+    ``tool_result`` content block in ``agent_response_msgs``.
+
+    The scan is deliberately lenient: a non-list argument, non-dict
+    messages, non-list ``content`` values and non-dict content blocks are
+    skipped instead of raising.
+
+    :param agent_response_msgs: The agent's response messages.
+    :type agent_response_msgs: list
+    :return: Lowercased failure statuses in the order they were seen,
+        duplicates included. An empty list means no failure status was found.
+    :rtype: list[str]
+    """
+    if not isinstance(agent_response_msgs, list):
+        return []
+    failures = []
+    for message in agent_response_msgs:
+        # Only dict messages can carry a ``content`` list worth scanning.
+        blocks = message.get("content") if isinstance(message, dict) else None
+        if not isinstance(blocks, list):
+            continue
+        for item in blocks:
+            if not isinstance(item, dict) or item.get("type") not in ("tool_call", "tool_result"):
+                continue
+            raw_status = item.get("status")
+            if not isinstance(raw_status, str):
+                continue
+            # Compare case-insensitively against the known-failure set.
+            normalized = raw_status.lower()
+            if normalized in _FAILED_TOOL_STATUSES:
+                failures.append(normalized)
+    return failures
+
+
 def _get_tool_calls_results(agent_response_msgs):
     """Extract formatted agent tool calls and results from response."""
     agent_response_text = []

@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(

@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.17.0"
+VERSION = "1.17.1"