diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 576ed70a4396..a14ebd1a5eb2 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Release History
 
+## 1.17.1 (Unreleased)
+
+### Features Added
+
+- Enabled `ToolCallAccuracyEvaluator`, `_ToolInputAccuracyEvaluator`, and `_ToolCallSuccessEvaluator` to run on conversations that include built-in restricted tools (`bing_grounding`, `bing_custom_search`, `azure_ai_search`, `azure_fabric`, `sharepoint_grounding`). These three evaluators grade the agent's tool selection, input arguments, and call status — none of which require the (redacted) tool output body — so the previous unconditional rejection of conversations containing restricted tools is now lifted. This is achieved by setting `check_for_unsupported_tools=False` on each evaluator's input validator. `GroundednessEvaluator` and `_ToolOutputUtilizationEvaluator` continue to reject restricted tools because they consume the tool output body.
+- Exported `_ToolInputAccuracyEvaluator` from the top-level `azure.ai.evaluation` namespace so consumers no longer need to reach into the private `_evaluators._tool_input_accuracy` submodule. The other tool evaluators were already exposed there; this brings the four siblings in line.
+- `_ToolCallSuccessEvaluator` now deterministically returns `fail` (score `0`, `tool_call_success_passed=False`) without invoking the LLM when any `tool_call` or `tool_result` in the response carries a known-failure `status` (`failed`, `error`, `incomplete`, `cancelled`/`canceled`). This matches the evaluator's binary contract ("FALSE: at least one tool call failed") and prevents the prompty rubric — which doesn't see the `status` field — from mis-grading conversations whose only failure signal is the runtime-reported execution status. Behavior is unchanged for responses where no `status` is populated or where every `status` is a non-failure value such as `completed`.
+
 ## 1.17.0 (2026-06-03)
 
 ### Breaking Changes
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
index 6703b2ca111f..f32ade1e90e3 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
@@ -34,6 +34,7 @@
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
 from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from ._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -135,6 +136,7 @@ def lazy_import():
     "ToolCallAccuracyEvaluator",
     "_ToolOutputUtilizationEvaluator",
     "_ToolCallSuccessEvaluator",
+    "_ToolInputAccuracyEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 3a2ccb1ace85..f5057f09e947 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -103,7 +103,7 @@ def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
         # Initialize input validator
         self._validator = ToolCallsValidator(
             error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
index 44e0876bad68..f3ec39e3843e 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
@@ -78,7 +78,7 @@ def __init__(self, model_config, *, credential=None, **kwargs):
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             requires_query=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
@@ -179,6 +179,37 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # t
                 target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
             )
 
+        # Short-circuit: if the agent runtime already reported a failed tool
+        # execution via a known-failure ``status`` (e.g. "failed", "error",
+        # "incomplete"), deterministically return ``fail`` without calling the
+        # LLM. The evaluator's scoring contract is binary -- "FALSE: at least
+        # one tool call failed" -- and the prompty rubric doesn't see the
+        # ``status`` field, so it would otherwise grade only the (typically
+        # empty) result body and frequently mis-score the conversation as a
+        # pass. ``status`` is only populated by upstream converters that
+        # preserve it; absent ``status``, behavior is unchanged.
+        if isinstance(eval_input.get("response"), list):
+            failed_statuses = _collect_failed_tool_statuses(eval_input["response"])
+            if failed_statuses:
+                reason = (
+                    "Detected failed tool execution(s) with status "
+                    + ", ".join(sorted(set(failed_statuses)))
+                    + ". Marked as fail without LLM grading."
+                )
+                return {
+                    self._result_key: 0.0,
+                    f"{self._result_key}_score": 0.0,
+                    f"{self._result_key}_passed": False,
+                    f"{self._result_key}_result": "fail",
+                    f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_status": "completed",
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_properties": {
+                        "short_circuit": "tool_status",
+                        "failed_statuses": sorted(set(failed_statuses)),
+                    },
+                }
+
         if isinstance(eval_input.get("response"), list):
             eval_input["response"] = _preprocess_messages(eval_input["response"])
             eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
@@ -271,6 +302,43 @@ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
         return tool_definitions
 
 
+_FAILED_TOOL_STATUSES = frozenset({"failed", "error", "incomplete", "cancelled", "canceled"})
+
+
+def _collect_failed_tool_statuses(agent_response_msgs):
+    """Return the list of failure statuses seen on any ``tool_call`` or
+    ``tool_result`` content block in ``agent_response_msgs``.
+
+    Malformed input is tolerated by design -- non-dict messages, non-list
+    ``content`` values, and non-dict content blocks are skipped rather than
+    raising, so this helper is safe to call on freshly deserialized agent traces.
+
+    :param agent_response_msgs: The agent's response message list. Any
+        non-list value is treated as empty and yields ``[]``.
+    :type agent_response_msgs: list
+    :return: A list (with duplicates preserved) of lowercased failure status
+        strings. Empty list means no failure signal was found.
+    :rtype: list[str]
+    """
+    found = []
+    if not isinstance(agent_response_msgs, list):
+        return found
+    for msg in agent_response_msgs:
+        if not isinstance(msg, dict):
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        for block in content:
+            if not isinstance(block, dict):
+                continue
+            if block.get("type") in ("tool_call", "tool_result"):
+                status = block.get("status")
+                if isinstance(status, str) and status.lower() in _FAILED_TOOL_STATUSES:
+                    found.append(status.lower())
+    return found
+
+
 def _get_tool_calls_results(agent_response_msgs):
     """Extract formatted agent tool calls and results from response."""
     agent_response_text = []
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index 7ebc20c7e130..198fefde02d1 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -92,7 +92,7 @@ def __init__(
         self._validator = ToolDefinitionsValidator(
             error_target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             optional_tool_definitions=False,
-            check_for_unsupported_tools=True,
+            check_for_unsupported_tools=False,
         )
 
         super().__init__(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
index bae6c9895046..9dc1249dff60 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_version.py
@@ -3,4 +3,4 @@
 # ---------------------------------------------------------
 # represents upcoming version
 
-VERSION = "1.17.0"
+VERSION = "1.17.1"
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py
new file mode 100644
index 000000000000..a4bfeb2e3e1a
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_success_evaluator.py
@@ -0,0 +1,172 @@
+from unittest.mock import MagicMock
+
+import pytest
+
+from azure.ai.evaluation import _ToolCallSuccessEvaluator
+from azure.ai.evaluation._evaluators._tool_call_success._tool_call_success import (
+    _collect_failed_tool_statuses,
+)
+
+
+# Default prompty mock that always grades as PASS. Tests that exercise the
+# deterministic short-circuit path rely on this mock NOT being called.
+async def _flow_pass(timeout, **kwargs):  # pylint: disable=unused-argument
+    return {
+        "llm_output": {
+            "reason": "All tool calls completed successfully.",
+            "score": 1,
+            "properties": {},
+        }
+    }
+
+
+def _assistant_tool_call(tool_call_id="call_1", name="get_weather", arguments=None, status=None):
+    block = {
+        "type": "tool_call",
+        "tool_call_id": tool_call_id,
+        "name": name,
+        "arguments": arguments or {"location": "NYC"},
+    }
+    if status is not None:
+        block["status"] = status
+    return {"role": "assistant", "content": [block]}
+
+
+def _tool_result(tool_call_id="call_1", result="72F sunny", status=None):
+    block = {"type": "tool_result", "tool_result": result}
+    if status is not None:
+        block["status"] = status
+    return {"role": "tool", "tool_call_id": tool_call_id, "content": [block]}
+
+
+# ---------------------------------------------------------------------------
+# _collect_failed_tool_statuses
+# ---------------------------------------------------------------------------
+
+
+class TestCollectFailedToolStatuses:
+    def test_no_status_returns_empty(self):
+        msgs = [_assistant_tool_call(), _tool_result()]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_completed_status_returns_empty(self):
+        msgs = [
+            _assistant_tool_call(status="completed"),
+            _tool_result(status="completed"),
+        ]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    @pytest.mark.parametrize("status", ["failed", "error", "incomplete", "cancelled", "canceled"])
+    def test_known_failure_status_on_tool_call_is_collected(self, status):
+        msgs = [_assistant_tool_call(status=status)]
+        assert _collect_failed_tool_statuses(msgs) == [status]
+
+    @pytest.mark.parametrize("status", ["FAILED", "Error", "Incomplete"])
+    def test_failure_status_is_case_insensitive(self, status):
+        msgs = [_assistant_tool_call(status=status)]
+        assert _collect_failed_tool_statuses(msgs) == [status.lower()]
+
+    def test_failure_status_on_tool_result_is_collected(self):
+        msgs = [_assistant_tool_call(), _tool_result(status="failed")]
+        assert _collect_failed_tool_statuses(msgs) == ["failed"]
+
+    def test_unknown_status_string_is_ignored(self):
+        msgs = [_assistant_tool_call(status="something_weird")]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_non_string_status_is_ignored(self):
+        msgs = [_assistant_tool_call(status=500)]
+        assert _collect_failed_tool_statuses(msgs) == []
+
+    def test_malformed_inputs_are_tolerated(self):
+        # Non-list input
+        assert _collect_failed_tool_statuses(None) == []
+        assert _collect_failed_tool_statuses("not a list") == []
+        # List with non-dict items + dicts with non-list content
+        msgs = [
+            "string entry",
+            42,
+            {"role": "assistant"},  # no content
+            {"role": "assistant", "content": "not a list"},
+            {"role": "assistant", "content": [None, 1, "x", _assistant_tool_call(status="failed")["content"][0]]},
+        ]
+        assert _collect_failed_tool_statuses(msgs) == ["failed"]
+
+
+# ---------------------------------------------------------------------------
+# _do_eval short-circuit
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.usefixtures("mock_model_config")
+@pytest.mark.unittest
+class TestToolCallSuccessShortCircuit:
+    def test_short_circuits_on_failed_tool_call_status(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [_assistant_tool_call(status="failed"), _tool_result()]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()
+        assert result["tool_call_success"] == 0.0
+        assert result["tool_call_success_score"] == 0.0
+        assert result["tool_call_success_passed"] is False
+        assert result["tool_call_success_result"] == "fail"
+        assert result["tool_call_success_status"] == "completed"
+        assert "failed" in result["tool_call_success_reason"]
+        props = result["tool_call_success_properties"]
+        assert props["short_circuit"] == "tool_status"
+        assert props["failed_statuses"] == ["failed"]
+
+    def test_short_circuits_on_failed_tool_result_status(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [_assistant_tool_call(), _tool_result(status="error")]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()
+        assert result["tool_call_success_result"] == "fail"
+        assert result["tool_call_success_properties"]["failed_statuses"] == ["error"]
+
+    def test_dedupes_and_sorts_failed_statuses_in_reason(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [
+            _assistant_tool_call(tool_call_id="a", status="failed"),
+            _tool_result(tool_call_id="a", status="failed"),
+            _assistant_tool_call(tool_call_id="b", status="error"),
+        ]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_not_called()
+        # Reason joins deduped, sorted statuses
+        assert "error, failed" in result["tool_call_success_reason"]
+        assert result["tool_call_success_properties"]["failed_statuses"] == ["error", "failed"]
+
+    def test_no_short_circuit_when_all_statuses_completed(self, mock_model_config):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [
+            _assistant_tool_call(status="completed"),
+            _tool_result(status="completed"),
+        ]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_called_once()  # Goes to LLM
+        assert result["tool_call_success_passed"] is True
+
+    def test_no_short_circuit_when_status_absent(self, mock_model_config):
+        """Back-compat: traces produced by converters that do not preserve
+        ``status`` continue to be graded by the LLM as before."""
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=_flow_pass)
+
+        response = [_assistant_tool_call(), _tool_result()]
+        result = evaluator(response=response)
+
+        evaluator._flow.assert_called_once()
+        assert result["tool_call_success_passed"] is True
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py
new file mode 100644
index 000000000000..74bc352448a3
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_unsupported_tools_validation.py
@@ -0,0 +1,183 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""
+Regression tests for the change that lets the three tool evaluators
+(ToolCallAccuracy, _ToolInputAccuracy, _ToolCallSuccess) accept
+conversations containing restricted built-in tools.
+
+These evaluators previously rejected any conversation containing tools in
+``ConversationValidator.UNSUPPORTED_TOOLS`` (e.g. ``bing_grounding``,
+``azure_ai_search``). Because none of the three evaluators needs the
+(redacted) tool output body to grade, the rejection has been lifted by setting
+``check_for_unsupported_tools=False`` on each evaluator's input validator.
+
+The tests below exercise the validator directly so they do not need the
+prompty flow or a real model deployment. They also confirm that the
+underlying validator class still rejects restricted tools when
+``check_for_unsupported_tools=True``, so the behavior change is limited
+to the evaluator wiring.
+"""
+
+import pytest
+
+from azure.ai.evaluation import ToolCallAccuracyEvaluator
+from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
+from azure.ai.evaluation._evaluators._common._validators import (
+    ToolCallsValidator,
+    ToolDefinitionsValidator,
+)
+from azure.ai.evaluation._exceptions import ErrorTarget, EvaluationException
+
+
+RESTRICTED_TOOL_NAMES = [
+    "bing_grounding",
+    "bing_custom_search",
+    "azure_ai_search",
+    "azure_fabric",
+    "sharepoint_grounding",
+]
+
+
+def _restricted_response(tool_name: str):
+    return [
+        {
+            "role": "assistant",
+            "content": [
+                {
+                    "type": "tool_call",
+                    "tool_call_id": "call_restricted",
+                    "name": tool_name,
+                    "arguments": {"query": "anything"},
+                }
+            ],
+        }
+    ]
+
+
+def _restricted_tool_definition(tool_name: str):
+    return {
+        "name": tool_name,
+        "description": f"Built-in {tool_name} tool.",
+        "parameters": {
+            "type": "object",
+            "properties": {"query": {"type": "string"}},
+        },
+    }
+
+
+@pytest.mark.usefixtures("mock_model_config")
+@pytest.mark.unittest
+class TestRestrictedToolValidationLifted:
+    """Validator should no longer reject restricted tools for these three evaluators."""
+
+    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
+    def test_tool_call_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name):
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        eval_input = {
+            "query": "Look it up.",
+            "response": _restricted_response(tool_name),
+            "tool_definitions": [_restricted_tool_definition(tool_name)],
+        }
+        # Should not raise EvaluationException; flag flip made this path legal.
+        assert evaluator._validator.validate_eval_input(eval_input) is True
+
+    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
+    def test_tool_input_accuracy_accepts_restricted_tool(self, mock_model_config, tool_name):
+        evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config)
+        eval_input = {
+            "query": "Look it up.",
+            "response": _restricted_response(tool_name),
+            "tool_definitions": [_restricted_tool_definition(tool_name)],
+        }
+        assert evaluator._validator.validate_eval_input(eval_input) is True
+
+    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
+    def test_tool_call_success_accepts_restricted_tool(self, mock_model_config, tool_name):
+        evaluator = _ToolCallSuccessEvaluator(model_config=mock_model_config)
+        eval_input = {
+            "response": _restricted_response(tool_name),
+            "tool_definitions": [_restricted_tool_definition(tool_name)],
+        }
+        assert evaluator._validator.validate_eval_input(eval_input) is True
+
+    def test_mixed_function_and_restricted_tool_accepted(self, mock_model_config):
+        """Conversation containing both a function call and a restricted tool call validates cleanly."""
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        eval_input = {
+            "query": "Find stock price and weather.",
+            "response": [
+                {
+                    "role": "assistant",
+                    "content": [
+                        {
+                            "type": "tool_call",
+                            "tool_call_id": "call_func",
+                            "name": "get_weather",
+                            "arguments": {"location": "Paris"},
+                        },
+                        {
+                            "type": "tool_call",
+                            "tool_call_id": "call_restricted",
+                            "name": "bing_grounding",
+                            "arguments": {"query": "MSFT stock price"},
+                        },
+                    ],
+                }
+            ],
+            "tool_definitions": [
+                {
+                    "name": "get_weather",
+                    "type": "function",
+                    "description": "Weather lookup.",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {"location": {"type": "string"}},
+                        "required": ["location"],
+                    },
+                },
+                _restricted_tool_definition("bing_grounding"),
+            ],
+        }
+        assert evaluator._validator.validate_eval_input(eval_input) is True
+
+
+@pytest.mark.unittest
+class TestUnderlyingValidatorUnchanged:
+    """The validator class itself still rejects restricted tools when the flag is on.
+
+    Ensures the behavior change is limited to per-evaluator wiring; the validator
+    keeps its option to enforce the restricted-tool block for other consumers
+    (e.g. GroundednessEvaluator).
+    """
+
+    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
+    def test_tool_calls_validator_still_rejects_when_flag_enabled(self, tool_name):
+        validator = ToolCallsValidator(
+            error_target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            check_for_unsupported_tools=True,
+        )
+        eval_input = {
+            "query": "Look it up.",
+            "response": _restricted_response(tool_name),
+            "tool_definitions": [_restricted_tool_definition(tool_name)],
+        }
+        with pytest.raises(EvaluationException) as exc_info:
+            validator.validate_eval_input(eval_input)
+        assert "currently not supported" in str(exc_info.value)
+
+    @pytest.mark.parametrize("tool_name", RESTRICTED_TOOL_NAMES)
+    def test_tool_definitions_validator_still_rejects_when_flag_enabled(self, tool_name):
+        validator = ToolDefinitionsValidator(
+            error_target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
+            requires_query=False,
+            check_for_unsupported_tools=True,
+        )
+        eval_input = {
+            "response": _restricted_response(tool_name),
+            "tool_definitions": [_restricted_tool_definition(tool_name)],
+        }
+        with pytest.raises(EvaluationException) as exc_info:
+            validator.validate_eval_input(eval_input)
+        assert "currently not supported" in str(exc_info.value)