BlueBoobyAI · BlueBoobyAI · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/providers/rate_limit.py b/providers/rate_limit.py
@@ -21,27 +21,38 @@
 
 
 def _upstream_http_retryable(code: int) -> bool:
-    """True for rate limit / upstream server failures that should backoff-retry."""
+    """True for rate limit / upstream server failures that should backoff-retry.
+
+    Does NOT include 400 — 400 retries skip set_blocked (see retryable_upstream_status).
+    """
     return code == 429 or 500 <= code <= 599
 
 
 def retryable_upstream_status(exc: BaseException) -> int | None:
-    """Return HTTP-like status codes that qualify for reactive backoff retries.
+    """Return HTTP-like status codes that qualify for backoff retries.
 
-    ``429`` plus any upstream ``5xx`` use the same exponential backoff and scoped
-    limiter blocking semantics as today's rate-limit path.
+    ``429`` and upstream ``5xx`` use the same exponential backoff and scoped
+    limiter blocking semantics as today's rate-limit path. ``400`` is also
+    retried but does NOT trigger the global reactive block (per-request hiccup,
+    not upstream congestion).
     """
     if isinstance(exc, openai.RateLimitError):
         return 429
     if isinstance(exc, httpx.HTTPStatusError):
         status = exc.response.status_code
         if _upstream_http_retryable(status):
             return status
+        if status == 400:
+            return 400
         return None
+    if isinstance(exc, openai.BadRequestError):
+        return 400
     if isinstance(exc, openai.APIError):
         status = getattr(exc, "status_code", None)
         if isinstance(status, int) and 500 <= status <= 599:
             return status
+        if status == 400:
+            return 400
         return None
     return None
 
@@ -237,7 +248,9 @@ async def execute_with_retry(
 
         Waits for the proactive limiter before each attempt. On ``429`` (rate limit)
         or upstream ``5xx`` server errors, applies exponential backoff with jitter
-        and sets the reactive block before retrying.
+        and sets the reactive block before retrying. HTTP 400 is also retried but
+        does NOT set the global reactive block (genuine bad requests should not
+        stall concurrent requests).
 
         Args:
             fn: Async callable to execute.
@@ -269,6 +282,8 @@ async def execute_with_retry(
                     "Rate limited (429)"
                     if status == 429
                     else f"Upstream server error ({status})"
+                    if status >= 500
+                    else f"Transient bad request ({status})"
                 )
                 last_exc = e
                 if attempt >= max_retries:
@@ -280,7 +295,8 @@ async def execute_with_retry(
                     )
                     break
 
-                delay = min(base_delay * (2**attempt), max_delay)
+                effective_base = 0.5 if status == 400 else base_delay
+                delay = min(effective_base * (2**attempt), max_delay)
                 delay += random.uniform(0, jitter)
                 attempt_no = attempt + 1
                 logger.warning(
@@ -299,7 +315,8 @@ async def execute_with_retry(
                     max_attempts=total_attempts,
                     delay_s=round(delay, 3),
                 )
-                self.set_blocked(delay)
+                if status != 400:
+                    self.set_blocked(delay)
                 await asyncio.sleep(delay)
 
         assert last_exc is not None

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "free-claude-code"
-version = "1.2.41"
+version = "1.2.42"
 description = "Middleware between Claude Code CLI (Anthropic API) and NVIDIA NIM"
 readme = "README.md"
 requires-python = ">=3.14.0"

diff --git a/tests/providers/test_anthropic_messages_429_retry.py b/tests/providers/test_anthropic_messages_429_retry.py
@@ -207,8 +207,8 @@ async def _slot():
 
 
 @pytest.mark.asyncio
-async def test_non_retryable_4xx_http_error_not_retried(provider_config):
-    """HTTP 400 from upstream is not retried; single send (passthrough limiter)."""
+async def test_transient_400_is_retried_then_exhausts(provider_config):
+    """HTTP 400 from upstream IS now retried (transient 400 support); 5 send calls then SSE error."""
     GlobalRateLimiter.reset_instance()
     try:
 
@@ -218,11 +218,14 @@ async def _slot():
 
         with patch("providers.anthropic_messages.GlobalRateLimiter") as mock_gl:
             instance = mock_gl.get_scoped_instance.return_value
-
-            async def _passthrough(fn, *args, **kwargs):
-                return await fn(*args, **kwargs)
-
-            instance.execute_with_retry = AsyncMock(side_effect=_passthrough)
+            real = GlobalRateLimiter(
+                rate_limit=100,
+                rate_window=60,
+                max_concurrency=5,
+            )
+            instance.wait_if_blocked = real.wait_if_blocked
+            instance.execute_with_retry = real.execute_with_retry
+            instance.set_blocked = real.set_blocked
             instance.concurrency_slot.side_effect = _slot
 
             provider = NativeProvider(provider_config)
@@ -239,10 +242,12 @@ async def _passthrough(fn, *args, **kwargs):
                     new_callable=AsyncMock,
                     return_value=err,
                 ) as mock_send,
+                patch("asyncio.sleep", new_callable=AsyncMock),
             ):
                 events = [e async for e in provider.stream_response(req)]
 
-            mock_send.assert_awaited_once()
+            # 1 initial + 4 retries = 5 calls (400 is now retryable with default max_retries=4)
+            assert mock_send.await_count == 5
             assert err.is_closed
             assert_canonical_stream_error_envelope(
                 events, user_message_substr="Invalid request sent to provider"

diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py
@@ -375,8 +375,8 @@ async def always_fail():
             )
 
     @pytest.mark.asyncio
-    async def test_execute_with_retry_httpx_400_raises_immediately(self):
-        """Non-retryable 4xx is not wrapped by execute_with_retry loop."""
+    async def test_execute_with_retry_400_retried_then_exhausts(self):
+        """HTTP 400 is now retried by execute_with_retry (transient 400 support)."""
         import httpx
         from httpx import Request, Response
 
@@ -385,22 +385,83 @@ async def test_execute_with_retry_httpx_400_raises_immediately(self):
 
         call_count = 0
 
-        async def bad_request():
+        async def always_400():
             nonlocal call_count
             call_count += 1
             r = Response(400, request=Request("POST", "http://x"), text="bad request")
             raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r)
 
         with pytest.raises(httpx.HTTPStatusError):
             await limiter.execute_with_retry(
-                bad_request,
+                always_400,
                 max_retries=2,
                 base_delay=0.01,
                 max_delay=0.1,
                 jitter=0,
             )
 
-        assert call_count == 1
+        # 1 initial + 2 retries = 3 calls total (400 is now retryable)
+        assert call_count == 3
+
+    @pytest.mark.asyncio
+    async def test_execute_with_retry_400_then_200_recovers(self):
+        """Transient HTTP 400 then success: retry recovers."""
+        import httpx
+        from httpx import Request, Response
+
+        GlobalRateLimiter.reset_instance()
+        limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)
+
+        call_count = 0
+
+        async def fail_then_ok():
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                r = Response(400, request=Request("POST", "http://x"), text="bad request")
+                raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r)
+            return "ok"
+
+        result = await limiter.execute_with_retry(
+            fail_then_ok,
+            max_retries=2,
+            base_delay=0.01,
+            max_delay=0.1,
+            jitter=0,
+        )
+        assert result == "ok"
+        assert call_count == 2
+
+    @pytest.mark.asyncio
+    async def test_execute_with_retry_openai_400_retried_then_exhausts(self):
+        """OpenAI 400 errors are also retried."""
+        import openai
+        from httpx import Request, Response
+
+        GlobalRateLimiter.reset_instance()
+        limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)
+
+        call_count = 0
+
+        async def always_400():
+            nonlocal call_count
+            call_count += 1
+            raise openai.BadRequestError(
+                "bad request",
+                response=Response(400, request=Request("POST", "http://x")),
+                body={},
+            )
+
+        with pytest.raises(openai.BadRequestError):
+            await limiter.execute_with_retry(
+                always_400,
+                max_retries=2,
+                base_delay=0.01,
+                max_delay=0.1,
+                jitter=0,
+            )
+
+        assert call_count == 3
 
     @pytest.mark.asyncio
     async def test_max_concurrency_zero_raises(self):

diff --git a/uv.lock b/uv.lock