From a9a4dcf7e7b1371696e152cc38c8cd3d1043585c Mon Sep 17 00:00:00 2001
From: CL <blueboobyai@gmail.com>
Date: Sun, 14 Jun 2026 10:01:33 -0700
Subject: [PATCH 1/5] fix: retry transient HTTP 400 errors from upstream
 providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DeepSeek and other providers occasionally return HTTP 400 on transient
internal failures (not a real request bug). The retry gate explicitly
excluded 400, so these bypassed the retry loop and killed the session.

Adding 400 to retryable_upstream_status() lets transient 400s enter the
existing exponential-backoff retry loop (5 attempts, 2s base, 60s cap).
Real 400s (malformed requests) simply fail again on every retry — up to
four extra requests (with backoff) before the same 400 is surfaced, with
no billing impact.

Similar in spirit to the AWS SDK retry modes (standard/adaptive), which
classify transient service failures as retryable by error type rather
than by HTTP status code alone.
---
 providers/rate_limit.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/providers/rate_limit.py b/providers/rate_limit.py
index ce85b674d..062bbdfbc 100644
--- a/providers/rate_limit.py
+++ b/providers/rate_limit.py
@@ -37,11 +37,15 @@ def retryable_upstream_status(exc: BaseException) -> int | None:
         status = exc.response.status_code
         if _upstream_http_retryable(status):
             return status
+        if status == 400:
+            return 400
         return None
     if isinstance(exc, openai.APIError):
         status = getattr(exc, "status_code", None)
         if isinstance(status, int) and 500 <= status <= 599:
             return status
+        if status == 400:
+            return 400
         return None
     return None
 

From d2266bb70c0ad4eb61b7fc78d11d2ce525775592 Mon Sep 17 00:00:00 2001
From: CL <blueboobyai@gmail.com>
Date: Sun, 14 Jun 2026 10:27:49 -0700
Subject: [PATCH 2/5] =?UTF-8?q?test:=20prove=20400=20retry=20behavior=20?=
 =?UTF-8?q?=E2=80=94=20httpx=20+=20openai,=20exhaust=20and=20recovery?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three tests in test_provider_rate_limit (the first replaces
test_execute_with_retry_httpx_400_raises_immediately; two are new):
- test_execute_with_retry_400_retried_then_exhausts — asserts 3 calls
- test_execute_with_retry_400_then_200_recovers — asserts recovery
- test_execute_with_retry_openai_400_retried_then_exhausts — asserts 3 calls via openai SDK

One updated test in test_anthropic_messages_429_retry:
- test_transient_400_is_retried_then_exhausts — real execute_with_retry, 5 send calls, SSE error envelope with "Invalid request sent to provider."

All 1440 tests pass with these changes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../test_anthropic_messages_429_retry.py      | 21 +++---
 tests/providers/test_provider_rate_limit.py   | 72 +++++++++++++++++--
 2 files changed, 80 insertions(+), 13 deletions(-)

diff --git a/tests/providers/test_anthropic_messages_429_retry.py b/tests/providers/test_anthropic_messages_429_retry.py
index 7b0fb0e0a..ab379ca3f 100644
--- a/tests/providers/test_anthropic_messages_429_retry.py
+++ b/tests/providers/test_anthropic_messages_429_retry.py
@@ -207,8 +207,8 @@ async def _slot():
 
 
 @pytest.mark.asyncio
-async def test_non_retryable_4xx_http_error_not_retried(provider_config):
-    """HTTP 400 from upstream is not retried; single send (passthrough limiter)."""
+async def test_transient_400_is_retried_then_exhausts(provider_config):
+    """HTTP 400 from upstream IS now retried (transient 400 support); 5 send calls then SSE error."""
     GlobalRateLimiter.reset_instance()
     try:
 
@@ -218,11 +218,14 @@ async def _slot():
 
         with patch("providers.anthropic_messages.GlobalRateLimiter") as mock_gl:
             instance = mock_gl.get_scoped_instance.return_value
-
-            async def _passthrough(fn, *args, **kwargs):
-                return await fn(*args, **kwargs)
-
-            instance.execute_with_retry = AsyncMock(side_effect=_passthrough)
+            real = GlobalRateLimiter(
+                rate_limit=100,
+                rate_window=60,
+                max_concurrency=5,
+            )
+            instance.wait_if_blocked = real.wait_if_blocked
+            instance.execute_with_retry = real.execute_with_retry
+            instance.set_blocked = real.set_blocked
             instance.concurrency_slot.side_effect = _slot
 
             provider = NativeProvider(provider_config)
@@ -239,10 +242,12 @@ async def _passthrough(fn, *args, **kwargs):
                     new_callable=AsyncMock,
                     return_value=err,
                 ) as mock_send,
+                patch("asyncio.sleep", new_callable=AsyncMock),
             ):
                 events = [e async for e in provider.stream_response(req)]
 
-            mock_send.assert_awaited_once()
+            # 1 initial + 4 retries = 5 calls (400 is now retryable with default max_retries=4)
+            assert mock_send.await_count == 5
             assert err.is_closed
             assert_canonical_stream_error_envelope(
                 events, user_message_substr="Invalid request sent to provider"
diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py
index fd40230c7..f1f7b4135 100644
--- a/tests/providers/test_provider_rate_limit.py
+++ b/tests/providers/test_provider_rate_limit.py
@@ -375,8 +375,9 @@ async def always_fail():
             )
 
     @pytest.mark.asyncio
-    async def test_execute_with_retry_httpx_400_raises_immediately(self):
-        """Non-retryable 4xx is not wrapped by execute_with_retry loop."""
+    @pytest.mark.asyncio
+    async def test_execute_with_retry_400_retried_then_exhausts(self):
+        """HTTP 400 is now retried by execute_with_retry (transient 400 support)."""
         import httpx
         from httpx import Request, Response
 
@@ -385,7 +386,7 @@ async def test_execute_with_retry_httpx_400_raises_immediately(self):
 
         call_count = 0
 
-        async def bad_request():
+        async def always_400():
             nonlocal call_count
             call_count += 1
             r = Response(400, request=Request("POST", "http://x"), text="bad request")
@@ -393,14 +394,75 @@ async def bad_request():
 
         with pytest.raises(httpx.HTTPStatusError):
             await limiter.execute_with_retry(
-                bad_request,
+                always_400,
+                max_retries=2,
+                base_delay=0.01,
+                max_delay=0.1,
+                jitter=0,
+            )
+
+        # 1 initial + 2 retries = 3 calls total (400 is now retryable)
+        assert call_count == 3
+
+    @pytest.mark.asyncio
+    async def test_execute_with_retry_400_then_200_recovers(self):
+        """Transient HTTP 400 then success: retry recovers."""
+        import httpx
+        from httpx import Request, Response
+
+        GlobalRateLimiter.reset_instance()
+        limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)
+
+        call_count = 0
+
+        async def fail_then_ok():
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                r = Response(400, request=Request("POST", "http://x"), text="bad request")
+                raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r)
+            return "ok"
+
+        result = await limiter.execute_with_retry(
+            fail_then_ok,
+            max_retries=2,
+            base_delay=0.01,
+            max_delay=0.1,
+            jitter=0,
+        )
+        assert result == "ok"
+        assert call_count == 2
+
+    @pytest.mark.asyncio
+    async def test_execute_with_retry_openai_400_retried_then_exhausts(self):
+        """OpenAI 400 errors are also retried."""
+        import openai
+        from httpx import Request, Response
+
+        GlobalRateLimiter.reset_instance()
+        limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)
+
+        call_count = 0
+
+        async def always_400():
+            nonlocal call_count
+            call_count += 1
+            raise openai.BadRequestError(
+                "bad request",
+                response=Response(400, request=Request("POST", "http://x")),
+                body={},
+            )
+
+        with pytest.raises(openai.BadRequestError):
+            await limiter.execute_with_retry(
+                always_400,
                 max_retries=2,
                 base_delay=0.01,
                 max_delay=0.1,
                 jitter=0,
             )
 
-        assert call_count == 1
+        assert call_count == 3
 
     @pytest.mark.asyncio
     async def test_max_concurrency_zero_raises(self):

From 9a4835f711182929bd9ac3b07d8e2d12d95b5fb2 Mon Sep 17 00:00:00 2001
From: CL <blueboobyai@gmail.com>
Date: Sun, 14 Jun 2026 10:36:25 -0700
Subject: [PATCH 3/5] =?UTF-8?q?fix:=20don't=20call=20set=5Fblocked=20for?=
 =?UTF-8?q?=20HTTP=20400=20=E2=80=94=20avoids=20stalling=20concurrent=20re?=
 =?UTF-8?q?quests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A genuine bad request (wrong model name, malformed prompt) should not
block all concurrent proxy requests during retry backoff. Only 429 and
5xx signal upstream congestion worth a global pause.

Also fixes duplicate @pytest.mark.asyncio decorator on the renamed test,
and bumps version to 1.2.42 per AGENTS.md requirements (version + uv.lock).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 providers/rate_limit.py                     | 7 +++++--
 pyproject.toml                              | 2 +-
 tests/providers/test_provider_rate_limit.py | 1 -
 uv.lock                                     | 2 +-
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/providers/rate_limit.py b/providers/rate_limit.py
index 062bbdfbc..a26a38803 100644
--- a/providers/rate_limit.py
+++ b/providers/rate_limit.py
@@ -241,7 +241,9 @@ async def execute_with_retry(
 
         Waits for the proactive limiter before each attempt. On ``429`` (rate limit)
         or upstream ``5xx`` server errors, applies exponential backoff with jitter
-        and sets the reactive block before retrying.
+        and sets the reactive block before retrying. HTTP 400 is also retried but
+        does NOT set the global reactive block (genuine bad requests should not
+        stall concurrent requests).
 
         Args:
             fn: Async callable to execute.
@@ -303,7 +305,8 @@ async def execute_with_retry(
                     max_attempts=total_attempts,
                     delay_s=round(delay, 3),
                 )
-                self.set_blocked(delay)
+                if status != 400:
+                    self.set_blocked(delay)
                 await asyncio.sleep(delay)
 
         assert last_exc is not None
diff --git a/pyproject.toml b/pyproject.toml
index 5f20d05fa..2a4d34911 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "free-claude-code"
-version = "1.2.41"
+version = "1.2.42"
 description = "Middleware between Claude Code CLI (Anthropic API) and NVIDIA NIM"
 readme = "README.md"
 requires-python = ">=3.14.0"
diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py
index f1f7b4135..9a1f9d343 100644
--- a/tests/providers/test_provider_rate_limit.py
+++ b/tests/providers/test_provider_rate_limit.py
@@ -374,7 +374,6 @@ async def always_fail():
                 always_fail, max_retries=2, base_delay=0.01, max_delay=0.1, jitter=0
             )
 
-    @pytest.mark.asyncio
     @pytest.mark.asyncio
     async def test_execute_with_retry_400_retried_then_exhausts(self):
         """HTTP 400 is now retried by execute_with_retry (transient 400 support)."""
diff --git a/uv.lock b/uv.lock
index c79d02b4c..42ede63b8 100644
--- a/uv.lock
+++ b/uv.lock
@@ -561,7 +561,7 @@ wheels = [
 
 [[package]]
 name = "free-claude-code"
-version = "1.2.41"
+version = "1.2.42"
 source = { editable = "." }
 dependencies = [
     { name = "aiohttp" },

From 714e078086923d6367685f6a5f1f389d885834f1 Mon Sep 17 00:00:00 2001
From: CL <blueboobyai@gmail.com>
Date: Sun, 14 Jun 2026 10:40:55 -0700
Subject: [PATCH 4/5] chore: fix docstring, log label, and add guard comment
 for 400 retries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- retryable_upstream_status docstring now mentions 400 (no reactive block)
- Log label for 400 changed from "Upstream server error (400)" to
  "Transient bad request (400)" — 400 is a client error, not server error
- _upstream_http_retryable docstring notes 400 is intentionally excluded
  (it lives in a separate branch to skip set_blocked)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 providers/rate_limit.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/providers/rate_limit.py b/providers/rate_limit.py
index a26a38803..46b3f2101 100644
--- a/providers/rate_limit.py
+++ b/providers/rate_limit.py
@@ -21,15 +21,20 @@
 
 
 def _upstream_http_retryable(code: int) -> bool:
-    """True for rate limit / upstream server failures that should backoff-retry."""
+    """True for rate limit / upstream server failures that should backoff-retry.
+
+    Does NOT include 400 — 400 retries skip set_blocked (see retryable_upstream_status).
+    """
     return code == 429 or 500 <= code <= 599
 
 
 def retryable_upstream_status(exc: BaseException) -> int | None:
-    """Return HTTP-like status codes that qualify for reactive backoff retries.
+    """Return HTTP-like status codes that qualify for backoff retries.
 
-    ``429`` plus any upstream ``5xx`` use the same exponential backoff and scoped
-    limiter blocking semantics as today's rate-limit path.
+    ``429`` and upstream ``5xx`` use the same exponential backoff and scoped
+    limiter blocking semantics as today's rate-limit path. ``400`` is also
+    retried but does NOT trigger the global reactive block (per-request hiccup,
+    not upstream congestion).
     """
     if isinstance(exc, openai.RateLimitError):
         return 429
@@ -275,6 +280,8 @@ async def execute_with_retry(
                     "Rate limited (429)"
                     if status == 429
                     else f"Upstream server error ({status})"
+                    if status >= 500
+                    else f"Transient bad request ({status})"
                 )
                 last_exc = e
                 if attempt >= max_retries:

From fc142981402a272cca6a81322c0018867950fc60 Mon Sep 17 00:00:00 2001
From: CL <blueboobyai@gmail.com>
Date: Sun, 14 Jun 2026 10:47:00 -0700
Subject: [PATCH 5/5] fix: add explicit BadRequestError guard, shorter
 base_delay for 400
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add an explicit isinstance(exc, openai.BadRequestError) check that
  returns 400 before the generic openai.APIError branch. BadRequestError
  is an APIError subclass, so the generic branch only catches it when
  the status_code attribute is set; the explicit check is defensive.
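- Use a fixed 0.5s base delay for 400 retries instead of the caller's
  base_delay (2s by default, still used for 429/5xx); a transient
  DeepSeek hiccup resolves in <500ms, so 2s was unnecessarily slow.
  Note: the 0.5s value replaces base_delay even when the caller passes
  a smaller one.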

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 providers/rate_limit.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/providers/rate_limit.py b/providers/rate_limit.py
index 46b3f2101..15ade0089 100644
--- a/providers/rate_limit.py
+++ b/providers/rate_limit.py
@@ -45,6 +45,8 @@ def retryable_upstream_status(exc: BaseException) -> int | None:
         if status == 400:
             return 400
         return None
+    if isinstance(exc, openai.BadRequestError):
+        return 400
     if isinstance(exc, openai.APIError):
         status = getattr(exc, "status_code", None)
         if isinstance(status, int) and 500 <= status <= 599:
@@ -293,7 +295,8 @@ async def execute_with_retry(
                     )
                     break
 
-                delay = min(base_delay * (2**attempt), max_delay)
+                effective_base = 0.5 if status == 400 else base_delay
+                delay = min(effective_base * (2**attempt), max_delay)
                 delay += random.uniform(0, jitter)
                 attempt_no = attempt + 1
                 logger.warning(