From a9a4dcf7e7b1371696e152cc38c8cd3d1043585c Mon Sep 17 00:00:00 2001 From: CL Date: Sun, 14 Jun 2026 10:01:33 -0700 Subject: [PATCH 1/5] fix: retry transient HTTP 400 errors from upstream providers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DeepSeek and other providers occasionally return HTTP 400 on transient internal failures (not a real request bug). The retry gate explicitly excluded 400, so these bypassed the retry loop and killed the session. Adding 400 to retryable_upstream_status() lets transient 400s enter the existing exponential-backoff retry loop (5 attempts, 2s base, 60s cap). Real 400s (malformed requests) simply retry to the same 400 — up to four extra fast requests (plus the backoff delay between them) with no billing impact. Same pattern as AWS SDK's RetryMode.ADAPTIVE — classify transient service failures as retryable regardless of status code. --- providers/rate_limit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/providers/rate_limit.py b/providers/rate_limit.py index ce85b674d..062bbdfbc 100644 --- a/providers/rate_limit.py +++ b/providers/rate_limit.py @@ -37,11 +37,15 @@ def retryable_upstream_status(exc: BaseException) -> int | None: status = exc.response.status_code if _upstream_http_retryable(status): return status + if status == 400: + return 400 return None if isinstance(exc, openai.APIError): status = getattr(exc, "status_code", None) if isinstance(status, int) and 500 <= status <= 599: return status + if status == 400: + return 400 return None return None From d2266bb70c0ad4eb61b7fc78d11d2ce525775592 Mon Sep 17 00:00:00 2001 From: CL Date: Sun, 14 Jun 2026 10:27:49 -0700 Subject: [PATCH 2/5] =?UTF-8?q?test:=20prove=20400=20retry=20behavior=20?= =?UTF-8?q?=E2=80=94=20httpx=20+=20openai,=20exhaust=20and=20recovery?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new tests in test_provider_rate_limit: - test_execute_with_retry_400_retried_then_exhausts — asserts 3 calls - 
test_execute_with_retry_400_then_200_recovers — asserts recovery - test_execute_with_retry_openai_400_retried_then_exhausts — asserts 3 calls via openai SDK One updated test in test_anthropic_messages_429_retry: - test_transient_400_is_retried_then_exhausts — real execute_with_retry, 5 send calls, SSE error envelope with "Invalid request sent to provider." All 1440 tests pass with these changes. Co-Authored-By: Claude Sonnet 4.6 --- .../test_anthropic_messages_429_retry.py | 21 +++--- tests/providers/test_provider_rate_limit.py | 72 +++++++++++++++++-- 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/tests/providers/test_anthropic_messages_429_retry.py b/tests/providers/test_anthropic_messages_429_retry.py index 7b0fb0e0a..ab379ca3f 100644 --- a/tests/providers/test_anthropic_messages_429_retry.py +++ b/tests/providers/test_anthropic_messages_429_retry.py @@ -207,8 +207,8 @@ async def _slot(): @pytest.mark.asyncio -async def test_non_retryable_4xx_http_error_not_retried(provider_config): - """HTTP 400 from upstream is not retried; single send (passthrough limiter).""" +async def test_transient_400_is_retried_then_exhausts(provider_config): + """HTTP 400 from upstream IS now retried (transient 400 support); 5 send calls then SSE error.""" GlobalRateLimiter.reset_instance() try: @@ -218,11 +218,14 @@ async def _slot(): with patch("providers.anthropic_messages.GlobalRateLimiter") as mock_gl: instance = mock_gl.get_scoped_instance.return_value - - async def _passthrough(fn, *args, **kwargs): - return await fn(*args, **kwargs) - - instance.execute_with_retry = AsyncMock(side_effect=_passthrough) + real = GlobalRateLimiter( + rate_limit=100, + rate_window=60, + max_concurrency=5, + ) + instance.wait_if_blocked = real.wait_if_blocked + instance.execute_with_retry = real.execute_with_retry + instance.set_blocked = real.set_blocked instance.concurrency_slot.side_effect = _slot provider = NativeProvider(provider_config) @@ -239,10 +242,12 @@ async def 
_passthrough(fn, *args, **kwargs): new_callable=AsyncMock, return_value=err, ) as mock_send, + patch("asyncio.sleep", new_callable=AsyncMock), ): events = [e async for e in provider.stream_response(req)] - mock_send.assert_awaited_once() + # 1 initial + 4 retries = 5 calls (400 is now retryable with default max_retries=4) + assert mock_send.await_count == 5 assert err.is_closed assert_canonical_stream_error_envelope( events, user_message_substr="Invalid request sent to provider" diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py index fd40230c7..f1f7b4135 100644 --- a/tests/providers/test_provider_rate_limit.py +++ b/tests/providers/test_provider_rate_limit.py @@ -375,8 +375,9 @@ async def always_fail(): ) @pytest.mark.asyncio - async def test_execute_with_retry_httpx_400_raises_immediately(self): - """Non-retryable 4xx is not wrapped by execute_with_retry loop.""" + @pytest.mark.asyncio + async def test_execute_with_retry_400_retried_then_exhausts(self): + """HTTP 400 is now retried by execute_with_retry (transient 400 support).""" import httpx from httpx import Request, Response @@ -385,7 +386,7 @@ async def test_execute_with_retry_httpx_400_raises_immediately(self): call_count = 0 - async def bad_request(): + async def always_400(): nonlocal call_count call_count += 1 r = Response(400, request=Request("POST", "http://x"), text="bad request") @@ -393,14 +394,75 @@ async def bad_request(): with pytest.raises(httpx.HTTPStatusError): await limiter.execute_with_retry( - bad_request, + always_400, + max_retries=2, + base_delay=0.01, + max_delay=0.1, + jitter=0, + ) + + # 1 initial + 2 retries = 3 calls total (400 is now retryable) + assert call_count == 3 + + @pytest.mark.asyncio + async def test_execute_with_retry_400_then_200_recovers(self): + """Transient HTTP 400 then success: retry recovers.""" + import httpx + from httpx import Request, Response + + GlobalRateLimiter.reset_instance() + limiter = 
GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60) + + call_count = 0 + + async def fail_then_ok(): + nonlocal call_count + call_count += 1 + if call_count == 1: + r = Response(400, request=Request("POST", "http://x"), text="bad request") + raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r) + return "ok" + + result = await limiter.execute_with_retry( + fail_then_ok, + max_retries=2, + base_delay=0.01, + max_delay=0.1, + jitter=0, + ) + assert result == "ok" + assert call_count == 2 + + @pytest.mark.asyncio + async def test_execute_with_retry_openai_400_retried_then_exhausts(self): + """OpenAI 400 errors are also retried.""" + import openai + from httpx import Request, Response + + GlobalRateLimiter.reset_instance() + limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60) + + call_count = 0 + + async def always_400(): + nonlocal call_count + call_count += 1 + raise openai.BadRequestError( + "bad request", + response=Response(400, request=Request("POST", "http://x")), + body={}, + ) + + with pytest.raises(openai.BadRequestError): + await limiter.execute_with_retry( + always_400, max_retries=2, base_delay=0.01, max_delay=0.1, jitter=0, ) - assert call_count == 1 + assert call_count == 3 @pytest.mark.asyncio async def test_max_concurrency_zero_raises(self): From 9a4835f711182929bd9ac3b07d8e2d12d95b5fb2 Mon Sep 17 00:00:00 2001 From: CL Date: Sun, 14 Jun 2026 10:36:25 -0700 Subject: [PATCH 3/5] =?UTF-8?q?fix:=20don't=20call=20set=5Fblocked=20for?= =?UTF-8?q?=20HTTP=20400=20=E2=80=94=20avoids=20stalling=20concurrent=20re?= =?UTF-8?q?quests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A genuine bad request (wrong model name, malformed prompt) should not block all concurrent proxy requests during retry backoff. Only 429 and 5xx signal upstream congestion worth a global pause. 
Also fixes duplicate @pytest.mark.asyncio decorator on the renamed test, and bumps version to 1.2.42 per AGENTS.md requirements (version + uv.lock). Co-Authored-By: Claude Sonnet 4.6 --- providers/rate_limit.py | 7 +++++-- pyproject.toml | 2 +- tests/providers/test_provider_rate_limit.py | 1 - uv.lock | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/providers/rate_limit.py b/providers/rate_limit.py index 062bbdfbc..a26a38803 100644 --- a/providers/rate_limit.py +++ b/providers/rate_limit.py @@ -241,7 +241,9 @@ async def execute_with_retry( Waits for the proactive limiter before each attempt. On ``429`` (rate limit) or upstream ``5xx`` server errors, applies exponential backoff with jitter - and sets the reactive block before retrying. + and sets the reactive block before retrying. HTTP 400 is also retried but + does NOT set the global reactive block (genuine bad requests should not + stall concurrent requests). Args: fn: Async callable to execute. @@ -303,7 +305,8 @@ async def execute_with_retry( max_attempts=total_attempts, delay_s=round(delay, 3), ) - self.set_blocked(delay) + if status != 400: + self.set_blocked(delay) await asyncio.sleep(delay) assert last_exc is not None diff --git a/pyproject.toml b/pyproject.toml index 5f20d05fa..2a4d34911 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "free-claude-code" -version = "1.2.41" +version = "1.2.42" description = "Middleware between Claude Code CLI (Anthropic API) and NVIDIA NIM" readme = "README.md" requires-python = ">=3.14.0" diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py index f1f7b4135..9a1f9d343 100644 --- a/tests/providers/test_provider_rate_limit.py +++ b/tests/providers/test_provider_rate_limit.py @@ -374,7 +374,6 @@ async def always_fail(): always_fail, max_retries=2, base_delay=0.01, max_delay=0.1, jitter=0 ) - @pytest.mark.asyncio @pytest.mark.asyncio async 
def test_execute_with_retry_400_retried_then_exhausts(self): """HTTP 400 is now retried by execute_with_retry (transient 400 support).""" diff --git a/uv.lock b/uv.lock index c79d02b4c..42ede63b8 100644 --- a/uv.lock +++ b/uv.lock @@ -561,7 +561,7 @@ wheels = [ [[package]] name = "free-claude-code" -version = "1.2.41" +version = "1.2.42" source = { editable = "." } dependencies = [ { name = "aiohttp" }, From 714e078086923d6367685f6a5f1f389d885834f1 Mon Sep 17 00:00:00 2001 From: CL Date: Sun, 14 Jun 2026 10:40:55 -0700 Subject: [PATCH 4/5] chore: fix docstring, log label, and add guard comment for 400 retries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - retryable_upstream_status docstring now mentions 400 (no reactive block) - Log label for 400 changed from "Upstream server error (400)" to "Transient bad request (400)" — 400 is a client error, not server error - _upstream_http_retryable docstring notes 400 is intentionally excluded (it lives in a separate branch to skip set_blocked) Co-Authored-By: Claude Sonnet 4.6 --- providers/rate_limit.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/providers/rate_limit.py b/providers/rate_limit.py index a26a38803..46b3f2101 100644 --- a/providers/rate_limit.py +++ b/providers/rate_limit.py @@ -21,15 +21,20 @@ def _upstream_http_retryable(code: int) -> bool: - """True for rate limit / upstream server failures that should backoff-retry.""" + """True for rate limit / upstream server failures that should backoff-retry. + + Does NOT include 400 — 400 retries skip set_blocked (see retryable_upstream_status). + """ return code == 429 or 500 <= code <= 599 def retryable_upstream_status(exc: BaseException) -> int | None: - """Return HTTP-like status codes that qualify for reactive backoff retries. + """Return HTTP-like status codes that qualify for backoff retries. 
- ``429`` plus any upstream ``5xx`` use the same exponential backoff and scoped - limiter blocking semantics as today's rate-limit path. + ``429`` and upstream ``5xx`` use the same exponential backoff and scoped + limiter blocking semantics as today's rate-limit path. ``400`` is also + retried but does NOT trigger the global reactive block (per-request hiccup, + not upstream congestion). """ if isinstance(exc, openai.RateLimitError): return 429 @@ -275,6 +280,8 @@ async def execute_with_retry( "Rate limited (429)" if status == 429 else f"Upstream server error ({status})" + if status >= 500 + else f"Transient bad request ({status})" ) last_exc = e if attempt >= max_retries: From fc142981402a272cca6a81322c0018867950fc60 Mon Sep 17 00:00:00 2001 From: CL Date: Sun, 14 Jun 2026 10:47:00 -0700 Subject: [PATCH 5/5] fix: add explicit BadRequestError guard, shorter base_delay for 400 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add isinstance(exc, openai.BadRequestError): return 400 before the generic openai.APIError branch (BadRequestError is a subclass, so it would pass through the generic branch only if status_code attr is present — defensive ordering) - Use 0.5s base_delay for 400 retries vs 2s for 429/5xx (a transient DeepSeek hiccup resolves in <500ms; 2s was unnecessarily slow) Co-Authored-By: Claude Sonnet 4.6 --- providers/rate_limit.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/providers/rate_limit.py b/providers/rate_limit.py index 46b3f2101..15ade0089 100644 --- a/providers/rate_limit.py +++ b/providers/rate_limit.py @@ -45,6 +45,8 @@ def retryable_upstream_status(exc: BaseException) -> int | None: if status == 400: return 400 return None + if isinstance(exc, openai.BadRequestError): + return 400 if isinstance(exc, openai.APIError): status = getattr(exc, "status_code", None) if isinstance(status, int) and 500 <= status <= 599: @@ -293,7 +295,8 @@ async def execute_with_retry( ) break - 
delay = min(base_delay * (2**attempt), max_delay) + effective_base = 0.5 if status == 400 else base_delay + delay = min(effective_base * (2**attempt), max_delay) delay += random.uniform(0, jitter) attempt_no = attempt + 1 logger.warning(