diff --git a/providers/rate_limit.py b/providers/rate_limit.py index ce85b674..15ade008 100644 --- a/providers/rate_limit.py +++ b/providers/rate_limit.py @@ -21,15 +21,20 @@ def _upstream_http_retryable(code: int) -> bool: - """True for rate limit / upstream server failures that should backoff-retry.""" + """True for rate limit / upstream server failures that should backoff-retry. + + Does NOT include 400 — 400 retries skip set_blocked (see retryable_upstream_status). + """ return code == 429 or 500 <= code <= 599 def retryable_upstream_status(exc: BaseException) -> int | None: - """Return HTTP-like status codes that qualify for reactive backoff retries. + """Return HTTP-like status codes that qualify for backoff retries. - ``429`` plus any upstream ``5xx`` use the same exponential backoff and scoped - limiter blocking semantics as today's rate-limit path. + ``429`` and upstream ``5xx`` use the same exponential backoff and scoped + limiter blocking semantics as today's rate-limit path. ``400`` is also + retried but does NOT trigger the global reactive block (per-request hiccup, + not upstream congestion). """ if isinstance(exc, openai.RateLimitError): return 429 @@ -37,11 +42,17 @@ def retryable_upstream_status(exc: BaseException) -> int | None: status = exc.response.status_code if _upstream_http_retryable(status): return status + if status == 400: + return 400 return None + if isinstance(exc, openai.BadRequestError): + return 400 if isinstance(exc, openai.APIError): status = getattr(exc, "status_code", None) if isinstance(status, int) and 500 <= status <= 599: return status + if status == 400: + return 400 return None return None @@ -237,7 +248,9 @@ async def execute_with_retry( Waits for the proactive limiter before each attempt. On ``429`` (rate limit) or upstream ``5xx`` server errors, applies exponential backoff with jitter - and sets the reactive block before retrying. + and sets the reactive block before retrying. HTTP 400 is also retried but + does NOT set the global reactive block (genuine bad requests should not + stall concurrent requests). Args: fn: Async callable to execute. @@ -269,6 +282,8 @@ async def execute_with_retry( "Rate limited (429)" if status == 429 else f"Upstream server error ({status})" + if status >= 500 + else f"Transient bad request ({status})" ) last_exc = e if attempt >= max_retries: @@ -280,7 +295,8 @@ async def execute_with_retry( ) break - delay = min(base_delay * (2**attempt), max_delay) + effective_base = 0.5 if status == 400 else base_delay + delay = min(effective_base * (2**attempt), max_delay) delay += random.uniform(0, jitter) attempt_no = attempt + 1 logger.warning( @@ -299,7 +315,8 @@ async def execute_with_retry( max_attempts=total_attempts, delay_s=round(delay, 3), ) - self.set_blocked(delay) + if status != 400: + self.set_blocked(delay) await asyncio.sleep(delay) assert last_exc is not None diff --git a/pyproject.toml b/pyproject.toml index 5f20d05f..2a4d3491 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "free-claude-code" -version = "1.2.41" +version = "1.2.42" description = "Middleware between Claude Code CLI (Anthropic API) and NVIDIA NIM" readme = "README.md" requires-python = ">=3.14.0" diff --git a/tests/providers/test_anthropic_messages_429_retry.py b/tests/providers/test_anthropic_messages_429_retry.py index 7b0fb0e0..ab379ca3 100644 --- a/tests/providers/test_anthropic_messages_429_retry.py +++ b/tests/providers/test_anthropic_messages_429_retry.py @@ -207,8 +207,8 @@ async def _slot(): @pytest.mark.asyncio -async def test_non_retryable_4xx_http_error_not_retried(provider_config): - """HTTP 400 from upstream is not retried; single send (passthrough limiter).""" +async def test_transient_400_is_retried_then_exhausts(provider_config): + """HTTP 400 from upstream IS now retried (transient 400 support); 5 send calls then SSE error.""" GlobalRateLimiter.reset_instance() try: @@ -218,11 +218,14 @@ async def _slot(): with patch("providers.anthropic_messages.GlobalRateLimiter") as mock_gl: instance = mock_gl.get_scoped_instance.return_value - - async def _passthrough(fn, *args, **kwargs): - return await fn(*args, **kwargs) - - instance.execute_with_retry = AsyncMock(side_effect=_passthrough) + real = GlobalRateLimiter( + rate_limit=100, + rate_window=60, + max_concurrency=5, + ) + instance.wait_if_blocked = real.wait_if_blocked + instance.execute_with_retry = real.execute_with_retry + instance.set_blocked = real.set_blocked instance.concurrency_slot.side_effect = _slot provider = NativeProvider(provider_config) @@ -239,10 +242,12 @@ async def _passthrough(fn, *args, **kwargs): new_callable=AsyncMock, return_value=err, ) as mock_send, + patch("asyncio.sleep", new_callable=AsyncMock), ): events = [e async for e in provider.stream_response(req)] - mock_send.assert_awaited_once() + # 1 initial + 4 retries = 5 calls (400 is now retryable with default max_retries=4) + assert mock_send.await_count == 5 assert err.is_closed assert_canonical_stream_error_envelope( events, user_message_substr="Invalid request sent to provider" diff --git a/tests/providers/test_provider_rate_limit.py b/tests/providers/test_provider_rate_limit.py index fd40230c..9a1f9d34 100644 --- a/tests/providers/test_provider_rate_limit.py +++ b/tests/providers/test_provider_rate_limit.py @@ -375,8 +375,8 @@ async def always_fail(): ) @pytest.mark.asyncio - async def test_execute_with_retry_httpx_400_raises_immediately(self): - """Non-retryable 4xx is not wrapped by execute_with_retry loop.""" + async def test_execute_with_retry_400_retried_then_exhausts(self): + """HTTP 400 is now retried by execute_with_retry (transient 400 support).""" import httpx from httpx import Request, Response @@ -385,7 +385,7 @@ async def test_execute_with_retry_httpx_400_raises_immediately(self): call_count = 0 - async def bad_request(): + async def always_400(): nonlocal call_count call_count += 1 r = Response(400, request=Request("POST", "http://x"), text="bad request") @@ -393,14 +393,75 @@ async def bad_request(): with pytest.raises(httpx.HTTPStatusError): await limiter.execute_with_retry( - bad_request, + always_400, max_retries=2, base_delay=0.01, max_delay=0.1, jitter=0, ) - assert call_count == 1 + # 1 initial + 2 retries = 3 calls total (400 is now retryable) + assert call_count == 3 + + @pytest.mark.asyncio + async def test_execute_with_retry_400_then_200_recovers(self): + """Transient HTTP 400 then success: retry recovers.""" + import httpx + from httpx import Request, Response + + GlobalRateLimiter.reset_instance() + limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60) + + call_count = 0 + + async def fail_then_ok(): + nonlocal call_count + call_count += 1 + if call_count == 1: + r = Response(400, request=Request("POST", "http://x"), text="bad request") + raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r) + return "ok" + + result = await limiter.execute_with_retry( + fail_then_ok, + max_retries=2, + base_delay=0.01, + max_delay=0.1, + jitter=0, + ) + assert result == "ok" + assert call_count == 2 + + @pytest.mark.asyncio + async def test_execute_with_retry_openai_400_retried_then_exhausts(self): + """OpenAI 400 errors are also retried.""" + import openai + from httpx import Request, Response + + GlobalRateLimiter.reset_instance() + limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60) + + call_count = 0 + + async def always_400(): + nonlocal call_count + call_count += 1 + raise openai.BadRequestError( + "bad request", + response=Response(400, request=Request("POST", "http://x")), + body={}, + ) + + with pytest.raises(openai.BadRequestError): + await limiter.execute_with_retry( + always_400, + max_retries=2, + base_delay=0.01, + max_delay=0.1, + jitter=0, + ) + + assert call_count == 3 @pytest.mark.asyncio async def test_max_concurrency_zero_raises(self): diff --git a/uv.lock b/uv.lock index c79d02b4..42ede63b 100644 --- a/uv.lock +++ b/uv.lock @@ -561,7 +561,7 @@ wheels = [ [[package]] name = "free-claude-code" -version = "1.2.41" +version = "1.2.42" source = { editable = "." } dependencies = [ { name = "aiohttp" },