Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions providers/rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,38 @@


def _upstream_http_retryable(code: int) -> bool:
"""True for rate limit / upstream server failures that should backoff-retry."""
"""True for rate limit / upstream server failures that should backoff-retry.

Does NOT include 400 — 400 retries skip set_blocked (see retryable_upstream_status).
"""
return code == 429 or 500 <= code <= 599


def retryable_upstream_status(exc: BaseException) -> int | None:
"""Return HTTP-like status codes that qualify for reactive backoff retries.
"""Return HTTP-like status codes that qualify for backoff retries.

``429`` plus any upstream ``5xx`` use the same exponential backoff and scoped
limiter blocking semantics as today's rate-limit path.
``429`` and upstream ``5xx`` use the same exponential backoff and scoped
limiter blocking semantics as today's rate-limit path. ``400`` is also
retried but does NOT trigger the global reactive block (per-request hiccup,
not upstream congestion).
"""
if isinstance(exc, openai.RateLimitError):
return 429
if isinstance(exc, httpx.HTTPStatusError):
status = exc.response.status_code
if _upstream_http_retryable(status):
return status
if status == 400:
return 400
return None
if isinstance(exc, openai.BadRequestError):
return 400
if isinstance(exc, openai.APIError):
status = getattr(exc, "status_code", None)
if isinstance(status, int) and 500 <= status <= 599:
return status
if status == 400:
return 400
return None
return None

Expand Down Expand Up @@ -237,7 +248,9 @@ async def execute_with_retry(

Waits for the proactive limiter before each attempt. On ``429`` (rate limit)
or upstream ``5xx`` server errors, applies exponential backoff with jitter
and sets the reactive block before retrying.
and sets the reactive block before retrying. HTTP 400 is also retried but
does NOT set the global reactive block (genuine bad requests should not
stall concurrent requests).

Args:
fn: Async callable to execute.
Expand Down Expand Up @@ -269,6 +282,8 @@ async def execute_with_retry(
"Rate limited (429)"
if status == 429
else f"Upstream server error ({status})"
if status >= 500
else f"Transient bad request ({status})"
)
last_exc = e
if attempt >= max_retries:
Expand All @@ -280,7 +295,8 @@ async def execute_with_retry(
)
break

delay = min(base_delay * (2**attempt), max_delay)
effective_base = 0.5 if status == 400 else base_delay
delay = min(effective_base * (2**attempt), max_delay)
delay += random.uniform(0, jitter)
attempt_no = attempt + 1
logger.warning(
Expand All @@ -299,7 +315,8 @@ async def execute_with_retry(
max_attempts=total_attempts,
delay_s=round(delay, 3),
)
self.set_blocked(delay)
if status != 400:
self.set_blocked(delay)
await asyncio.sleep(delay)

assert last_exc is not None
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "free-claude-code"
version = "1.2.41"
version = "1.2.42"
description = "Middleware between Claude Code CLI (Anthropic API) and NVIDIA NIM"
readme = "README.md"
requires-python = ">=3.14.0"
Expand Down
21 changes: 13 additions & 8 deletions tests/providers/test_anthropic_messages_429_retry.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,8 @@ async def _slot():


@pytest.mark.asyncio
async def test_non_retryable_4xx_http_error_not_retried(provider_config):
"""HTTP 400 from upstream is not retried; single send (passthrough limiter)."""
async def test_transient_400_is_retried_then_exhausts(provider_config):
"""HTTP 400 from upstream IS now retried (transient 400 support); 5 send calls then SSE error."""
GlobalRateLimiter.reset_instance()
try:

Expand All @@ -218,11 +218,14 @@ async def _slot():

with patch("providers.anthropic_messages.GlobalRateLimiter") as mock_gl:
instance = mock_gl.get_scoped_instance.return_value

async def _passthrough(fn, *args, **kwargs):
return await fn(*args, **kwargs)

instance.execute_with_retry = AsyncMock(side_effect=_passthrough)
real = GlobalRateLimiter(
rate_limit=100,
rate_window=60,
max_concurrency=5,
)
instance.wait_if_blocked = real.wait_if_blocked
instance.execute_with_retry = real.execute_with_retry
instance.set_blocked = real.set_blocked
instance.concurrency_slot.side_effect = _slot

provider = NativeProvider(provider_config)
Expand All @@ -239,10 +242,12 @@ async def _passthrough(fn, *args, **kwargs):
new_callable=AsyncMock,
return_value=err,
) as mock_send,
patch("asyncio.sleep", new_callable=AsyncMock),
):
events = [e async for e in provider.stream_response(req)]

mock_send.assert_awaited_once()
# 1 initial + 4 retries = 5 calls (400 is now retryable with default max_retries=4)
assert mock_send.await_count == 5
assert err.is_closed
assert_canonical_stream_error_envelope(
events, user_message_substr="Invalid request sent to provider"
Expand Down
71 changes: 66 additions & 5 deletions tests/providers/test_provider_rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,8 @@ async def always_fail():
)

@pytest.mark.asyncio
async def test_execute_with_retry_httpx_400_raises_immediately(self):
"""Non-retryable 4xx is not wrapped by execute_with_retry loop."""
async def test_execute_with_retry_400_retried_then_exhausts(self):
"""HTTP 400 is now retried by execute_with_retry (transient 400 support)."""
import httpx
from httpx import Request, Response

Expand All @@ -385,22 +385,83 @@ async def test_execute_with_retry_httpx_400_raises_immediately(self):

call_count = 0

async def bad_request():
async def always_400():
nonlocal call_count
call_count += 1
r = Response(400, request=Request("POST", "http://x"), text="bad request")
raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r)

with pytest.raises(httpx.HTTPStatusError):
await limiter.execute_with_retry(
bad_request,
always_400,
max_retries=2,
base_delay=0.01,
max_delay=0.1,
jitter=0,
)

assert call_count == 1
# 1 initial + 2 retries = 3 calls total (400 is now retryable)
assert call_count == 3

@pytest.mark.asyncio
async def test_execute_with_retry_400_then_200_recovers(self):
"""Transient HTTP 400 then success: retry recovers."""
import httpx
from httpx import Request, Response

GlobalRateLimiter.reset_instance()
limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)

call_count = 0

async def fail_then_ok():
nonlocal call_count
call_count += 1
if call_count == 1:
r = Response(400, request=Request("POST", "http://x"), text="bad request")
raise httpx.HTTPStatusError("Bad Request", request=r.request, response=r)
return "ok"

result = await limiter.execute_with_retry(
fail_then_ok,
max_retries=2,
base_delay=0.01,
max_delay=0.1,
jitter=0,
)
assert result == "ok"
assert call_count == 2

@pytest.mark.asyncio
async def test_execute_with_retry_openai_400_retried_then_exhausts(self):
"""OpenAI 400 errors are also retried."""
import openai
from httpx import Request, Response

GlobalRateLimiter.reset_instance()
limiter = GlobalRateLimiter.get_instance(rate_limit=100, rate_window=60)

call_count = 0

async def always_400():
nonlocal call_count
call_count += 1
raise openai.BadRequestError(
"bad request",
response=Response(400, request=Request("POST", "http://x")),
body={},
)

with pytest.raises(openai.BadRequestError):
await limiter.execute_with_retry(
always_400,
max_retries=2,
base_delay=0.01,
max_delay=0.1,
jitter=0,
)

assert call_count == 3

@pytest.mark.asyncio
async def test_max_concurrency_zero_raises(self):
Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.