From d69feebeb590b24f919b684a6dd899a7ef0d730e Mon Sep 17 00:00:00 2001 From: Tommy McCormick Date: Wed, 17 Jun 2026 21:11:50 -0400 Subject: [PATCH] fix(atenet): retry transient upstream resets when routing to resumed actors When a request hits the router for a suspended actor, the ext_proc filter resumes the actor and rewrites :authority to the actor pod's IP:80, which the dynamic_forward_proxy cluster then connects to. In the brief window after resume returns but before the restored workload is accepting connections (or when a pooled connection to a just-suspended actor has gone stale), Envoy's upstream connection is reset before response headers. The actor route had no retry policy, so each such reset became an immediate 503 "upstream connect error or disconnect/reset before headers. reset reason: connection termination". Add a retry policy to the actor route (retry_on "reset,connect-failure", 5 retries with 50ms-1s backoff) so these transient failures are retried once the listener is ready. A retry policy alone is not enough: every actor is routed through the single dynamic_forward_proxy cluster, whose retry circuit breaker defaults to only 3 concurrent retries cluster-wide, so a burst of concurrent requests to a just-resumed actor overflows it and the excess fails with 503 (UO) instead of retrying. Rather than inflate the static max_retries (which exists to cap retry amplification during an outage), configure a retry budget on the cluster: budget_percent 20% (Envoy's default) scales the allowed retries with load, with min_retry_concurrency 20 as a low-traffic floor above the default of 3. Other circuit breakers keep their defaults. Reproduced on a kind cluster with the multi-template demo: concurrent requests to an actor immediately after suspend produced intermittent 503s with Envoy response flags UC (connection termination) and UO (overflow). 
With the fix deployed (retry policy and retry budget verified live in the envoy config dump) the same protocol produced 0 failures across 1600+ requests. Fixes #218. --- cmd/atenet/internal/router/xds.go | 33 +++++++++++++++++++++++++- cmd/atenet/internal/router/xds_test.go | 23 ++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/cmd/atenet/internal/router/xds.go b/cmd/atenet/internal/router/xds.go index 964fc5e92..7eeabc9aa 100644 --- a/cmd/atenet/internal/router/xds.go +++ b/cmd/atenet/internal/router/xds.go @@ -50,6 +50,7 @@ import ( endpointgrpc "github.com/envoyproxy/go-control-plane/envoy/service/endpoint/v3" listenergrpc "github.com/envoyproxy/go-control-plane/envoy/service/listener/v3" routegrpc "github.com/envoyproxy/go-control-plane/envoy/service/route/v3" + typev3 "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/envoyproxy/go-control-plane/pkg/cache/types" cachev3 "github.com/envoyproxy/go-control-plane/pkg/cache/v3" resourcev3 "github.com/envoyproxy/go-control-plane/pkg/resource/v3" @@ -145,7 +146,6 @@ func (x *XdsServer) UpdateSnapshot() error { resourcev3.RouteType: routes, resourcev3.ListenerType: listeners, }) - if err != nil { return fmt.Errorf("failed to build xDS Snapshot: %w", err) } @@ -264,6 +264,21 @@ func (x *XdsServer) buildDynamicForwardProxyCluster() *clusterv3.Cluster { TypedConfig: clusterConfigAny, }, }, + CircuitBreakers: &clusterv3.CircuitBreakers{ + Thresholds: []*clusterv3.CircuitBreakers_Thresholds{ + { + Priority: corev3.RoutingPriority_DEFAULT, + RetryBudget: &clusterv3.CircuitBreakers_Thresholds_RetryBudget{ + // Set to Envoy's default to scale retries with load. + BudgetPercent: &typev3.Percent{Value: 20.0}, + // Floor for low-traffic periods, raised above the default + // of 3 so a burst of simultaneous first-request resumes + // is not throttled when overall load is low. 
+ MinRetryConcurrency: wrapperspb.UInt32(20), + }, + }, + }, + }, } } @@ -287,6 +302,22 @@ func (x *XdsServer) buildRoutes() *routev3.RouteConfiguration { Cluster: "dynamic_forward_proxy_cluster", }, Timeout: durationpb.New(10 * time.Second), + // A request can arrive at the router in the brief window + // after an actor is resumed but before its workload is + // accepting connections, or while a pooled upstream + // connection to a just-suspended actor is going stale. + // Either case surfaces as an upstream reset/connection + // failure before response headers. Retry these transient + // failures (with backoff) so the request lands once the + // listener is ready instead of returning a 503. + RetryPolicy: &routev3.RetryPolicy{ + RetryOn: "reset,connect-failure", + NumRetries: wrapperspb.UInt32(5), + RetryBackOff: &routev3.RetryPolicy_RetryBackOff{ + BaseInterval: durationpb.New(50 * time.Millisecond), + MaxInterval: durationpb.New(1 * time.Second), + }, + }, }, }, }, diff --git a/cmd/atenet/internal/router/xds_test.go b/cmd/atenet/internal/router/xds_test.go index 92e347648..a9a4cd6b9 100644 --- a/cmd/atenet/internal/router/xds_test.go +++ b/cmd/atenet/internal/router/xds_test.go @@ -83,6 +83,17 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { if c.GetName() != "dynamic_forward_proxy_cluster" { t.Errorf("Expected 'dynamic_forward_proxy_cluster', got %s", c.GetName()) } + + // A retry budget must replace Envoy's static default of 3 concurrent + // retries so simultaneous first-request resumes are not throttled. 
+ thresholds := c.GetCircuitBreakers().GetThresholds() + if len(thresholds) != 1 { + t.Fatalf("Expected 1 circuit-breaker threshold, got %d", len(thresholds)) + } + budget := thresholds[0].GetRetryBudget() + if budget == nil { + t.Fatal("Expected a retry budget on the dynamic_forward_proxy cluster, got none") + } } // Verify Virtual Hosts generated inside Route configuration @@ -116,6 +127,18 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { if fallbackRoute.GetMatch().GetPrefix() != "/" { t.Errorf("Expected path mapping prefix '/', got '%s'", fallbackRoute.GetMatch().GetPrefix()) } + + // Transient upstream resets/connection failures during actor resume must + // be retried rather than surfaced as 503s. + retry := fallbackRoute.GetRoute().GetRetryPolicy() + if retry == nil { + t.Fatal("Expected a retry policy on the actor route, got none") + } + for _, on := range []string{"reset", "connect-failure"} { + if !strings.Contains(retry.GetRetryOn(), on) { + t.Errorf("Expected retry_on to include %q, got %q", on, retry.GetRetryOn()) + } + } } // Verify listeners generated