diff --git a/cmd/atenet/internal/router/xds.go b/cmd/atenet/internal/router/xds.go index 964fc5e9..7eeabc9a 100644 --- a/cmd/atenet/internal/router/xds.go +++ b/cmd/atenet/internal/router/xds.go @@ -50,6 +50,7 @@ import ( endpointgrpc "github.com/envoyproxy/go-control-plane/envoy/service/endpoint/v3" listenergrpc "github.com/envoyproxy/go-control-plane/envoy/service/listener/v3" routegrpc "github.com/envoyproxy/go-control-plane/envoy/service/route/v3" + typev3 "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/envoyproxy/go-control-plane/pkg/cache/types" cachev3 "github.com/envoyproxy/go-control-plane/pkg/cache/v3" resourcev3 "github.com/envoyproxy/go-control-plane/pkg/resource/v3" @@ -145,7 +146,6 @@ func (x *XdsServer) UpdateSnapshot() error { resourcev3.RouteType: routes, resourcev3.ListenerType: listeners, }) - if err != nil { return fmt.Errorf("failed to build xDS Snapshot: %w", err) } @@ -264,6 +264,21 @@ func (x *XdsServer) buildDynamicForwardProxyCluster() *clusterv3.Cluster { TypedConfig: clusterConfigAny, }, }, + CircuitBreakers: &clusterv3.CircuitBreakers{ + Thresholds: []*clusterv3.CircuitBreakers_Thresholds{ + { + Priority: corev3.RoutingPriority_DEFAULT, + RetryBudget: &clusterv3.CircuitBreakers_Thresholds_RetryBudget{ + // Set to Envoy's default to scale retries with load. + BudgetPercent: &typev3.Percent{Value: 20.0}, + // Floor for low-traffic periods, raised above the default + // of 3 so a burst of simultaneous first-request resumes + // is not throttled when overall load is low. + MinRetryConcurrency: wrapperspb.UInt32(20), + }, + }, + }, + }, } } @@ -287,6 +302,22 @@ func (x *XdsServer) buildRoutes() *routev3.RouteConfiguration { Cluster: "dynamic_forward_proxy_cluster", }, Timeout: durationpb.New(10 * time.Second), + // A request can arrive at the router in the brief window + // after an actor is resumed but before its workload is + // accepting connections, or while a pooled upstream + // connection to a just-suspended actor is going stale. + // Either case surfaces as an upstream reset/connection + // failure before response headers. Retry these transient + // failures (with backoff) so the request lands once the + // listener is ready instead of returning a 503. + RetryPolicy: &routev3.RetryPolicy{ + RetryOn: "reset,connect-failure", + NumRetries: wrapperspb.UInt32(5), + RetryBackOff: &routev3.RetryPolicy_RetryBackOff{ + BaseInterval: durationpb.New(50 * time.Millisecond), + MaxInterval: durationpb.New(1 * time.Second), + }, + }, }, }, }, diff --git a/cmd/atenet/internal/router/xds_test.go b/cmd/atenet/internal/router/xds_test.go index 92e34764..a9a4cd6b 100644 --- a/cmd/atenet/internal/router/xds_test.go +++ b/cmd/atenet/internal/router/xds_test.go @@ -83,6 +83,17 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { if c.GetName() != "dynamic_forward_proxy_cluster" { t.Errorf("Expected 'dynamic_forward_proxy_cluster', got %s", c.GetName()) } + + // A retry budget must replace Envoy's static default of 3 concurrent + // retries so simultaneous first-request resumes are not throttled. + thresholds := c.GetCircuitBreakers().GetThresholds() + if len(thresholds) != 1 { + t.Fatalf("Expected 1 circuit-breaker threshold, got %d", len(thresholds)) + } + budget := thresholds[0].GetRetryBudget() + if budget == nil { + t.Fatal("Expected a retry budget on the dynamic_forward_proxy cluster, got none") + } } // Verify Virtual Hosts generated inside Route configuration @@ -116,6 +127,18 @@ func TestXdsServer_UpdateSnapshot(t *testing.T) { if fallbackRoute.GetMatch().GetPrefix() != "/" { t.Errorf("Expected path mapping prefix '/', got '%s'", fallbackRoute.GetMatch().GetPrefix()) } + + // Transient upstream resets/connection failures during actor resume must + // be retried rather than surfaced as 503s. + retry := fallbackRoute.GetRoute().GetRetryPolicy() + if retry == nil { + t.Fatal("Expected a retry policy on the actor route, got none") + } + for _, on := range []string{"reset", "connect-failure"} { + if !strings.Contains(retry.GetRetryOn(), on) { + t.Errorf("Expected retry_on to include %q, got %q", on, retry.GetRetryOn()) + } + } } // Verify listeners generated