From cff0b86140599d80e08327c15484145565a8475f Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Thu, 14 May 2026 17:39:19 -0400 Subject: [PATCH 01/21] Add ParallelAsync for concurrent branch execution (DOTNET-8662) Adds parallel branch execution to the .NET Durable Execution SDK. ParallelAsync runs N branches concurrently with configurable concurrency limits and completion policies, returning an IBatchResult with per-branch status and error information. Per-branch checkpoint payloads are serialized via the ILambdaSerializer registered on ILambdaContext.Serializer (typically configured through LambdaBootstrapBuilder.Create(handler, serializer)), matching the StepAsync / RunInChildContextAsync pattern. There are no separate reflection / AOT-safe overload pairs: the AOT story is determined entirely by which serializer the user registers with the runtime. Public surface: - IDurableContext.ParallelAsync (2 overloads: Func[] vs DurableBranch[]) - DurableBranch record (Name + Func) - ParallelConfig (MaxConcurrency, CompletionConfig, NestingType) - CompletionConfig with factories AllSuccessful() / FirstSuccessful() / AllCompleted(); ToleratedFailureCount / ToleratedFailurePercentage (validated 0.0-1.0) - IBatchResult with All / Succeeded / Failed / Started accessors, GetResults, GetErrors, ThrowIfError, HasFailure, CompletionReason, count properties - IBatchItem with Index, Name, Status, Result, Error - BatchItemStatus { Succeeded, Failed, Started } - CompletionReason { AllCompleted, MinSuccessfulReached, FailureToleranceExceeded } - NestingType (Nested default; Flat throws NotSupportedException - reserved) - ParallelException (carries IBatchResult; future-subclassable) Internal: - ParallelOperation orchestrator dispatches branches with optional semaphore-bounded concurrency. Each branch runs as a ChildContextOperation with deterministic ID via OperationIdGenerator.CreateChild. 
- Branch failures aggregated as IBatchItem entries; orchestrator throws ParallelException only when CompletionConfig signals FailureToleranceExceeded. - Parent CONTEXT checkpoint records summary (CompletionReason + per-branch index/name/status); branch results live on per-branch CONTEXT checkpoints. - ExecutionState now thread-safe (lock around reads/writes of _operations, _visitedOperations, _isReplaying, _remainingReplayOps). Required for concurrent branch replay; the change affects all operations but introduces no regressions. - ParallelOperation awaits Task.WhenAll(inFlight) before disposing the semaphore so cancellation/exception during dispatch lets in-flight branches settle cleanly. - Reuses OperationSubTypes.Parallel / OperationSubTypes.ParallelBranch from Wave 0. Adds 31 unit tests + 6 integration tests covering CompletionConfig matrix, MaxConcurrency, FirstSuccessful short-circuit, replay determinism, mixed-status replay, cancellation, and concurrency stress. Co-Authored-By: Claude Opus 4.7 (1M context) --- Docs/durable-execution-design.md | 9 +- .../BatchItemStatus.cs | 30 + .../CompletionConfig.cs | 75 ++ .../CompletionReason.cs | 29 + .../DurableBranch.cs | 13 + .../DurableContext.cs | 94 +- .../DurableExecutionException.cs | 33 + .../IBatchItem.cs | 38 + .../IBatchResult.cs | 90 ++ .../IDurableContext.cs | 40 + .../Internal/BatchItem.cs | 15 + .../Internal/BatchResult.cs | 80 ++ .../Internal/ExecutionState.cs | 154 ++- .../Internal/ParallelJsonContext.cs | 15 + .../Internal/ParallelOperation.cs | 637 ++++++++++ .../Internal/ParallelSummary.cs | 38 + .../NestingType.cs | 37 + .../Operation.cs | 6 + .../ParallelConfig.cs | 57 + .../ParallelFailureToleranceTest.cs | 70 ++ .../ParallelFirstSuccessfulTest.cs | 81 ++ .../ParallelHappyPathTest.cs | 72 ++ .../ParallelMaxConcurrencyTest.cs | 76 ++ .../ParallelPartialFailureTest.cs | 74 ++ .../ParallelReplayDeterminismTest.cs | 122 ++ .../Dockerfile | 7 + .../Function.cs | 60 + .../ParallelFailureToleranceFunction.csproj | 18 + .../Dockerfile | 7 + 
.../Function.cs | 79 ++ .../ParallelFirstSuccessfulFunction.csproj | 18 + .../ParallelHappyPathFunction/Dockerfile | 7 + .../ParallelHappyPathFunction/Function.cs | 40 + .../ParallelHappyPathFunction.csproj | 18 + .../ParallelMaxConcurrencyFunction/Dockerfile | 7 + .../Function.cs | 67 ++ .../ParallelMaxConcurrencyFunction.csproj | 18 + .../ParallelPartialFailureFunction/Dockerfile | 7 + .../Function.cs | 61 + .../ParallelPartialFailureFunction.csproj | 18 + .../Dockerfile | 7 + .../Function.cs | 57 + .../ParallelReplayDeterminismFunction.csproj | 18 + .../ParallelOperationTests.cs | 1037 +++++++++++++++++ 44 files changed, 3470 insertions(+), 66 deletions(-) create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs create mode 100644 
Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile 
create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs diff --git a/Docs/durable-execution-design.md b/Docs/durable-execution-design.md index 59ced6a15..33d4092a5 100644 --- a/Docs/durable-execution-design.md +++ b/Docs/durable-execution-design.md @@ -559,7 +559,7 @@ For better observability, you can name individual branches (matching the JS SDK ```csharp // Named branches for easier debugging and testing var results = await context.ParallelAsync( - new NamedBranch[] + new DurableBranch[] { new("fetch_user", async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId))), new("fetch_orders", async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId))), @@ -1405,6 +1405,13 @@ public class CompletionConfig { 
public int? MinSuccessful { get; set; } public int? ToleratedFailureCount { get; set; } + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based threshold. Validated by the + /// setter; out-of-range values throw . + /// public double? ToleratedFailurePercentage { get; set; } public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs new file mode 100644 index 000000000..e07aa4f4c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs @@ -0,0 +1,30 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Status of an individual item in a . +/// +/// +/// Mirrors the wire-state of the per-branch checkpoint at the moment the batch +/// resolved. Items that finished produce or +/// ; items still in flight when the batch's +/// short-circuits remain in . +/// +public enum BatchItemStatus +{ + /// + /// The branch ran to completion and produced a result. + /// + Succeeded, + + /// + /// The branch ran to completion and threw. + /// + Failed, + + /// + /// The branch was still in flight when the batch's + /// resolved (e.g., returned + /// before this branch finished). + /// + Started +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs new file mode 100644 index 000000000..27a15d060 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs @@ -0,0 +1,75 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Defines completion criteria for parallel/map operations. 
+/// +/// +/// Construct via the static factories (, +/// , ) or set the +/// individual properties directly. Multiple criteria combine: the operation +/// resolves as soon as any criterion is met (success short-circuit) or violated +/// (failure short-circuit). +/// +public sealed class CompletionConfig +{ + private double? _toleratedFailurePercentage; + + /// + /// Minimum number of items required + /// before the operation resolves successfully. null = no minimum. + /// + public int? MinSuccessful { get; set; } + + /// + /// Maximum tolerated count. When the + /// failure count strictly exceeds this value, the operation resolves + /// with . + /// null = no count-based failure threshold. + /// + public int? ToleratedFailureCount { get; set; } + + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based failure threshold. + /// + /// + /// Thrown by the setter if the value is outside [0.0, 1.0]. + /// + public double? ToleratedFailurePercentage + { + get => _toleratedFailurePercentage; + set + { + if (value is { } v && (v < 0.0 || v > 1.0)) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailurePercentage must be a ratio in [0.0, 1.0]."); + } + _toleratedFailurePercentage = value; + } + } + + /// + /// All items must succeed. Equivalent to + /// = 0. The default for + /// . + /// + public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; + + /// + /// Run every branch regardless of failures; surface failures per-item via + /// . Resolution does not auto-throw — + /// the caller can inspect the result and call + /// if they want strict-success + /// behavior. + /// + public static CompletionConfig AllCompleted() => new(); + + /// + /// Resolve as soon as one branch succeeds. 
Remaining in-flight branches are + /// reported as . + /// + public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs new file mode 100644 index 000000000..ed40a1fc8 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs @@ -0,0 +1,29 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Why a batch operation ( +/// or future Map) resolved. +/// +public enum CompletionReason +{ + /// + /// Every branch finished — no short-circuit + /// was triggered. Branches may be a mix of + /// and . + /// + AllCompleted, + + /// + /// branches succeeded; remaining + /// branches were left in . + /// + MinSuccessfulReached, + + /// + /// or + /// was exceeded. + /// The batch is considered failed and surfaces a + /// when awaited. + /// + FailureToleranceExceeded +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs new file mode 100644 index 000000000..c6e1cb6f0 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs @@ -0,0 +1,13 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// A named branch for +/// . +/// Names appear in execution traces and on the wire OperationUpdate.Name +/// field, and surface on . +/// +/// The branch's result type. +/// Human-readable branch name. Required. +/// The user function executed inside the branch's +/// child context. 
+public sealed record DurableBranch(string Name, Func> Func); diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs index ee5b1d1e6..8f360d02a 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -144,16 +144,8 @@ private Task RunChildContext( var operationId = _idGenerator.NextId(); - // Capture this DurableContext's collaborators; the child shares state, - // termination, batcher, ARN, and Lambda context — but uses a child - // OperationIdGenerator so its operation IDs are deterministically - // namespaced under the parent op ID. - IDurableContext ChildFactory(string parentOpId) => new DurableContext( - _state, _terminationManager, _idGenerator.CreateChild(parentOpId), - _durableExecutionArn, LambdaContext, _batcher); - var op = new ChildContextOperation( - operationId, name, _idGenerator.ParentId, func, config, serializer, ChildFactory, + operationId, name, _idGenerator.ParentId, func, config, serializer, MakeChildFactory(), _state, _terminationManager, _durableExecutionArn, _batcher); return op.ExecuteAsync(cancellationToken); } @@ -178,6 +170,75 @@ private Task> RunCallback( return op.ExecuteAsync(cancellationToken); } + public Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default) + => RunParallel(WrapToDurableBranches(branches), name, config, cancellationToken); + + public Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? 
config = null, + CancellationToken cancellationToken = default) + => RunParallel(branches, name, config, cancellationToken); + + private static IReadOnlyList> WrapToDurableBranches( + IReadOnlyList>> branches) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + + var result = new DurableBranch[branches.Count]; + for (var i = 0; i < branches.Count; i++) + { + var func = branches[i]; + if (func == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + // Default name is the index — surfaces in execution traces and on + // IBatchItem.Name. Users wanting custom names use the + // DurableBranch overload. + result[i] = new DurableBranch(i.ToString(System.Globalization.CultureInfo.InvariantCulture), func); + } + return result; + } + + private Task> RunParallel( + IReadOnlyList> branches, + string? name, + ParallelConfig? config, + CancellationToken cancellationToken) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + for (var i = 0; i < branches.Count; i++) + { + if (branches[i] == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + if (branches[i].Func == null) + throw new ArgumentException($"Branch at index {i} has a null Func.", nameof(branches)); + } + + var effectiveConfig = config ?? new ParallelConfig(); + if (effectiveConfig.NestingType == NestingType.Flat) + { + throw new NotSupportedException( + "NestingType.Flat is not yet supported in the .NET Durable Execution SDK. " + + "Use NestingType.Nested (the default) for now."); + } + + var serializer = LambdaContext.Serializer + ?? throw new InvalidOperationException( + "No ILambdaSerializer is registered on ILambdaContext.Serializer. 
" + + "Register a serializer via LambdaBootstrapBuilder.Create(handler, serializer) " + + "(or in tests, set TestLambdaContext.Serializer)."); + + var operationId = _idGenerator.NextId(); + var op = new Internal.ParallelOperation( + operationId, name, _idGenerator.ParentId, branches, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + public Task WaitForCallbackAsync( Func submitter, string? name = null, @@ -390,6 +451,21 @@ private Task RunInvoke( _state, _terminationManager, _durableExecutionArn, _batcher); return op.ExecuteAsync(cancellationToken); } + + /// + /// Builds the factory used by (and + /// each branch) to construct + /// the inner . The child shares state, + /// termination, batcher, ARN, and Lambda context — but uses a child + /// so its operation IDs are + /// deterministically namespaced under the parent op ID. + /// + private Func MakeChildFactory() + { + return parentOpId => new DurableContext( + _state, _terminationManager, _idGenerator.CreateChild(parentOpId), + _durableExecutionArn, LambdaContext, _batcher); + } } internal sealed class WaitForCallbackContext : IWaitForCallbackContext diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs index 7f8707966..1b65c86b3 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs @@ -98,3 +98,36 @@ public ChildContextException(string message) : base(message) { } /// Creates a wrapping an inner exception. public ChildContextException(string message, Exception innerException) : base(message, innerException) { } } + +/// +/// Thrown when a parallel operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-branch outcomes. 
+/// +/// +/// This is the base type for parallel failures. Subclasses may be added in +/// future releases (for example, a dedicated +/// ParallelFailureToleranceExceededException); catching +/// remains forward-compatible. +/// +public class ParallelException : DurableExecutionException +{ + /// + /// The aggregate result of the parallel operation. Type-erased — cast to + /// IBatchResult<T> if the per-branch result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the parallel operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public ParallelException() { } + /// Creates a with the given message. + public ParallelException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public ParallelException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs new file mode 100644 index 000000000..62814fd62 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs @@ -0,0 +1,38 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// One item inside an — the outcome of a single +/// branch (parallel) or item (map). +/// +/// The branch/item result type. +public interface IBatchItem +{ + /// + /// Zero-based position in the original branches/items list. Stable across + /// replays. + /// + int Index { get; } + + /// + /// Optional human-readable name for this branch/item. + /// Surfaces on the wire OperationUpdate.Name field for observability. + /// + string? Name { get; } + + /// + /// Status of this item at the moment the batch resolved. + /// + BatchItemStatus Status { get; } + + /// + /// The branch/item result. Populated only when is + /// . + /// + T? Result { get; } + + /// + /// The branch/item failure. Populated only when is + /// . 
+ /// + DurableExecutionException? Error { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs new file mode 100644 index 000000000..baa5139d6 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs @@ -0,0 +1,90 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Non-generic marker for . Used by +/// so callers can hold a reference to +/// the aggregate result without knowing the per-branch type at compile time. +/// +public interface IBatchResult +{ + /// + /// Why the batch resolved. + /// + CompletionReason CompletionReason { get; } + + /// True if any item is in . + bool HasFailure { get; } + + /// Number of items in . + int SuccessCount { get; } + + /// Number of items in . + int FailureCount { get; } + + /// Number of items in . + int StartedCount { get; } + + /// Total number of items. + int TotalCount { get; } +} + +/// +/// Result of a parallel (and future map) operation. Aggregates the per-branch +/// outcomes, completion bookkeeping, and convenience accessors. +/// +/// The per-branch/per-item result type. +/// +/// The result is reconstructed from per-branch checkpoints — the aggregate is +/// never serialized as a single blob in user T. Per-branch results live on +/// ParallelBranch child-context checkpoints; this type assembles them. +/// +public interface IBatchResult : IBatchResult +{ + /// + /// All items, in original index order. + /// + IReadOnlyList> All { get; } + + /// + /// Items whose is + /// , in original index order. + /// + IReadOnlyList> Succeeded { get; } + + /// + /// Items whose is + /// , in original index order. + /// + IReadOnlyList> Failed { get; } + + /// + /// Items still in flight when the batch resolved (a + /// short-circuit fired before they finished), + /// in original index order. 
+ /// + IReadOnlyList> Started { get; } + + /// + /// Returns the results of every successful item, in original index order. + /// + /// + /// Items in or are skipped — this + /// method never throws on partial-failure batches. Use + /// if you want a strict-success accessor. + /// + IReadOnlyList GetResults(); + + /// + /// Returns the errors for every failed item, in original index order. + /// + IReadOnlyList GetErrors(); + + /// + /// Throws the first failed item's if any + /// item failed; no-op otherwise. + /// + /// + /// The first failed item's error. + /// + void ThrowIfError(); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs index 5904f84e4..bf4916fd9 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs @@ -177,6 +177,46 @@ Task InvokeAsync( string? name = null, InvokeConfig? config = null, CancellationToken cancellationToken = default); + + /// + /// Execute multiple branches concurrently. Each branch runs inside its own + /// child context; per-branch results are aggregated into an + /// . Branches are dispatched up to + /// ; the aggregate resolves + /// according to . + /// + /// + /// On per-branch failure (a branch's user function throws), the failure is + /// captured on the corresponding instead of + /// aborting the parallel. The parallel only throws + /// when + /// criteria are violated. Use + /// for explicit strict-success + /// semantics. Per-branch results are serialized to checkpoints using the + /// registered on + /// (typically configured via + /// LambdaBootstrapBuilder.Create(handler, serializer)). + /// + Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple named branches concurrently. 
Names appear in execution + /// traces and on . + /// + /// + /// Per-branch results are serialized to checkpoints using the + /// registered on + /// . + /// + Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs new file mode 100644 index 000000000..5c9dda77c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs @@ -0,0 +1,15 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation produced by +/// when assembling the +/// . +/// +internal sealed class BatchItem : IBatchItem +{ + public required int Index { get; init; } + public required string? Name { get; init; } + public required BatchItemStatus Status { get; init; } + public T? Result { get; init; } + public DurableExecutionException? Error { get; init; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs new file mode 100644 index 000000000..362303a0e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs @@ -0,0 +1,80 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation. Computes derived views +/// ( / / ) +/// eagerly so consumers don't pay for re-filtering on every access. 
+/// +internal sealed class BatchResult : IBatchResult +{ + public BatchResult(IReadOnlyList> all, CompletionReason completionReason) + { + All = all; + CompletionReason = completionReason; + + var succeeded = new List>(); + var failed = new List>(); + var started = new List>(); + + foreach (var item in all) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded.Add(item); break; + case BatchItemStatus.Failed: failed.Add(item); break; + case BatchItemStatus.Started: started.Add(item); break; + } + } + + Succeeded = succeeded; + Failed = failed; + Started = started; + } + + public IReadOnlyList> All { get; } + public IReadOnlyList> Succeeded { get; } + public IReadOnlyList> Failed { get; } + public IReadOnlyList> Started { get; } + public CompletionReason CompletionReason { get; } + + public bool HasFailure => Failed.Count > 0; + + public int SuccessCount => Succeeded.Count; + public int FailureCount => Failed.Count; + public int StartedCount => Started.Count; + public int TotalCount => All.Count; + + public IReadOnlyList GetResults() + { + var list = new List(Succeeded.Count); + foreach (var item in Succeeded) + { + // Result is non-null on success items by construction; the BCL-typed + // index is preserved by walking Succeeded (already in original order). + list.Add(item.Result!); + } + return list; + } + + public IReadOnlyList GetErrors() + { + var list = new List(Failed.Count); + foreach (var item in Failed) + { + // Error is non-null on failure items by construction. 
+ list.Add(item.Error!); + } + return list; + } + + public void ThrowIfError() + { + foreach (var item in All) + { + if (item.Status == BatchItemStatus.Failed && item.Error != null) + { + throw item.Error; + } + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs index 989749d9b..7ff404675 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs @@ -1,8 +1,6 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -using System.Collections.Concurrent; - namespace Amazon.Lambda.DurableExecution.Internal; /// @@ -23,54 +21,74 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// for the rest of the invocation. /// /// -/// is invoked from the 's -/// background worker (via the onNewOperations hook) while the workflow thread -/// concurrently reads via / — -/// e.g. the fire-and-forget StepOperation path where the workflow is not -/// awaiting the flush. _operations is therefore a . -/// The replay-tracking fields (_visitedOperations, _isReplaying, -/// _remainingReplayOps) are touched only on the workflow thread. +/// Thread safety: two paths reach this type concurrently. (1) The +/// background worker invokes +/// (via the onNewOperations hook) while the +/// workflow thread reads via / — +/// e.g. the fire-and-forget StepOperation path. (2) +/// dispatches N branches concurrently, each +/// running its own , so +/// , , +/// , and the +/// getter are reachable from multiple threads at once. +/// All read/write access to _operations, _visitedOperations, +/// _isReplaying and _remainingReplayOps is therefore guarded by a +/// single private lock. 
Every guarded path is an O(1) dictionary lookup, set +/// insert, or short iteration, so contention stays brief; we use a plain +/// lock rather than because +/// none of the guarded code paths are async, and rather than +/// ConcurrentDictionary because performs +/// a compound add-then-scan. /// /// internal sealed class ExecutionState { - private readonly ConcurrentDictionary _operations = new(); + private readonly object _lock = new(); + private readonly Dictionary _operations = new(); private readonly HashSet _visitedOperations = new(); private bool _isReplaying; private int _remainingReplayOps; - public int CheckpointedOperationCount => _operations.Count; + public int CheckpointedOperationCount + { + get { lock (_lock) return _operations.Count; } + } /// /// True when the workflow is re-deriving prior operations from checkpointed /// state. False when running fresh (not-yet-checkpointed) code. /// - public bool IsReplaying => _isReplaying; + public bool IsReplaying + { + get { lock (_lock) return _isReplaying; } + } public void LoadFromCheckpoint(InitialExecutionState? initialState) { - if (initialState?.Operations != null) + lock (_lock) { - AddOperations(initialState.Operations); + if (initialState?.Operations != null) + { + AddOperationsLocked(initialState.Operations); + } + + // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, + // CANCELLED, STOPPED) we need to re-derive before resuming live work. + // The service-side EXECUTION op (input payload bookkeeping) is always + // present and doesn't count. If the only ops are in-progress + // (READY/PENDING/STARTED), there's nothing to re-derive — the next + // user call IS the next thing to run — so IsReplaying starts false. 
+ var (_, terminalCount) = ScanReplayableLocked(); + _remainingReplayOps = terminalCount; + _isReplaying = terminalCount > 0; } - - // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, - // CANCELLED, STOPPED) we need to re-derive before resuming live work. - // The service-side EXECUTION op (input payload bookkeeping) is always - // present and doesn't count. If the only ops are in-progress - // (READY/PENDING/STARTED), there's nothing to re-derive — the next - // user call IS the next thing to run — so IsReplaying starts false. - var (_, terminalCount) = ScanReplayable(); - _remainingReplayOps = terminalCount; - _isReplaying = terminalCount > 0; } public void AddOperations(IEnumerable operations) { - foreach (var op in operations) + lock (_lock) { - if (op.Id == null) continue; - _operations[op.Id] = op; + AddOperationsLocked(operations); } } @@ -81,11 +99,20 @@ public void AddOperations(IEnumerable operations) /// public Operation? GetOperation(string operationId) { - _operations.TryGetValue(operationId, out var op); - return op; + lock (_lock) + { + _operations.TryGetValue(operationId, out var op); + return op; + } } - public bool HasOperation(string operationId) => _operations.ContainsKey(operationId); + public bool HasOperation(string operationId) + { + lock (_lock) + { + return _operations.ContainsKey(operationId); + } + } /// /// Records that the workflow has reached . 
@@ -96,43 +123,58 @@ public void AddOperations(IEnumerable operations) /// public void TrackReplay(string operationId) { - if (!_isReplaying) return; - if (!_visitedOperations.Add(operationId)) return; - if (!_operations.TryGetValue(operationId, out var op)) return; - if (op.Type == OperationTypes.Execution) return; - if (!IsTerminalStatus(op.Status)) return; - - if (--_remainingReplayOps <= 0) - _isReplaying = false; + lock (_lock) + { + if (!_isReplaying) return; + if (!_visitedOperations.Add(operationId)) return; + if (!_operations.TryGetValue(operationId, out var op)) return; + if (op.Type == OperationTypes.Execution) return; + if (!IsTerminalStatus(op.Status)) return; + + if (--_remainingReplayOps <= 0) + _isReplaying = false; + } } public void ValidateReplayConsistency(string operationId, string expectedType, string? expectedName) { - // Independent of IsReplaying: as long as a checkpoint record exists - // for this id, its type/name must match what user code is asking for. - // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), - // IsReplaying is false but the records still exist and code drift can - // still produce a mismatch. - if (!_operations.TryGetValue(operationId, out var op)) return; - - if (op.Type != null && op.Type != expectedType) + lock (_lock) { - throw new NonDeterministicExecutionException( - $"Non-deterministic execution detected for operation '{operationId}': " + - $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. " + - $"Code must not change the order or type of durable operations between deployments."); + // Independent of IsReplaying: as long as a checkpoint record exists + // for this id, its type/name must match what user code is asking for. + // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), + // IsReplaying is false but the records still exist and code drift can + // still produce a mismatch. 
+ if (!_operations.TryGetValue(operationId, out var op)) return; + + if (op.Type != null && op.Type != expectedType) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } + + if (expectedName != null && op.Name != null && op.Name != expectedName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } } + } - if (expectedName != null && op.Name != null && op.Name != expectedName) + private void AddOperationsLocked(IEnumerable operations) + { + foreach (var op in operations) { - throw new NonDeterministicExecutionException( - $"Non-deterministic execution detected for operation '{operationId}': " + - $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. 
" + - $"Code must not change the order or type of durable operations between deployments."); + if (op.Id == null) continue; + _operations[op.Id] = op; } } - private (bool HasReplayable, int TerminalCount) ScanReplayable() + private (bool HasReplayable, int TerminalCount) ScanReplayableLocked() { var has = false; var count = 0; diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs new file mode 100644 index 000000000..9b830a59a --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// AOT-friendly for the internal +/// payload stored on a parallel parent's CONTEXT +/// checkpoint. Only this internal type — never user T — flows through here, so +/// the source-generated metadata is sufficient. +/// +[JsonSerializable(typeof(ParallelSummary))] +[JsonSerializable(typeof(ParallelBranchSummary))] +internal sealed partial class ParallelJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs new file mode 100644 index 000000000..f81d0d19b --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -0,0 +1,637 @@ +using System.IO; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Core; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable parallel operation. Runs N user-supplied branches concurrently +/// (each as a ) under a shared +/// and concurrency limit, persisting the +/// aggregate result so subsequent invocations replay it without re-executing. 
+/// +/// +/// Replay branches — example: await ctx.ParallelAsync(funcs, name: "fetch") +/// +/// Fresh: no prior state → sync-flush parent CONTEXT START → +/// dispatch branches respecting MaxConcurrency → wait for in-flight to +/// complete after CompletionConfig short-circuit → emit parent CONTEXT +/// SUCCEED with summary payload (<see cref="ParallelSummary"/>). +/// SUCCEEDED: parent payload supplies the snapshot of per- +/// branch statuses + completion reason; per-branch results are +/// deserialised from the children's own CONTEXT checkpoints. +/// FAILED: same reconstruction; throws +/// <see cref="ParallelException"/> carrying the rebuilt +/// <see cref="IBatchResult{T}"/>. +/// STARTED / PENDING: re-execute (children replay from +/// their own checkpoints). +/// +/// Per-branch errors do NOT abort the parallel directly — the orchestrator +/// catches each branch's <see cref="ChildContextException"/>, records it as a +/// failed <see cref="IBatchItem{T}"/>, and consults the +/// <see cref="CompletionConfig"/> after every completion. Only when the +/// completion config marks the run as +/// <see cref="CompletionReason.FailureToleranceExceeded"/> does the parallel +/// throw. +/// +internal sealed class ParallelOperation : DurableOperation> +{ + private readonly IReadOnlyList> _branches; + private readonly ParallelConfig _config; + private readonly ILambdaSerializer _serializer; + private readonly Func _childContextFactory; + + public ParallelOperation( + string operationId, + string? name, + string? parentId, + IReadOnlyList> branches, + ParallelConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _branches = branches; + _config = config; + _serializer = serializer; + _childContextFactory = childContextFactory; + } + + protected override string OperationType => OperationTypes.Context; + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush parent CONTEXT START.
Mirrors ChildContextOperation: if a + // branch suspends (e.g., a Wait inside a branch), the service needs to + // know the parallel parent existed. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = "START", + SubType = OperationSubTypes.Parallel, + Name = Name + }, cancellationToken); + + return await ExecuteBranchesAsync(cancellationToken); + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); + + case OperationStatuses.Failed: + // Reconstruct so the caller (and ParallelException.Result) sees + // the per-branch outcomes; then throw. + var failed = ReconstructFromCheckpoints(existing, throwOnFailure: false); + throw BuildParallelException(failed); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run: branches replay from their own checkpoints. + return ExecuteBranchesAsync(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"Parallel operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task> ExecuteBranchesAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var branchCount = _branches.Count; + var slots = new BranchOutcome[branchCount]; + var dispatched = new bool[branchCount]; + + var maxConcurrency = _config.MaxConcurrency ?? branchCount; + // Optimisation: when MaxConcurrency >= branchCount, skip the semaphore + // entirely. Behaviour is identical, allocations are lower. + var semaphore = (maxConcurrency >= branchCount) ? 
null : new SemaphoreSlim(maxConcurrency, maxConcurrency); + + var minSuccessful = _config.CompletionConfig.MinSuccessful; + var toleratedFailureCount = _config.CompletionConfig.ToleratedFailureCount; + var toleratedFailurePercentage = _config.CompletionConfig.ToleratedFailurePercentage; + + var succeeded = 0; + var failed = 0; + + var inFlight = new List(branchCount); + + // Branches run with the parent's token so cooperative cancellation + // still propagates into user code, but we must NOT abandon already- + // dispatched branches while they're still writing checkpoints — that + // would diverge between the original run and replay. The dispatch + // loop and Task.WhenAll below therefore await every in-flight task + // even when cancellation fires; the semaphore is disposed only after + // those branches have settled (success, failure, or cooperative OCE). + try + { + try + { + for (var i = 0; i < branchCount; i++) + { + // Volatile reads pair with the Interlocked.Increment writes + // in the onComplete callback. Reads are non-atomic across + // the two counters: at worst we observe slightly stale + // values and dispatch one extra branch before the next + // completion forces a re-check. That's acceptable — the + // post-loop ComputeCompletionReason is the source of truth. + var succSnap = Volatile.Read(ref succeeded); + var failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, branchCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + break; + } + + if (semaphore != null) + { + await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false); + // Re-check after acquiring: the wait may have unblocked + // because earlier branches finished and short-circuited + // the operation. 
+ succSnap = Volatile.Read(ref succeeded); + failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, branchCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + semaphore.Release(); + break; + } + } + + var index = i; + dispatched[index] = true; + inFlight.Add(RunBranchAsync(index, slots, semaphore, cancellationToken, + onComplete: outcome => + { + if (outcome.Status == BatchItemStatus.Succeeded) + Interlocked.Increment(ref succeeded); + else if (outcome.Status == BatchItemStatus.Failed) + Interlocked.Increment(ref failed); + })); + } + } + finally + { + // CRITICAL: wait for every dispatched branch — even on the + // exceptional path (parent-token cancellation mid-dispatch, or + // a synchronous throw out of the loop) — before the semaphore + // is disposed. Otherwise surviving branches' Release() calls + // hit ObjectDisposedException, the tasks become unobserved, + // and they keep writing checkpoints out from under us. + // + // We deliberately DO NOT cancel already-running branches when + // a short-circuit fires — orphan branches that continue + // writing checkpoints would diverge between the original run + // and replay. Letting them finish guarantees determinism: all + // dispatched branches end up Succeeded or Failed. Only + // un-dispatched branches surface as Started. + if (inFlight.Count > 0) + { + try + { + await Task.WhenAll(inFlight).ConfigureAwait(false); + } + catch + { + // Swallow here — Task.WhenAll only surfaces the first + // exception, but every branch task is now in a terminal + // state and we want to inspect each one individually + // below to decide whether to surface a workflow-level + // error. The Task objects themselves still carry their + // exceptions, so this swallow does not orphan them. + } + } + } + } + finally + { + semaphore?.Dispose(); + } + + // Surface any workflow-level exception (e.g. NonDeterministicExecutionException) + // raised inside a branch. 
RunBranchAsync re-throws DurableExecutionException + // (other than ChildContextException which is captured into the slot) so the + // task faults with that exception. Take the first such failure: these are + // structural errors, not "branch failed gracefully" outcomes. + foreach (var t in inFlight) + { + if (t.IsFaulted && t.Exception is { } agg) + { + foreach (var inner in agg.InnerExceptions) + { + if (inner is DurableExecutionException dex && inner is not ChildContextException) + { + throw dex; + } + } + } + } + + // Re-throw any pending parent-token cancellation now that branches + // have settled and the semaphore has been disposed cleanly. + cancellationToken.ThrowIfCancellationRequested(); + + // Build BatchItems for every branch in original order. + var items = new List>(branchCount); + for (var i = 0; i < branchCount; i++) + { + if (dispatched[i]) + { + var outcome = slots[i]; + items.Add(new BatchItem + { + Index = i, + Name = _branches[i].Name, + Status = outcome.Status, + Result = outcome.Status == BatchItemStatus.Succeeded ? outcome.Result : default, + Error = outcome.Status == BatchItemStatus.Failed ? outcome.Error : null + }); + } + else + { + items.Add(new BatchItem + { + Index = i, + Name = _branches[i].Name, + Status = BatchItemStatus.Started, + Result = default, + Error = null + }); + } + } + + var completionReason = ComputeCompletionReason(items, branchCount); + var result = new BatchResult(items, completionReason); + + await CheckpointParentResultAsync(result, completionReason, cancellationToken); + + if (completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildParallelException(result); + } + + return result; + } + + private async Task RunBranchAsync( + int index, + BranchOutcome[] slots, + SemaphoreSlim? 
semaphore, + CancellationToken cancellationToken, + Action onComplete) + { + try + { + var branch = _branches[index]; + var branchOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); + + var childOp = new ChildContextOperation( + branchOpId, + branch.Name, + OperationId, + branch.Func, + new ChildContextConfig { SubType = OperationSubTypes.ParallelBranch }, + _serializer, + _childContextFactory, + State, + Termination, + DurableExecutionArn, + Batcher); + + try + { + var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); + slots[index] = new BranchOutcome { Status = BatchItemStatus.Succeeded, Result = result }; + } + catch (ChildContextException ex) + { + slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = ex }; + } + catch (DurableExecutionException) + { + // E.g. NonDeterministicExecutionException — these are not + // "branch failed gracefully" but workflow-level problems. + // Surface them: re-throw out of the parallel without writing + // a slot (the orchestrator's outer flow handles it). + throw; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Parent-token cancellation: per cross-cutting decision Q10, + // OCE escapes unwrapped. Don't write a slot — Task.WhenAll + // observes this and the orchestrator re-throws after settling. + throw; + } + catch (OperationCanceledException ex) + { + // Branch-internal cancellation that is NOT tied to the parent + // token (e.g. the branch's own CancellationTokenSource fired). + // Treat it as a normal per-branch failure rather than killing + // the parallel as cancelled. 
+ var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = OperationSubTypes.ParallelBranch, + ErrorType = ex.GetType().FullName + }; + slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + catch (Exception ex) + { + // Wrap unexpected exceptions as ChildContextException — they're + // per-branch failures from the user's POV. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = OperationSubTypes.ParallelBranch, + ErrorType = ex.GetType().FullName + }; + slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + + onComplete(slots[index]); + } + finally + { + // Defensive: with the new structure the semaphore is only disposed + // after Task.WhenAll(inFlight) has settled, so this Release should + // always succeed. ObjectDisposedException would indicate a bug + // elsewhere, but we tolerate it here so the task doesn't fault + // with a noise exception that masks the real one. + try + { + semaphore?.Release(); + } + catch (ObjectDisposedException) + { + } + } + } + + private static bool ShouldStopDispatching( + int succeeded, + int failed, + int totalBranches, + int? minSuccessful, + int? toleratedFailureCount, + double? toleratedFailurePercentage) + { + // Min-successful: short-circuit the moment we have enough wins. + if (minSuccessful is { } min && succeeded >= min) + return true; + + // Failure thresholds short-circuit on too many losses. 
+ if (toleratedFailureCount is { } tfc && failed > tfc) + return true; + + if (toleratedFailurePercentage is { } tfp && totalBranches > 0) + { + var ratio = (double)failed / totalBranches; + if (ratio > tfp) return true; + } + + return false; + } + + private CompletionReason ComputeCompletionReason(IReadOnlyList> items, int totalCount) + { + var failed = 0; + var succeeded = 0; + var started = 0; + + foreach (var item in items) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded++; break; + case BatchItemStatus.Failed: failed++; break; + case BatchItemStatus.Started: started++; break; + } + } + + // Failure tolerance: only short-circuit-by-failure when at least one + // failure threshold is explicitly set. The factory CompletionConfig.AllSuccessful() + // sets ToleratedFailureCount = 0 to opt into fail-fast; an "empty" + // CompletionConfig (all properties null) is permissive. + if (_config.CompletionConfig.ToleratedFailureCount is { } tfc && failed > tfc) + return CompletionReason.FailureToleranceExceeded; + + if (_config.CompletionConfig.ToleratedFailurePercentage is { } tfp && totalCount > 0) + { + var ratio = (double)failed / totalCount; + if (ratio > tfp) return CompletionReason.FailureToleranceExceeded; + } + + // Min-successful satisfied (and we didn't run all branches): MinSuccessfulReached. + if (_config.CompletionConfig.MinSuccessful is { } min && succeeded >= min && started > 0) + { + return CompletionReason.MinSuccessfulReached; + } + + // Every dispatched branch finished one way or the other (or all-completed + // without any failure criteria). 
+ return CompletionReason.AllCompleted; + } + + private async Task CheckpointParentResultAsync( + BatchResult result, + CompletionReason completionReason, + CancellationToken cancellationToken) + { + var summary = new ParallelSummary + { + CompletionReason = SerializeCompletionReason(completionReason), + Branches = new List(result.All.Count) + }; + for (var i = 0; i < result.All.Count; i++) + { + var item = result.All[i]; + summary.Branches.Add(new ParallelBranchSummary + { + Index = item.Index, + Name = item.Name, + Status = SerializeStatus(item.Status) + }); + } + + var payload = JsonSerializer.Serialize(summary, ParallelJsonContext.Default.ParallelSummary); + var failed = completionReason == CompletionReason.FailureToleranceExceeded; + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = failed ? "FAIL" : "SUCCEED", + SubType = OperationSubTypes.Parallel, + Name = Name, + Payload = failed ? null : payload, + Error = failed ? BuildAggregateError(result) : null + }, cancellationToken); + } + + private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwOnFailure) + { + var summary = ParseSummary(parent.ContextDetails?.Result); + + var items = new List>(_branches.Count); + for (var i = 0; i < _branches.Count; i++) + { + var branchOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{i + 1}"); + var branchOp = State.GetOperation(branchOpId); + var summaryEntry = summary?.Branches.FirstOrDefault(b => b.Index == i); + + BatchItemStatus status = summaryEntry != null + ? DeserializeStatus(summaryEntry.Status) + : InferStatusFromBranchOp(branchOp); + + T? branchResult = default; + DurableExecutionException? 
branchError = null; + + if (status == BatchItemStatus.Succeeded && branchOp?.ContextDetails?.Result != null) + { + branchResult = DeserializeBranchResult(branchOp.ContextDetails.Result); + } + else if (status == BatchItemStatus.Failed && branchOp?.ContextDetails?.Error != null) + { + var err = branchOp.ContextDetails.Error; + branchError = new ChildContextException(err.ErrorMessage ?? "Branch failed") + { + SubType = branchOp.SubType ?? OperationSubTypes.ParallelBranch, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + + items.Add(new BatchItem + { + Index = i, + Name = _branches[i].Name, + Status = status, + Result = branchResult, + Error = branchError + }); + } + + var completionReason = summary != null + ? DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, _branches.Count); + + var result = new BatchResult(items, completionReason); + + if (throwOnFailure && completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildParallelException(result); + } + + return result; + } + + private static BatchItemStatus InferStatusFromBranchOp(Operation? 
branchOp) + { + if (branchOp == null) return BatchItemStatus.Started; + return branchOp.Status switch + { + OperationStatuses.Succeeded => BatchItemStatus.Succeeded, + OperationStatuses.Failed => BatchItemStatus.Failed, + _ => BatchItemStatus.Started + }; + } + + private static ParallelException BuildParallelException(IBatchResult result) + { + return new ParallelException( + $"Parallel operation failed: failure tolerance exceeded ({result.FailureCount} of {result.TotalCount} branches failed).") + { + Result = result, + CompletionReason = result.CompletionReason + }; + } + + private static SdkErrorObject BuildAggregateError(IBatchResult result) + { + return new SdkErrorObject + { + ErrorType = typeof(ParallelException).FullName, + ErrorMessage = $"Parallel operation failed: {result.FailureCount} of {result.TotalCount} branches failed." + }; + } + + private static ParallelSummary? ParseSummary(string? payload) + { + if (string.IsNullOrEmpty(payload)) return null; + try + { + return JsonSerializer.Deserialize(payload, ParallelJsonContext.Default.ParallelSummary); + } + catch (JsonException) + { + // Tolerate older / corrupted payloads — fall back to inferring status + // from per-branch checkpoints. + return null; + } + } + + private static string SerializeStatus(BatchItemStatus status) => status switch + { + BatchItemStatus.Succeeded => "SUCCEEDED", + BatchItemStatus.Failed => "FAILED", + BatchItemStatus.Started => "STARTED", + _ => throw new ArgumentOutOfRangeException(nameof(status)) + }; + + private static BatchItemStatus DeserializeStatus(string? 
wire) => wire switch + { + "SUCCEEDED" => BatchItemStatus.Succeeded, + "FAILED" => BatchItemStatus.Failed, + "STARTED" => BatchItemStatus.Started, + _ => BatchItemStatus.Started + }; + + private static string SerializeCompletionReason(CompletionReason reason) => reason switch + { + CompletionReason.AllCompleted => "ALL_COMPLETED", + CompletionReason.MinSuccessfulReached => "MIN_SUCCESSFUL_REACHED", + CompletionReason.FailureToleranceExceeded => "FAILURE_TOLERANCE_EXCEEDED", + _ => throw new ArgumentOutOfRangeException(nameof(reason)) + }; + + private static CompletionReason DeserializeCompletionReason(string? wire) => wire switch + { + "ALL_COMPLETED" => CompletionReason.AllCompleted, + "MIN_SUCCESSFUL_REACHED" => CompletionReason.MinSuccessfulReached, + "FAILURE_TOLERANCE_EXCEEDED" => CompletionReason.FailureToleranceExceeded, + _ => CompletionReason.AllCompleted + }; + + private T DeserializeBranchResult(string serialized) + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return _serializer.Deserialize(ms); + } + + /// + /// Internal scratch space tracking each branch's outcome as it lands in + /// the executor; copied into the user-facing + /// once every dispatched branch has settled. + /// + private struct BranchOutcome + { + public BatchItemStatus Status; + public T? Result; + public DurableExecutionException? Error; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs new file mode 100644 index 000000000..ca75955b1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs @@ -0,0 +1,38 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Internal payload shape stored on a parallel parent's CONTEXT checkpoint +/// (as ContextDetails.Result) and reconstructed on replay. 
Carries the +/// completion reason and the per-branch index → status map so the +/// <see cref="IBatchResult{T}"/> can be rebuilt without depending on user T +/// shape — per-branch results live on the children's own checkpoints. +/// +internal sealed class ParallelSummary +{ + [JsonPropertyName("CompletionReason")] + public string? CompletionReason { get; set; } + + [JsonPropertyName("Branches")] + public IList Branches { get; set; } = new List(); +} + +internal sealed class ParallelBranchSummary +{ + [JsonPropertyName("Index")] + public int Index { get; set; } + + [JsonPropertyName("Name")] + public string? Name { get; set; } + + [JsonPropertyName("Status")] + public string? Status { get; set; } + + // Note: there used to be an OperationId field here, but the replay path + // recomputes the deterministic branch ID from the parent ID + index + // (HashOperationId($"{parentOpId}-{i + 1}")). Carrying the ID on the + // wire was redundant and never read on replay; removed to reduce + // checkpoint size. If the hashing strategy ever changes we'll need a + // versioned recovery path, but that's a separate concern. +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs new file mode 100644 index 000000000..ee2c15c96 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs @@ -0,0 +1,37 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Controls how branches in a parallel/map operation are represented in the +/// checkpoint graph. +/// +/// +/// +/// <see cref="NestingType.Nested"/> is the default — each branch produces a full CONTEXT +/// operation visible in execution traces. +/// +/// +/// <see cref="NestingType.Flat"/> is reserved for a forthcoming optimisation that uses +/// virtual contexts to reduce checkpoint volume by ~30%. The .NET SDK currently +/// throws <see cref="NotSupportedException"/> when <see cref="NestingType.Flat"/> is +/// supplied; the enum value is kept stable so opting in becomes non-breaking.
+/// +/// +public enum NestingType +{ + /// + /// Each branch creates a full isolated CONTEXT operation. Higher + /// observability in execution traces but more checkpoint operations + /// (default). + /// + Nested, + + /// + /// Branches use virtual contexts sharing the parent. Reduces checkpoint + /// cost at the expense of less granular execution traces. + /// + /// + /// Not yet implemented in the .NET SDK; passing this value throws + /// <see cref="NotSupportedException"/>. + /// + Flat +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs index 3b55cfa86..c81be9f3f 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -195,6 +195,12 @@ public static class OperationSubTypes /// Child-context sub-type. public const string Context = "Context"; + + /// Parallel parent sub-type. + public const string Parallel = "Parallel"; + + /// Parallel branch (per-branch child-context) sub-type. + public const string ParallelBranch = "ParallelBranch"; } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs new file mode 100644 index 000000000..d40f09daf --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs @@ -0,0 +1,57 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// <c>IDurableContext.ParallelAsync</c>. +/// +/// +/// Per-branch checkpoint payloads are serialized via the +/// <see cref="Amazon.Lambda.Core.ILambdaSerializer"/> registered on +/// <see cref="Amazon.Lambda.Core.ILambdaContext.Serializer"/> (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. +/// +public sealed class ParallelConfig +{ + private int? _maxConcurrency; + + /// + /// Maximum number of branches running concurrently. null (default) = + /// unlimited. Must be at least 1 when set. + /// + /// + /// Thrown by the setter if the value is less than or equal to 0. + /// + public int?
MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// + /// When the parallel operation is considered complete. Defaults to + /// <see cref="CompletionConfig.AllSuccessful"/> — any single branch failure + /// surfaces as a <see cref="ParallelException"/> when the parallel result + /// is awaited. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How branches are represented in the checkpoint graph. Defaults to + /// <see cref="NestingType.Nested"/>. + /// + /// + /// <see cref="NestingType.Flat"/> is not yet supported in the .NET SDK and + /// will throw <see cref="NotSupportedException"/> when the parallel + /// operation is invoked. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs new file mode 100644 index 000000000..77305ebef --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFailureToleranceTest +{ + private readonly ITestOutputHelper _output; + public ParallelFailureToleranceTest(ITestOutputHelper output) => _output = output; + + /// + /// Five branches, two fail, ToleratedFailureCount=1. The parallel must surface a + /// <see cref="ParallelException"/> with reason + /// <see cref="CompletionReason.FailureToleranceExceeded"/>; the workflow must + /// terminate FAILED. Validates the failure-tolerance short-circuit and that + /// ParallelException propagates as the workflow's terminal error.
/// <summary>
/// Five branches, two of which throw, run with <c>ToleratedFailureCount = 1</c>.
/// The second failure exceeds the tolerance, so the workflow must terminate
/// <c>FAILED</c> with an error that identifies the parallel failure, and the
/// parent CONTEXT (named "tolerance") must be recorded as failed in the history.
/// </summary>
[Fact]
public async Task Parallel_FailureToleranceExceeded_FailsWorkflow()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("ParallelFailureToleranceFunction"),
        "ptol", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p3"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    // Failed workflows return null payload to the Invoke caller — locate the
    // execution by name to inspect its terminal status.
    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
    Assert.Equal("FAILED", status, ignoreCase: true);

    var execution = await deployment.GetExecutionAsync(arn!);
    Assert.NotNull(execution.Error);

    // The terminal error should identify the parallel failure. The error type is
    // checked first; the message check is a fallback for the case where the
    // service reports a different type string.
    var errorType = execution.Error.ErrorType ?? string.Empty;
    var errorMessage = execution.Error.ErrorMessage ?? string.Empty;
    Assert.True(
        errorType.Contains("ParallelException", StringComparison.Ordinal)
        || errorMessage.Contains("Parallel", StringComparison.OrdinalIgnoreCase),
        $"Expected error to indicate ParallelException; got type='{errorType}' message='{errorMessage}'");

    // History is eventually consistent, so wait for everything the assertions
    // below rely on: at least 3 started contexts, at least 2 failed contexts,
    // AND the parent's own ContextFailed event. Without the last condition the
    // wait could return before the parent failure is visible and the final
    // assertion would fail intermittently.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3
          && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2
          && (h.Events?.Any(e => e.EventType == EventType.ContextFailed && e.Name == "tolerance") ?? false),
        TimeSpan.FromSeconds(60));
    var events = history.Events ?? new List<Event>();

    // Lower bound only: this count includes the parent CONTEXT's own failure, so
    // ">= 2" means the parent plus at least one branch failure are recorded.
    var failedCount = events.Count(e => e.EventType == EventType.ContextFailed);
    Assert.True(
        failedCount >= 2,
        $"Expected >= 2 ContextFailed events; got {failedCount}");

    // The parent context (named "tolerance") records the aggregate failure.
    var parentFailed = events.FirstOrDefault(e =>
        e.EventType == EventType.ContextFailed && e.Name == "tolerance");
    Assert.NotNull(parentFailed);
}
/// <summary>
/// Deploys the FirstSuccessful test function, invokes it, and checks that the
/// workflow succeeds, reports a valid winning branch, and that both the parent
/// CONTEXT ("race") and the winning branch's CONTEXT are recorded as succeeded.
/// </summary>
[Fact]
public async Task Parallel_FirstSuccessful_ShortCircuitsOnFirstWin()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("ParallelFirstSuccessfulFunction"),
        "pfirst", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p4"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    // The slowest branch in the test function waits 8s. The 120s timeout leaves
    // generous headroom for invocation overhead and CI variance.
    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // The workflow's response payload reports the winning branch.
    using var doc = JsonDocument.Parse(responsePayload);
    var winnerIndex = doc.RootElement.GetProperty("winnerIndex").GetInt32();
    var winnerName = doc.RootElement.GetProperty("winnerName").GetString();
    var completionReason = doc.RootElement.GetProperty("completionReason").GetString();
    var successCount = doc.RootElement.GetProperty("successCount").GetInt32();

    // At least one branch succeeded, and the reported winner is one of the four
    // branches (indices 0..3) with a non-null name.
    Assert.True(successCount >= 1, $"Expected >= 1 successful branch, got {successCount}");
    Assert.True(winnerIndex >= 0 && winnerIndex < 4,
        $"WinnerIndex should be a valid branch index, got {winnerIndex}");
    Assert.NotNull(winnerName);

    // CompletionReason is MinSuccessfulReached only if some branch was left
    // un-dispatched at the time the threshold was met. With unbounded
    // concurrency every branch dispatches immediately, so the reason is
    // AllCompleted (all dispatched branches finished). Either reason is
    // acceptable — just ensure it isn't FailureToleranceExceeded.
    Assert.NotEqual("FailureToleranceExceeded", completionReason);

    // Service-side: the parent CONTEXT and at least one branch CONTEXT
    // succeeded. Other branches' final state is timing-dependent — they
    // could be Started (left in flight) or Succeeded (completed before
    // the parent's CONTEXT SUCCEED was flushed). The orchestrator
    // deliberately does not cancel in-flight branches once the
    // short-circuit fires.
    // The wait is keyed on the parent's ContextSucceeded event only.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false),
        TimeSpan.FromSeconds(60));
    var events = history.Events ?? new List();

    var parentSucceeded = events.FirstOrDefault(e =>
        e.EventType == EventType.ContextSucceeded && e.Name == "race");
    Assert.NotNull(parentSucceeded);

    // The winning branch's CONTEXT SUCCEEDED is in the history.
    // NOTE(review): this relies on the winner's event being visible by the time
    // the parent's is — presumably true because the branch completes first; confirm.
    Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName);
}
/// <summary>
/// End-to-end happy-path parallel: three branches ("alpha", "beta", "gamma") each
/// produce a string and the workflow returns the joined results. Verifies that
/// the response carries all three outputs in declaration order, and that the
/// parent CONTEXT ("fanout") plus the three branch CONTEXTs are recorded as
/// started and succeeded, by name, with no failures.
/// </summary>
[Fact]
public async Task Parallel_AllBranchesSucceed()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("ParallelHappyPathFunction"),
        "phappy", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p1"}""");
    Assert.Equal(200, invokeResponse.StatusCode);

    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // The user-visible payload contains all three branch outputs...
    Assert.Contains("alpha-p1", responsePayload);
    Assert.Contains("beta-p1", responsePayload);
    Assert.Contains("gamma-p1", responsePayload);

    // ...and in declaration order, even when branches race. Presence alone does
    // not prove ordering, so compare the positions explicitly.
    var alphaAt = responsePayload.IndexOf("alpha-p1", StringComparison.Ordinal);
    var betaAt = responsePayload.IndexOf("beta-p1", StringComparison.Ordinal);
    var gammaAt = responsePayload.IndexOf("gamma-p1", StringComparison.Ordinal);
    Assert.True(
        alphaAt < betaAt && betaAt < gammaAt,
        $"Expected branch outputs in declaration order (alpha, beta, gamma); got payload '{responsePayload}'");

    // History is eventually consistent — wait until the parent CONTEXT and
    // all three child CONTEXT checkpoints are visible.
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4
          && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4,
        TimeSpan.FromSeconds(60));
    var events = history.Events ?? new List<Event>();

    // Parent + 3 branches = 4 ContextStarted, 4 ContextSucceeded.
    Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted));
    Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded));

    // The parent and the three branches show up by name on their own
    // ContextStarted events.
    var startedNames = events
        .Where(e => e.EventType == EventType.ContextStarted)
        .Select(e => e.Name)
        .ToList();
    Assert.Contains("fanout", startedNames);
    Assert.Contains("alpha", startedNames);
    Assert.Contains("beta", startedNames);
    Assert.Contains("gamma", startedNames);

    // No branch failed.
    Assert.DoesNotContain(events, e => e.EventType == EventType.ContextFailed);
}
/// <summary>
/// Six branches, each performing a 2-second durable wait, run with
/// <c>MaxConcurrency = 2</c>. Every branch reports the wall-clock time at which it
/// finished; the test checks that those timestamps are spread out rather than
/// bunched together, which is what a working concurrency limit produces.
/// </summary>
/// <remarks>
/// Strict "3 waves of 2" clustering is deliberately NOT asserted because of
/// service jitter. Two weaker properties are checked instead: at most 3
/// timestamps fall within the first 1500 ms, and the first and last timestamps
/// are at least 1500 ms apart.
/// </remarks>
[Fact]
public async Task Parallel_MaxConcurrency_ThrottlesBranchDispatch()
{
    // Window treated as "the first wave", and also the minimum overall spread.
    // Kept below the 2s branch wait so the first wave cannot absorb the second.
    const long WaveWindowMs = 1500;

    // MaxConcurrency is 2; one extra branch is tolerated to absorb jitter.
    const int MaxFirstWaveSize = 3;

    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("ParallelMaxConcurrencyFunction"),
        "pmaxc", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p5"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    // 3 waves x 2s waits + invocation overhead. Allow generous headroom
    // for service scheduling latency.
    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180));
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    using var doc = JsonDocument.Parse(responsePayload);
    var successCount = doc.RootElement.GetProperty("successCount").GetInt32();
    Assert.Equal(6, successCount);

    var timestamps = doc.RootElement.GetProperty("timestamps")
        .EnumerateArray().Select(t => t.GetInt64()).ToList();
    Assert.Equal(6, timestamps.Count);

    // Express every timestamp relative to the earliest one so the values are
    // readable in the log and in failure messages.
    var sorted = timestamps.OrderBy(t => t).ToList();
    var earliest = sorted[0];
    var relative = sorted.Select(t => t - earliest).ToList();
    var relativeText = string.Join(", ", relative);
    _output.WriteLine($"Relative timestamps (ms): {relativeText}");

    // Property 1: not all six branches finished in the first wave.
    var firstWave = relative.Count(r => r < WaveWindowMs);
    Assert.True(firstWave <= MaxFirstWaveSize,
        $"Expected MaxConcurrency=2 to limit the first wave to at most {MaxFirstWaveSize} branches " +
        $"(2 plus a jitter allowance); got {firstWave} within {WaveWindowMs}ms of start. " +
        $"Relative timestamps: [{relativeText}]");

    // Property 2: the whole set spans at least one wave window, i.e. the
    // branches did NOT all run at once.
    var total = relative[^1];
    Assert.True(total >= WaveWindowMs,
        $"Expected branches to span >= {WaveWindowMs}ms (proves throttling); got {total}ms. " +
        $"Relative timestamps: [{relativeText}]");
}
/// <summary>
/// Deploys the partial-failure test function and checks that the workflow still
/// succeeds, that its response reports 2 successes, 1 failure and an error
/// summary containing the failing branch's message, and that the history shows
/// exactly one failed CONTEXT (the branch named "boom") carrying that message.
/// </summary>
[Fact]
public async Task Parallel_PartialFailure_AllCompleted_ReportsCounts()
{
    await using var deployment = await DurableFunctionDeployment.CreateAsync(
        DurableFunctionDeployment.FindTestFunctionDir("ParallelPartialFailureFunction"),
        "ppartial", _output);

    var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p2"}""");
    var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray());
    _output.WriteLine($"Response: {responsePayload}");

    var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60));
    Assert.NotNull(arn);

    var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60));
    // AllCompleted means partial failure is NOT a workflow failure — the
    // user accepted the failure and returned a result.
    Assert.Equal("SUCCEEDED", status, ignoreCase: true);

    // Decode the workflow result payload and verify the counts surface correctly.
    using var doc = JsonDocument.Parse(responsePayload);
    var successCount = doc.RootElement.GetProperty("successCount").GetInt32();
    var failureCount = doc.RootElement.GetProperty("failureCount").GetInt32();
    var errorSummary = doc.RootElement.GetProperty("errorSummary").GetString();

    Assert.Equal(2, successCount);
    Assert.Equal(1, failureCount);
    Assert.NotNull(errorSummary);
    // The failing branch's exception MESSAGE must surface in the workflow's
    // error summary (the exception type is not checked here).
    // NOTE(review): how errorSummary is built lives in the test function, which
    // is not visible from this test.
    Assert.Contains("intentional partial failure", errorSummary);

    // History: 1 parent + 3 branches = 4 ContextStarted; 3 ContextSucceeded
    // (parent + 2 ok branches); 1 ContextFailed (the boom branch).
    var history = await deployment.WaitForHistoryAsync(
        arn!,
        h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4
          && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false)
          && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 3,
        TimeSpan.FromSeconds(60));
    var events = history.Events ?? new List();

    Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted));
    Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded));
    Assert.Equal(1, events.Count(e => e.EventType == EventType.ContextFailed));

    // The failing branch's checkpoint preserves the exception message.
    // SingleOrDefault is safe here: the count assertion above already pinned
    // the number of ContextFailed events to exactly one.
    var failedEvent = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed);
    Assert.NotNull(failedEvent);
    Assert.Equal("boom", failedEvent!.Name);
    Assert.Contains("intentional partial failure",
        failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty);
}
/// <summary>
/// Returns the SHA-256 digest of <paramref name="raw"/> (encoded as UTF-8) as a
/// lowercase hexadecimal string. Used to reproduce the expected operation IDs
/// locally, e.g. <c>HashOpId(parentOpId + "-" + n)</c> for the n-th branch.
/// </summary>
/// <param name="raw">The un-hashed operation ID text.</param>
/// <returns>64 lowercase hex characters.</returns>
private static string HashOpId(string raw)
{
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(raw));

    // Convert.ToHexString emits uppercase; the expected IDs are lowercase hex.
    return Convert.ToHexString(digest).ToLowerInvariant();
}
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List(); + // Parent + 3 branch CONTEXTs all succeeded. + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + // Each branch ran one step and one wait => 3 step succeeds + 3 wait succeeds. + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Branch operation IDs match the deterministic hash. + var branchStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedBranchIds = branchStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedBranchIds.Count); + foreach (var expected in expectedBranchIds) + { + Assert.Contains(expected, observedBranchIds); + } + + // 2. Every step under a branch parents to that branch's deterministic ID + // (proves the child generator's ID space is correctly seeded). + var branchSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, branchSucceededEvents.Count); + + // 3. Each branch's "generate" step succeeded exactly once — proving + // replay returned the cached step result rather than re-executing. + // (Re-execution would manifest as duplicate StepSucceeded events for + // the same operation ID.) + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations: one to schedule each + // wait, and at least one to resume after the timer fires. This proves + // replay actually happened. 
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response contains 3 valid GUIDs separated by commas + // (proving the per-branch step result survived replay). + Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..9c697710d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs @@ -0,0 +1,60 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var 
/// <summary>
/// Test workflow for the failure-tolerance scenario: runs five named branches in
/// a parallel operation called "tolerance", two of which throw
/// <see cref="InvalidOperationException"/>, with <c>ToleratedFailureCount = 1</c>.
/// </summary>
/// <param name="input">The test event; not read by this workflow.</param>
/// <param name="context">Durable context used to start the parallel operation.</param>
/// <returns>
/// A result with status "should_not_reach" — only produced if the parallel
/// operation unexpectedly completes without throwing.
/// </returns>
private async Task Workflow(TestEvent input, IDurableContext context)
{
    // Five branches, two throw. ToleratedFailureCount = 1 means a second
    // failure exceeds tolerance and the parallel surfaces a ParallelException.
    var batch = await context.ParallelAsync(
        new[]
        {
            new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "1"; }),
            new DurableBranch("bad1", async (_) =>
            {
                await Task.CompletedTask;
                throw new InvalidOperationException("bad1 boom");
            }),
            new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "2"; }),
            new DurableBranch("bad2", async (_) =>
            {
                await Task.CompletedTask;
                throw new InvalidOperationException("bad2 boom");
            }),
            new DurableBranch("ok3", async (_) => { await Task.CompletedTask; return "3"; }),
        },
        name: "tolerance",
        config: new ParallelConfig
        {
            CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 }
        });

    // Should not reach here — the parallel must throw ParallelException.
    // The sentinel status makes an unexpected success obvious in the response.
    return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount };
}
Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..2fa932dd7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs @@ -0,0 +1,79 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public 
/// <summary>
/// Test workflow for the FirstSuccessful scenario: runs four branches in a
/// parallel operation called "race", each doing a durable wait of a different
/// length (8s, 1s, 5s, 6s), and reports which branch is listed first among the
/// successful ones together with the batch's completion reason and counts.
/// </summary>
/// <param name="input">The test event; not read by this workflow.</param>
/// <param name="context">Durable context used to start the parallel operation.</param>
/// <returns>A result describing the reported winner and the batch counters.</returns>
private async Task Workflow(TestEvent input, IDurableContext context)
{
    // Four branches with different durable wait durations. The shortest
    // wait should win and short-circuit the parallel via FirstSuccessful.
    // Wait durations are at least 1s (service timer granularity).
    // Each branch returns a number matching its wait's name suffix (wait_0..wait_3),
    // which is unrelated to its position in the array.
    var batch = await context.ParallelAsync(
        new[]
        {
            new DurableBranch("slowest", async (ctx) =>
            {
                await ctx.WaitAsync(TimeSpan.FromSeconds(8), name: "wait_3");
                return 3;
            }),
            new DurableBranch("fastest", async (ctx) =>
            {
                await ctx.WaitAsync(TimeSpan.FromSeconds(1), name: "wait_0");
                return 0;
            }),
            new DurableBranch("mid1", async (ctx) =>
            {
                await ctx.WaitAsync(TimeSpan.FromSeconds(5), name: "wait_1");
                return 1;
            }),
            new DurableBranch("mid2", async (ctx) =>
            {
                await ctx.WaitAsync(TimeSpan.FromSeconds(6), name: "wait_2");
                return 2;
            }),
        },
        name: "race",
        config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() });

    // Report the first entry of batch.Succeeded as the winner, by index + name,
    // so the test can assert one branch won.
    // NOTE(review): this is the earliest finisher only if Succeeded is ordered by
    // completion; if it is ordered by branch index it is the lowest-index
    // success instead — confirm against IBatchResult.
    // WinnerIndex falls back to -1 when no branch succeeded.
    var winner = batch.Succeeded.FirstOrDefault();
    return new TestResult
    {
        Status = "completed",
        WinnerIndex = winner?.Index ?? -1,
        WinnerName = winner?.Name,
        CompletionReason = batch.CompletionReason.ToString(),
        SuccessCount = batch.SuccessCount,
        StartedCount = batch.StartedCount
    };
}
Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs new file mode 100644 index 000000000..b6b027f9b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs @@ -0,0 +1,40 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; 
/// <summary>
/// Lambda entry point for the happy-path parallel integration test. Hosts a
/// durable workflow that fans out to three branches and joins their results.
/// </summary>
public class Function
{
    /// <summary>
    /// Process entry point: wires <see cref="Handler"/> into the Lambda runtime
    /// with the default JSON serializer and runs the bootstrap loop.
    /// </summary>
    public static async Task Main(string[] args)
    {
        var handler = new Function();
        var serializer = new DefaultLambdaJsonSerializer();
        using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer);
        using var bootstrap = new LambdaBootstrap(handlerWrapper);
        await bootstrap.RunAsync();
    }

    /// <summary>
    /// Lambda handler: passes the invocation input and context to
    /// <c>DurableFunction.WrapAsync</c> together with <see cref="Workflow"/>.
    /// </summary>
    public Task Handler(
        DurableExecutionInvocationInput input, ILambdaContext context)
        => DurableFunction.WrapAsync(Workflow, input, context);

    /// <summary>
    /// Runs three named branches ("alpha", "beta", "gamma") in a parallel
    /// operation called "fanout"; each returns its name suffixed with the order
    /// ID. The branch results are joined with commas into the result's Data.
    /// </summary>
    private async Task Workflow(TestEvent input, IDurableContext context)
    {
        var batch = await context.ParallelAsync(
            new[]
            {
                new DurableBranch("alpha", async (_) => { await Task.CompletedTask; return $"alpha-{input.OrderId}"; }),
                new DurableBranch("beta", async (_) => { await Task.CompletedTask; return $"beta-{input.OrderId}"; }),
                new DurableBranch("gamma", async (_) => { await Task.CompletedTask; return $"gamma-{input.OrderId}"; }),
            },
            name: "fanout");

        // NOTE(review): the joined order is whatever GetResults() yields —
        // presumably branch declaration order; confirm against IBatchResult.
        var joined = string.Join(",", batch.GetResults());
        return new TestResult { Status = "completed", Data = joined };
    }
}
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs new file mode 100644 index 000000000..72f69913a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs @@ -0,0 +1,67 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 branches, MaxConcurrency = 2. Each branch does a 2-second durable + // wait then captures the post-wait wall-clock as a unix-ms timestamp. + // The expected outcome is 3 waves of 2 branches; total elapsed ~6s. + // Use IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT + // durable and would skew this measurement under replay. + var branches = new DurableBranch[6]; + for (var i = 0; i < 6; i++) + { + var localIndex = i; + branches[i] = new DurableBranch( + $"b{localIndex}", + async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{localIndex}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }); + } + + var batch = await context.ParallelAsync( + branches, + name: "throttled", + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public long[]? 
Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs new file mode 100644 index 000000000..51b35f19b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler 
= new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "first"; }), + new DurableBranch("boom", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional partial failure"); + }), + new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "third"; }), + }, + name: "partial", + // AllCompleted: drive every branch to terminal state regardless of failure. + // Without this, the default AllSuccessful() would throw on the first failure. + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..195c9b497 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] 
args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches. Each branch generates a fresh GUID inside a step, + // then does a durable wait. The wait forces a suspend/resume cycle, + // so the second invocation MUST replay the cached GUID rather than + // re-running the step. If replay determinism is broken, the GUID + // would change between the original execution and replay. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs new file mode 100644 index 000000000..95d9cef40 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -0,0 +1,1037 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ParallelOperationTests +{ + /// Reproduces the operation ID emitted for the n-th root-level operation (OperationIdGenerator.HashOperationId applied to the position). + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under the parent operation identified by parentOpId. + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? 
initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FreshExecution_AllBranchesSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Yield(); return 10; }, + async (ctx) => { await Task.Yield(); return 20; }, + async (ctx) => { await Task.Yield(); return 30; }, + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 child CONTEXT STARTs + 3 child CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Parallel:START", contextActions[0]); + Assert.Equal("Parallel:SUCCEED", contextActions[^1]); + } + + [Fact] + public 
async Task ParallelAsync_PreservesIndexOrder_EvenWhenBranchesCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Delay(40); return 1; }, + async (ctx) => { await Task.Delay(10); return 2; }, + async (ctx) => { await Task.Delay(20); return 3; }, + }; + + var result = await context.ParallelAsync(branches); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task ParallelAsync_BranchOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return "a"; }, + async (_) => { await Task.Yield(); return "b"; }, + }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstBranchId = ChildIdAt(parentOpId, 1); + var secondBranchId = ChildIdAt(parentOpId, 2); + + // Each branch's CONTEXT START should hit the deterministic child ID. 
+ var branchStarts = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .ToArray(); + Assert.Equal(2, branchStarts.Length); + Assert.Contains(branchStarts, o => o.Id == firstBranchId); + Assert.Contains(branchStarts, o => o.Id == secondBranchId); + } + + [Fact] + public async Task ParallelAsync_NamedBranches_PropagateNameToCheckpointAndItem() + { + var (context, recorder, _, _) = CreateContext(); + + var branches = new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 1; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 2; }), + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var branchSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(branchSucceeds, o => o.Name == "alpha"); + Assert.Contains(branchSucceeds, o => o.Name == "beta"); + } + + [Fact] + public async Task ParallelAsync_UnnamedOverload_DefaultsToIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task ParallelAsync_EmptyBranches_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync(Array.Empty>>()); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. 
+ var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Parallel:START", "Parallel:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — failure tolerance + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_AllSuccessfulDefault_OneFailureThrowsParallelException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("branch boom"); }, + async (_) => { await Task.Yield(); return 3; }, + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + public async Task ParallelAsync_AllCompleted_PartialFailureDoesNotThrow() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("oops"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_AllowsUpToThreshold() 
+ { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 2 fail; tolerated = 2 (>= failures), so resolves without + // throwing. + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 2 } + }); + + Assert.Equal(2, result.FailureCount); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailurePercentage_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 3 fail (75%) > 0.5 (50%) → exceeded. 
+ var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("f1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f2"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f3"); }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailurePercentage = 0.5 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public void CompletionConfig_ToleratedFailurePercentage_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailurePercentage = 1.5); + Assert.Throws(() => config.ToleratedFailurePercentage = -0.1); + // boundary values are accepted + config.ToleratedFailurePercentage = 0.0; + config.ToleratedFailurePercentage = 1.0; + config.ToleratedFailurePercentage = null; + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so we know the dispatch order is deterministic: + // branch 0 fires first and succeeds; branches 1 and 2 are never + // dispatched at all, so they remain in BatchItemStatus.Started. 
+ var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + [Fact] + public async Task ParallelAsync_MinSuccessful_ResolvesWhenTargetReached() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = new CompletionConfig { MinSuccessful = 2 } + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var branches = new Func>[] + { + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + }; + + var result = await 
context.ParallelAsync(branches, config: new ParallelConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + + Func> MakeBranch() + { + return async (_) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return 1; + }; + } + } + + [Fact] + public void ParallelConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new ParallelConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ThrowsNotSupported() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] { async (_) => { await Task.Yield(); return 1; } }, + config: new ParallelConfig { NestingType = NestingType.Flat })); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Branches":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","OperationId":"placeholder1"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = 
OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; }, + }, + name: "fanout"); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_ReplayFailed_ThrowsParallelException() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Branches":[ + {"Index":0,"Name":"0","Status":"FAILED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"FAILED","OperationId":"placeholder1"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = 
OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 0 failed" + } + } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 1 failed" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + + var typed = (IBatchResult)ex.Result!; + Assert.Equal(2, typed.FailureCount); + Assert.Contains("branch 0 failed", typed.GetErrors()[0].Message); + } + + [Fact] + public async Task ParallelAsync_ReplayStarted_ReExecutesBranches() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + SubType = OperationSubTypes.Parallel, + Name = "fanout" + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "11" } + } + } + }); + + var calls = new int[2]; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls[0]++; await Task.Yield(); return 99; }, + async (_) => { calls[1]++; await Task.Yield(); return 22; }, + }, + name: "fanout"); + + // Branch 0 replays cached value (not re-executed); branch 1 runs fresh. 
+ Assert.Equal(0, calls[0]); + Assert.Equal(1, calls[1]); + Assert.Equal(new[] { 11, 22 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Critical: do NOT re-checkpoint parent CONTEXT START (the original + // STARTED record is still authoritative). + var parentStarts = recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "Parallel" && o.Action == "START").ToArray(); + Assert.Empty(parentStarts); + } + + [Fact] + public async Task ParallelAsync_ReplayUnknownStatus_ThrowsNonDeterministic() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = "BOGUS", + SubType = OperationSubTypes.Parallel, + Name = "fanout" + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] { async (_) => { await Task.Yield(); return 1; } }, + name: "fanout")); + } + + // ────────────────────────────────────────────────────────────────────── + // IBatchResult helpers + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task BatchResult_ThrowIfError_ThrowsFirstError() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("kaboom"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("kaboom", ex.Message); + } + + [Fact] + public async Task BatchResult_GetResults_SkipsFailedAndStartedItems() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("ouch"); }, + async (_) => { await Task.Yield(); 
return 30; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 10, 30 }, result.GetResults()); + } + + [Fact] + public async Task BatchResult_AllSucceededFailedStarted_AreInOriginalIndexOrder() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, // index 0 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-1"); }, // index 1 fail + async (_) => { await Task.Yield(); return 3; }, // index 2 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-3"); }, // index 3 fail + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 0, 2 }, result.Succeeded.Select(i => i.Index).ToArray()); + Assert.Equal(new[] { 1, 3 }, result.Failed.Select(i => i.Index).ToArray()); + Assert.Empty(result.Started); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NullBranches_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.ParallelAsync((IReadOnlyList>>)null!)); + } + + [Fact] + public async Task ParallelAsync_NullBranchInList_Throws() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + null!, + }; + + await Assert.ThrowsAsync(() => context.ParallelAsync(branches)); + } + + // ────────────────────────────────────────────────────────────────────── + // Concurrency / cancellation regressions (Critical 1, Critical 2) + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task 
ParallelAsync_CancelMidDispatch_AllBranchesSettleAndNoObjectDisposed() + { + // Regression for orphan-branch bug: dispatch 5 branches with + // MaxConcurrency=2; cancel parent CancellationToken right after the + // first batch starts so the dispatcher's semaphore.WaitAsync trips + // OperationCanceledException mid-loop. With the old code branches in + // flight at cancellation time would Release on a disposed semaphore + // and fault as ObjectDisposedException. With the fix the semaphore + // dispose is gated on Task.WhenAll over inFlight, so every dispatched + // task settles cleanly first. + var (context, _, _, _) = CreateContext(); + + using var cts = new CancellationTokenSource(); + var dispatchedReady = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var dispatchedCount = 0; + var lockObj = new object(); + var capturedExceptions = new List(); + var unobservedCount = 0; + + EventHandler handler = (_, args) => + { + lock (lockObj) + { + Interlocked.Increment(ref unobservedCount); + capturedExceptions.Add(args.Exception); + } + }; + TaskScheduler.UnobservedTaskException += handler; + + try + { + var branches = new Func>[5]; + for (var i = 0; i < 5; i++) + { + branches[i] = async (_) => + { + int n; + lock (lockObj) n = ++dispatchedCount; + if (n == 2) dispatchedReady.TrySetResult(); + // Hold the branch long enough that cancellation arrives + // while we're in flight. + try { await Task.Delay(200, cts.Token).ConfigureAwait(false); } + catch (OperationCanceledException) { /* cooperatively stop */ } + return n; + }; + } + + var run = context.ParallelAsync( + branches, + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }, + cancellationToken: cts.Token); + + // Wait until 2 branches are running, then cancel — this trips + // the dispatcher on its next semaphore.WaitAsync call. 
+ await dispatchedReady.Task.WaitAsync(TimeSpan.FromSeconds(5)); + cts.Cancel(); + + // The orchestrator should surface OperationCanceledException + // cleanly (NOT ObjectDisposedException) once the in-flight + // branches settle. + var ex = await Assert.ThrowsAnyAsync(() => run); + Assert.IsNotType(ex); + + // Force GC + finalizers so any unobserved exceptions surface. + GC.Collect(); + GC.WaitForPendingFinalizers(); + GC.Collect(); + + Assert.Equal(0, Volatile.Read(ref unobservedCount)); + foreach (var captured in capturedExceptions) + { + Assert.IsNotType(captured); + } + } + finally + { + TaskScheduler.UnobservedTaskException -= handler; + } + } + + [Fact] + public void ExecutionState_ConcurrentTrackReplayAndValidate_NoExceptionsAndConsistent() + { + // Regression for ExecutionState race: 16 tasks call TrackReplay / + // ValidateReplayConsistency / GetOperation concurrently. With the + // unguarded Dictionary/HashSet collections this would either throw + // InvalidOperationException (concurrent enumeration) or produce + // torn reads. Under the lock the ops are serialized and consistent. 
+ var state = new ExecutionState(); + var ops = new List(); + var ids = new List(); + for (var i = 0; i < 50; i++) + { + var id = $"op-{i}"; + ids.Add(id); + ops.Add(new Operation + { + Id = id, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = $"name-{i}" + }); + } + state.LoadFromCheckpoint(new InitialExecutionState { Operations = ops }); + + var caught = new List(); + var caughtLock = new object(); + var tasks = new Task[16]; + for (var t = 0; t < 16; t++) + { + var seed = t; + tasks[t] = Task.Run(() => + { + try + { + var rng = new Random(seed); + for (var iter = 0; iter < 200; iter++) + { + var id = ids[rng.Next(ids.Count)]; + state.TrackReplay(id); + state.ValidateReplayConsistency(id, OperationTypes.Context, $"name-{id.Substring(3)}"); + _ = state.GetOperation(id); + _ = state.HasOperation(id); + _ = state.IsReplaying; + } + } + catch (Exception ex) + { + lock (caughtLock) caught.Add(ex); + } + }); + } + + Task.WaitAll(tasks, TimeSpan.FromSeconds(30)); + Assert.Empty(caught); + + // Once every terminal op has been visited, IsReplaying must be false. + Assert.False(state.IsReplaying); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism / failure modes / mixed-status replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplayDeterminism_SameWorkflowProducesSameBranchIds() + { + // Run the same workflow shape twice from scratch and assert the + // branch CONTEXT START IDs are byte-identical. This pins the + // determinism contract: the n-th branch's hashed ID is a pure + // function of (root counter position, branch index). 
+ async Task RunOnce() + { + var (context, recorder, _, _) = CreateContext(); + await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout"); + await recorder.Batcher.DrainAsync(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .Select(o => o.Id!) + .OrderBy(s => s) + .ToArray(); + } + + var run1Ids = await RunOnce(); + var run2Ids = await RunOnce(); + + Assert.Equal(3, run1Ids.Length); + Assert.Equal(run1Ids, run2Ids); + } + + [Fact] + public async Task ParallelAsync_FirstSuccessful_AllFail_AggregatesAsParallelException() + { + // FirstSuccessful() aliases MinSuccessful=1 with no explicit failure + // tolerance. When every branch fails, MinSuccessful is unreachable + // AND there is no failure-tolerance threshold, so the run completes + // as AllCompleted with HasFailure=true. Calling ThrowIfError surfaces + // the first failure; without explicit failure tolerance the parallel + // does NOT throw on its own (matches Python). + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("a"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("b"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("c"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(0, result.SuccessCount); + Assert.Equal(3, result.FailureCount); + Assert.True(result.HasFailure); + + // Caller-driven aggregation: ThrowIfError surfaces the first failure. 
+ var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("a", ex.Message); + } + + [Fact] + public async Task ParallelAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + // Parent SUCCEEDED with MinSuccessful short-circuit: branch 0 + // SUCCEEDED, branch 1 SUCCEEDED, branch 2 was never dispatched + // (still STARTED in the summary). Replay must reproduce the original + // BatchResult shape — including the un-dispatched STARTED entry — + // without re-executing any branch. + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Branches":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Branch 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + }, + name: "fanout"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + +} From 016a60e9239f686686613c7f12a882168d4f73cd Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 12:25:47 -0400 Subject: [PATCH 02/21] fix tests --- .../Internal/ParallelOperation.cs | 5 +++-- .../Internal/ParallelSummary.cs | 7 ------- .../ParallelFirstSuccessfulTest.cs | 8 ++++---- .../ParallelMaxConcurrencyTest.cs | 4 ++-- .../ParallelPartialFailureTest.cs | 6 +++--- 5 files changed, 12 insertions(+), 18 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs index f81d0d19b..5fdd3a2b1 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -1,6 +1,7 @@ using System.IO; using System.Text; using System.Text.Json; +using Amazon.Lambda; using Amazon.Lambda.Core; using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; @@ -75,7 +76,7 @@ await EnqueueAsync(new SdkOperationUpdate { Id = OperationId, Type = OperationTypes.Context, - Action = "START", + 
Action = OperationAction.START, SubType = OperationSubTypes.Parallel, Name = Name }, cancellationToken); @@ -473,7 +474,7 @@ await EnqueueAsync(new SdkOperationUpdate { Id = OperationId, Type = OperationTypes.Context, - Action = failed ? "FAIL" : "SUCCEED", + Action = failed ? OperationAction.FAIL : OperationAction.SUCCEED, SubType = OperationSubTypes.Parallel, Name = Name, Payload = failed ? null : payload, diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs index ca75955b1..15b4e4f71 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs @@ -28,11 +28,4 @@ internal sealed class ParallelBranchSummary [JsonPropertyName("Status")] public string? Status { get; set; } - - // Note: there used to be an OperationId field here, but the replay path - // recomputes the deterministic branch ID from the parent ID + index - // (HashOperationId($"{parentOpId}-{i + 1}")). Carrying the ID on the - // wire was redundant and never read on replay; removed to reduce - // checkpoint size. If the hashing strategy ever changes we'll need a - // versioned recovery path, but that's a separate concern. } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs index 73d8eb685..fedc538fb 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs @@ -40,10 +40,10 @@ public async Task Parallel_FirstSuccessful_ShortCircuitsOnFirstWin() // The workflow's response payload reports the winning branch. 
using var doc = JsonDocument.Parse(responsePayload); - var winnerIndex = doc.RootElement.GetProperty("winnerIndex").GetInt32(); - var winnerName = doc.RootElement.GetProperty("winnerName").GetString(); - var completionReason = doc.RootElement.GetProperty("completionReason").GetString(); - var successCount = doc.RootElement.GetProperty("successCount").GetInt32(); + var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32(); + var winnerName = doc.RootElement.GetProperty("WinnerName").GetString(); + var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); // At least one branch succeeded — the workflow short-circuited as soon // as the first win materialised. diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs index c5fbf14eb..e228cdc22 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs @@ -40,10 +40,10 @@ public async Task Parallel_MaxConcurrency_ThrottlesBranchDispatch() Assert.Equal("SUCCEEDED", status, ignoreCase: true); using var doc = JsonDocument.Parse(responsePayload); - var successCount = doc.RootElement.GetProperty("successCount").GetInt32(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); Assert.Equal(6, successCount); - var timestamps = doc.RootElement.GetProperty("timestamps") + var timestamps = doc.RootElement.GetProperty("Timestamps") .EnumerateArray().Select(t => t.GetInt64()).ToList(); Assert.Equal(6, timestamps.Count); diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs index 839c46b36..28adf7549 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs @@ -39,9 +39,9 @@ public async Task Parallel_PartialFailure_AllCompleted_ReportsCounts() // Decode the workflow result payload and verify the counts surface correctly. using var doc = JsonDocument.Parse(responsePayload); - var successCount = doc.RootElement.GetProperty("successCount").GetInt32(); - var failureCount = doc.RootElement.GetProperty("failureCount").GetInt32(); - var errorSummary = doc.RootElement.GetProperty("errorSummary").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32(); + var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString(); Assert.Equal(2, successCount); Assert.Equal(1, failureCount); From d3543f22156262f8318e8361542e44033842dd69 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 12:35:47 -0400 Subject: [PATCH 03/21] change file --- .autover/changes/durable-parallelasync.json | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .autover/changes/durable-parallelasync.json diff --git a/.autover/changes/durable-parallelasync.json b/.autover/changes/durable-parallelasync.json new file mode 100644 index 000000000..2adf78331 --- /dev/null +++ b/.autover/changes/durable-parallelasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `ParallelAsync` to `IDurableContext` for running multiple workflow branches concurrently with automatic checkpointing. 
Supports configurable max concurrency, failure tolerance, and first-successful completion via `ParallelConfig`, returning an `IBatchResult`." + ] + } + ] +} From 1a1d5bc23bf233ba5d674ba886f277e0f362b0c9 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 12:50:14 -0400 Subject: [PATCH 04/21] Validate CompletionConfig thresholds and honor checkpointed branch names - Add range validation to CompletionConfig.MinSuccessful (>= 1) and ToleratedFailureCount (>= 0), matching the existing ToleratedFailurePercentage setter. Previously zero/negative values produced nonsensical immediate short-circuits. - ReconstructFromCheckpoints now uses the branch Name persisted in the parallel summary instead of always reading the current branch name, and throws NonDeterministicExecutionException on name drift between deployments (the prior path silently ignored summaryEntry.Name). - Correct XML docs for BatchItemStatus.Started / IBatchResult.Started / CompletionConfig.FirstSuccessful: Started means a branch was not dispatched before a completion short-circuit fired (or has no checkpoint on replay), not that it is still running. --- .../BatchItemStatus.cs | 11 +- .../CompletionConfig.cs | 44 +++++- .../IBatchResult.cs | 4 +- .../Internal/ParallelOperation.cs | 18 ++- .../ParallelOperationTests.cs | 126 ++++++++++++++++++ 5 files changed, 191 insertions(+), 12 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs index e07aa4f4c..fdba62d64 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs @@ -6,8 +6,9 @@ namespace Amazon.Lambda.DurableExecution; /// /// Mirrors the wire-state of the per-branch checkpoint at the moment the batch /// resolved. Items that finished produce or -/// ; items still in flight when the batch's -/// short-circuits remain in . 
+/// ; items that were not dispatched because a +/// short-circuit fired are reported as +/// . /// public enum BatchItemStatus { @@ -22,9 +23,9 @@ public enum BatchItemStatus Failed, /// - /// The branch was still in flight when the batch's - /// resolved (e.g., returned - /// before this branch finished). + /// The branch was not dispatched before the batch's + /// resolved (e.g., short-circuited + /// before this branch was started), or no per-branch checkpoint exists on replay. /// Started } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs index 27a15d060..b31873f67 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs @@ -12,13 +12,32 @@ namespace Amazon.Lambda.DurableExecution; /// public sealed class CompletionConfig { + private int? _minSuccessful; + private int? _toleratedFailureCount; private double? _toleratedFailurePercentage; /// /// Minimum number of items required /// before the operation resolves successfully. null = no minimum. /// - public int? MinSuccessful { get; set; } + /// + /// Thrown by the setter if the value is less than 1. A minimum of + /// zero (or negative) would resolve the operation immediately without + /// dispatching any branch. + /// + public int? MinSuccessful + { + get => _minSuccessful; + set + { + if (value is { } v && v < 1) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MinSuccessful must be at least 1."); + } + _minSuccessful = value; + } + } /// /// Maximum tolerated count. When the @@ -26,7 +45,23 @@ public sealed class CompletionConfig /// with . /// null = no count-based failure threshold. /// - public int? ToleratedFailureCount { get; set; } + /// + /// Thrown by the setter if the value is negative. A negative tolerance + /// would fail the operation immediately without dispatching any branch. 
+ /// + public int? ToleratedFailureCount + { + get => _toleratedFailureCount; + set + { + if (value is { } v && v < 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailureCount must be zero or greater."); + } + _toleratedFailureCount = value; + } + } /// /// Maximum tolerated failure ratio, expressed as a value in the range @@ -68,8 +103,9 @@ public double? ToleratedFailurePercentage public static CompletionConfig AllCompleted() => new(); /// - /// Resolve as soon as one branch succeeds. Remaining in-flight branches are - /// reported as . + /// Resolve once at least one branch has succeeded. Branches that were not + /// dispatched before the completion criteria was met are reported as + /// . /// public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs index baa5139d6..90d7e14b7 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs @@ -58,8 +58,8 @@ public interface IBatchResult : IBatchResult IReadOnlyList> Failed { get; } /// - /// Items still in flight when the batch resolved (a - /// short-circuit fired before they finished), + /// Items that were not dispatched when the batch resolved (a + /// short-circuit fired before they were started), /// in original index order. /// IReadOnlyList> Started { get; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs index 5fdd3a2b1..05511cf62 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -497,6 +497,22 @@ private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwO ? 
DeserializeStatus(summaryEntry.Status) : InferStatusFromBranchOp(branchOp); + // Prefer the name that was checkpointed at the moment the batch + // resolved. This is the only authoritative source for branches + // reported as Started (no per-branch checkpoint exists to consult), + // and it lets us detect branch-name drift between deployments. + var currentName = _branches[i].Name; + var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && currentName != null && checkpointedName != currentName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for parallel branch {i} of operation " + + $"'{Name ?? OperationId}': expected name '{currentName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of parallel " + + $"branches between deployments."); + } + var itemName = checkpointedName ?? currentName; + T? branchResult = default; DurableExecutionException? branchError = null; @@ -519,7 +535,7 @@ private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwO items.Add(new BatchItem { Index = i, - Name = _branches[i].Name, + Name = itemName, Status = status, Result = branchResult, Error = branchError diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index 95d9cef40..7c8c109fa 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -302,6 +302,28 @@ public void CompletionConfig_ToleratedFailurePercentage_OutOfRange_Throws() config.ToleratedFailurePercentage = null; } + [Fact] + public void CompletionConfig_MinSuccessful_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.MinSuccessful = 0); + Assert.Throws(() => config.MinSuccessful = -1); + // 1 is the minimum 
meaningful value; null clears the criterion. + config.MinSuccessful = 1; + config.MinSuccessful = null; + } + + [Fact] + public void CompletionConfig_ToleratedFailureCount_Negative_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailureCount = -1); + // zero (fail-fast) and positive counts are valid; null clears the criterion. + config.ToleratedFailureCount = 0; + config.ToleratedFailureCount = 5; + config.ToleratedFailureCount = null; + } + // ────────────────────────────────────────────────────────────────────── // CompletionConfig — first-successful short-circuit // ────────────────────────────────────────────────────────────────────── @@ -1034,4 +1056,108 @@ public async Task ParallelAsync_ReplayMixedStatus_PreservesStartedShortCircuited Assert.Empty(recorder.Flushed); } + [Fact] + public async Task ParallelAsync_ReplayUsesCheckpointedBranchName_NotCurrentName() + { + // The checkpointed name is authoritative on replay. Even when a branch + // has no per-branch checkpoint (STARTED / never dispatched), the name + // from the parent summary must flow through to the reconstructed item. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Branches":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"}, + {"Index":1,"Name":"beta","Status":"STARTED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + var result = await context.ParallelAsync( + new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 999; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + } + + [Fact] + public async Task ParallelAsync_ReplayWithDriftedBranchName_ThrowsNonDeterministic() + { + // A branch name that differs between the checkpoint and the current + // code indicates the branch set was reordered/renamed between + // deployments — surface it rather than silently reconstructing. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Branches":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new[] + { + // Renamed from "alpha" → "renamed" since the checkpoint. + new DurableBranch("renamed", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout")); + } + } From b566c41f2a6cc9cd3255e7058b8d6347b38c8975 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 13:37:25 -0400 Subject: [PATCH 05/21] Add MapAsync for concurrent collection processing Implements IDurableContext.MapAsync, processing a collection in parallel with one child context per item. Mirrors the Python/JS/Java SDKs, where Map is a sibling of Parallel sharing one concurrency engine. - Extract ConcurrentOperation base holding all orchestration, completion, checkpoint, and replay logic; ParallelOperation and MapOperation are thin subclasses supplying only the per-unit (name, func), sub-type labels, and failure-exception factory. - MapConfig defaults CompletionConfig to AllCompleted() (permissive), matching Python/Java Map; intentionally differs from ParallelConfig's AllSuccessful(). Adds ItemNamer; no ItemBatcher (not implemented in any reference SDK). - New MapException so callers can distinguish Map from Parallel failures. 
- Generalize ParallelSummary/ParallelJsonContext into shared BatchSummary/ BatchJsonContext. - Tests: 24 unit tests (MapOperationTests) + 6 integration functions/tests mirroring the Parallel set. Full suite 325/325 on net8.0 and net10.0. --- Docs/durable-execution-design.md | 35 +- .../DurableContext.cs | 35 + .../DurableExecutionException.cs | 33 + .../IDurableContext.cs | 29 + .../Internal/BatchJsonContext.cs | 15 + .../Internal/BatchSummary.cs | 33 + .../Internal/ConcurrentOperation.cs | 702 ++++++++++++++++++ .../Internal/MapOperation.cs | 75 ++ .../Internal/ParallelJsonContext.cs | 15 - .../Internal/ParallelOperation.cs | 639 +--------------- .../Internal/ParallelSummary.cs | 31 - .../MapConfig.cs | 75 ++ .../Operation.cs | 6 + .../MapFailureToleranceTest.cs | 69 ++ .../MapFirstSuccessfulTest.cs | 70 ++ .../MapHappyPathTest.cs | 75 ++ .../MapMaxConcurrencyTest.cs | 69 ++ .../MapPartialFailureTest.cs | 75 ++ .../MapReplayDeterminismTest.cs | 114 +++ .../MapFailureToleranceFunction/Dockerfile | 7 + .../MapFailureToleranceFunction/Function.cs | 55 ++ .../MapFailureToleranceFunction.csproj | 18 + .../MapFirstSuccessfulFunction/Dockerfile | 7 + .../MapFirstSuccessfulFunction/Function.cs | 63 ++ .../MapFirstSuccessfulFunction.csproj | 18 + .../MapHappyPathFunction/Dockerfile | 7 + .../MapHappyPathFunction/Function.cs | 45 ++ .../MapHappyPathFunction.csproj | 18 + .../MapMaxConcurrencyFunction/Dockerfile | 7 + .../MapMaxConcurrencyFunction/Function.cs | 61 ++ .../MapMaxConcurrencyFunction.csproj | 18 + .../MapPartialFailureFunction/Dockerfile | 7 + .../MapPartialFailureFunction/Function.cs | 63 ++ .../MapPartialFailureFunction.csproj | 18 + .../MapReplayDeterminismFunction/Dockerfile | 7 + .../MapReplayDeterminismFunction/Function.cs | 53 ++ .../MapReplayDeterminismFunction.csproj | 18 + .../MapOperationTests.cs | 688 +++++++++++++++++ .../ParallelOperationTests.cs | 10 +- MAP-IMPLEMENTATION-PLAN.md | 234 ++++++ 40 files changed, 2922 insertions(+), 695 deletions(-) 
create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs delete mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs delete mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs create mode 100644 
Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs create mode 100644 MAP-IMPLEMENTATION-PLAN.md diff --git 
a/Docs/durable-execution-design.md b/Docs/durable-execution-design.md index 33d4092a5..5fc728c98 100644 --- a/Docs/durable-execution-design.md +++ b/Docs/durable-execution-design.md @@ -1357,22 +1357,21 @@ public class MapConfig public int? MaxConcurrency { get; set; } /// - /// When to consider the operation complete. + /// When to consider the operation complete. Defaults to AllCompleted() — + /// every item runs regardless of per-item failures, which surface via + /// IBatchResult<T>.Failed rather than throwing. This permissive default + /// matches the Python and Java SDKs' map operation. It differs intentionally + /// from ParallelConfig.CompletionConfig, which defaults to AllSuccessful() + /// (fail-fast). For fail-fast map behavior, set this to + /// CompletionConfig.AllSuccessful() or call IBatchResult<T>.ThrowIfError(). /// - public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); /// /// How item branches are represented in the checkpoint graph. /// public NestingType NestingType { get; set; } = NestingType.Nested; - /// - /// Optional batching configuration for grouping items before processing. - /// When set, items are grouped into batches and each batch is processed as a unit. - /// Reduces checkpoint overhead for large collections. - /// - public ItemBatcher? Batcher { get; set; } - /// /// Optional function to generate a custom name for each item's branch. /// Improves observability in execution traces. Receives the item and its index. @@ -1381,23 +1380,6 @@ public class MapConfig public Func? ItemNamer { get; set; } } -/// -/// Groups items into batches for map operations to reduce checkpoint overhead. -/// At least one of MaxItemsPerBatch or MaxBytesPerBatch must be set. -/// -public class ItemBatcher -{ - /// - /// Maximum number of items per batch. Null = no count limit. - /// - public int? 
MaxItemsPerBatch { get; set; } - - /// - /// Maximum serialized size (bytes) per batch. Null = no size limit. - /// - public int? MaxBytesPerBatch { get; set; } -} - /// /// Defines completion criteria for parallel/map operations. /// @@ -2129,7 +2111,6 @@ All four SDKs expose the same core operations. The differences are naming conven | Jitter strategy | `JitterStrategy` enum on `Exponential()` | `jitter_strategy` on `RetryStrategyConfig` | `jitter` on `createRetryStrategy()` | | Retry presets | `RetryStrategy.None/Default/Transient` | `RetryPresets.none()/default()/transient()` | `retryPresets.default/linear/noRetry` | | Nesting type | `NestingType` on `ParallelConfig`/`MapConfig` | `NestingType` on parallel/map config | `NestingType` on parallel/map config | -| Item batching | `ItemBatcher` on `MapConfig` | `ItemBatcher` on `MapConfig` | *(checkpoint manager handles batching)* | | Item namer | `ItemNamer` on `MapConfig` | Item naming function on `MapConfig` | `itemNamer` on `MapConfig` | | Error mapping | `ErrorMapping` on `ChildContextConfig` | *(typed exception wrapping)* | `errorMapping` on child context config | | Message-based retry filter | `retryableMessagePatterns` (regex) | `retryable_errors` (regex) | `retryableErrors` (RegExp[]) | diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs index 8f360d02a..6a271e670 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -239,6 +239,41 @@ private Task> RunParallel( return op.ExecuteAsync(cancellationToken); } + public Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default) + => RunMap(items, func, name, config, cancellationToken); + + private Task> RunMap( + IReadOnlyList items, + Func, Task> func, + string? name, + MapConfig? 
config, + CancellationToken cancellationToken) + { + if (items == null) throw new ArgumentNullException(nameof(items)); + if (func == null) throw new ArgumentNullException(nameof(func)); + + var effectiveConfig = config ?? new MapConfig(); + if (effectiveConfig.NestingType == NestingType.Flat) + { + throw new NotSupportedException( + "NestingType.Flat is not yet supported in the .NET Durable Execution SDK. " + + "Use NestingType.Nested (the default) for now."); + } + + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new Internal.MapOperation( + operationId, name, _idGenerator.ParentId, items, func, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + public Task WaitForCallbackAsync( Func submitter, string? name = null, diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs index 1b65c86b3..e4748b381 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs @@ -131,3 +131,36 @@ public ParallelException(string message) : base(message) { } /// Creates a wrapping an inner exception. public ParallelException(string message, Exception innerException) : base(message, innerException) { } } + +/// +/// Thrown when a map operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-item outcomes. +/// +/// +/// This is the base type for map failures. Subclasses may be added in future +/// releases; catching remains forward-compatible. +/// A dedicated type (rather than reusing ) lets +/// callers pattern-match which concurrent operation failed. 
+/// +public class MapException : DurableExecutionException +{ + /// + /// The aggregate result of the map operation. Type-erased — cast to + /// IBatchResult<T> if the per-item result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the map operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public MapException() { } + /// Creates a with the given message. + public MapException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public MapException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs index bf4916fd9..a031120fd 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs @@ -217,6 +217,35 @@ Task> ParallelAsync( string? name = null, ParallelConfig? config = null, CancellationToken cancellationToken = default); + + /// + /// Process a collection of items concurrently, running + /// once per item. Each item runs inside its own child context; per-item + /// results are aggregated into an . Items + /// are dispatched up to ; the aggregate + /// resolves according to . + /// + /// + /// The per-item function receives the durable context, the item, its + /// zero-based index, and the full source list (matching the Python and + /// JavaScript SDKs). On per-item failure (the user function throws), the + /// failure is captured on the corresponding + /// instead of aborting the map. By default + /// () every item runs and failures + /// surface via ; the map throws + /// only when + /// criteria are violated. Use + /// for explicit + /// strict-success semantics. Per-item results are serialized to checkpoints + /// using the registered on + /// . 
+ /// + Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default); } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs new file mode 100644 index 000000000..d2bfeb32f --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// AOT-friendly for the internal +/// payload stored on a concurrent operation's parent +/// CONTEXT checkpoint (parallel or map). Only this internal type — never user T — +/// flows through here, so the source-generated metadata is sufficient. +/// +[JsonSerializable(typeof(BatchSummary))] +[JsonSerializable(typeof(BatchUnitSummary))] +internal sealed partial class BatchJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs new file mode 100644 index 000000000..1e58e9654 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs @@ -0,0 +1,33 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Internal payload shape stored on a concurrent operation's parent CONTEXT +/// checkpoint (as ContextDetails.Result) and reconstructed on replay. +/// Shared by both and +/// : carries the completion reason and +/// the per-unit index → status map so the can be +/// rebuilt without depending on user T shape — per-unit results live on the +/// children's own checkpoints. +/// +internal sealed class BatchSummary +{ + [JsonPropertyName("CompletionReason")] + public string? 
CompletionReason { get; set; } + + [JsonPropertyName("Units")] + public IList Units { get; set; } = new List(); +} + +internal sealed class BatchUnitSummary +{ + [JsonPropertyName("Index")] + public int Index { get; set; } + + [JsonPropertyName("Name")] + public string? Name { get; set; } + + [JsonPropertyName("Status")] + public string? Status { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs new file mode 100644 index 000000000..9c28dc6f6 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -0,0 +1,702 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using System.Text.Json; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Shared orchestration base for the concurrent durable operations +/// ( and ). +/// Runs N user-supplied units concurrently (each as a +/// ) under a shared +/// and concurrency limit, persisting the +/// aggregate result so subsequent invocations replay it without re-executing. +/// +/// +/// Subclasses supply only what differs between Parallel and Map — the unit count, +/// how to obtain a unit's (name, func), the parent/child sub-type labels, +/// and the failure-exception factory. All concurrency, completion, checkpoint, and +/// replay logic lives here. +/// +/// Fresh: no prior state → sync-flush parent CONTEXT START → +/// dispatch units respecting MaxConcurrency → wait for in-flight to +/// complete after CompletionConfig short-circuit → emit parent CONTEXT +/// SUCCEED with summary payload (). 
+/// SUCCEEDED: parent payload supplies the snapshot of per-unit +/// statuses + completion reason; per-unit results are deserialised from the +/// children's own CONTEXT checkpoints. +/// FAILED: same reconstruction; throws the subclass exception +/// carrying the rebuilt . +/// STARTED / PENDING: re-execute (children replay from their +/// own checkpoints). +/// +/// Per-unit errors do NOT abort the operation directly — the orchestrator catches +/// each unit's , records it as a failed +/// , and consults the +/// after every completion. Only when the completion config marks the run as +/// does it throw. +/// +internal abstract class ConcurrentOperation : DurableOperation> +{ + private readonly CompletionConfig _completionConfig; + private readonly int? _maxConcurrency; + + /// Serializer used to deserialize per-unit child results on replay. + protected readonly ILambdaSerializer Serializer; + + /// Factory used to build each unit's inner child context. + protected readonly Func ChildContextFactory; + + protected ConcurrentOperation( + string operationId, + string? name, + string? parentId, + CompletionConfig completionConfig, + int? maxConcurrency, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _completionConfig = completionConfig; + _maxConcurrency = maxConcurrency; + Serializer = serializer; + ChildContextFactory = childContextFactory; + } + + protected override string OperationType => OperationTypes.Context; + + // ── Subclass hooks ────────────────────────────────────────────────── + + /// The number of units (branches or items) to execute. + protected abstract int UnitCount { get; } + + /// Parent CONTEXT sub-type label (e.g. Parallel / Map). 
+ protected abstract string ParentSubType { get; } + + /// Per-unit child-context sub-type label (e.g. ParallelBranch / MapItem). + protected abstract string ChildSubType { get; } + + /// Singular operation noun used in messages (e.g. "Parallel" / "Map"). + protected abstract string OperationNoun { get; } + + /// Plural unit noun used in messages (e.g. "branches" / "items"). + protected abstract string UnitNounPlural { get; } + + /// + /// Resolves the unit at into its display name and the + /// function to run inside the unit's child context. + /// + protected abstract (string? Name, Func> Func) GetUnit(int index); + + /// + /// Builds the subclass-specific exception thrown when the operation resolves + /// with . + /// + protected abstract DurableExecutionException CreateException(string message, IBatchResult result); + + // ── Orchestration ─────────────────────────────────────────────────── + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush parent CONTEXT START. Mirrors ChildContextOperation: if a + // unit suspends (e.g., a Wait inside it), the service needs to know the + // parent existed. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = ParentSubType, + Name = Name + }, cancellationToken); + + return await ExecuteUnitsAsync(cancellationToken); + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + switch (existing.Status) + { + case OperationStatuses.Succeeded: + return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); + + case OperationStatuses.Failed: + // Reconstruct so the caller (and the exception's Result) sees the + // per-unit outcomes; then throw. 
+ var failed = ReconstructFromCheckpoints(existing, throwOnFailure: false); + throw BuildException(failed); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run: units replay from their own checkpoints. + return ExecuteUnitsAsync(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"{OperationNoun} operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task> ExecuteUnitsAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var unitCount = UnitCount; + var slots = new UnitOutcome[unitCount]; + var dispatched = new bool[unitCount]; + + var maxConcurrency = _maxConcurrency ?? unitCount; + // Optimisation: when MaxConcurrency >= unitCount, skip the semaphore + // entirely. Behaviour is identical, allocations are lower. (Also covers + // the empty-collection case, where unitCount == 0 and no unit runs.) + var semaphore = (maxConcurrency >= unitCount || unitCount == 0) + ? null + : new SemaphoreSlim(maxConcurrency, maxConcurrency); + + var minSuccessful = _completionConfig.MinSuccessful; + var toleratedFailureCount = _completionConfig.ToleratedFailureCount; + var toleratedFailurePercentage = _completionConfig.ToleratedFailurePercentage; + + var succeeded = 0; + var failed = 0; + + var inFlight = new List(unitCount); + + // Units run with the parent's token so cooperative cancellation still + // propagates into user code, but we must NOT abandon already-dispatched + // units while they're still writing checkpoints — that would diverge + // between the original run and replay. The dispatch loop and + // Task.WhenAll below therefore await every in-flight task even when + // cancellation fires; the semaphore is disposed only after those units + // have settled (success, failure, or cooperative OCE). 
+ try + { + try + { + for (var i = 0; i < unitCount; i++) + { + // Volatile reads pair with the Interlocked.Increment writes + // in the onComplete callback. Reads are non-atomic across + // the two counters: at worst we observe slightly stale + // values and dispatch one extra unit before the next + // completion forces a re-check. That's acceptable — the + // post-loop ComputeCompletionReason is the source of truth. + var succSnap = Volatile.Read(ref succeeded); + var failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + break; + } + + if (semaphore != null) + { + await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false); + // Re-check after acquiring: the wait may have unblocked + // because earlier units finished and short-circuited the + // operation. + succSnap = Volatile.Read(ref succeeded); + failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + semaphore.Release(); + break; + } + } + + var index = i; + dispatched[index] = true; + inFlight.Add(RunUnitAsync(index, slots, semaphore, cancellationToken, + onComplete: outcome => + { + if (outcome.Status == BatchItemStatus.Succeeded) + Interlocked.Increment(ref succeeded); + else if (outcome.Status == BatchItemStatus.Failed) + Interlocked.Increment(ref failed); + })); + } + } + finally + { + // CRITICAL: wait for every dispatched unit — even on the + // exceptional path (parent-token cancellation mid-dispatch, or a + // synchronous throw out of the loop) — before the semaphore is + // disposed. Otherwise surviving units' Release() calls hit + // ObjectDisposedException, the tasks become unobserved, and they + // keep writing checkpoints out from under us. 
+ // + // We deliberately DO NOT cancel already-running units when a + // short-circuit fires — orphan units that continue writing + // checkpoints would diverge between the original run and replay. + // Letting them finish guarantees determinism: all dispatched units + // end up Succeeded or Failed. Only un-dispatched units surface as + // Started. + if (inFlight.Count > 0) + { + try + { + await Task.WhenAll(inFlight).ConfigureAwait(false); + } + catch + { + // Swallow here — Task.WhenAll only surfaces the first + // exception, but every unit task is now in a terminal + // state and we want to inspect each one individually below + // to decide whether to surface a workflow-level error. The + // Task objects themselves still carry their exceptions, so + // this swallow does not orphan them. + } + } + } + } + finally + { + semaphore?.Dispose(); + } + + // Surface any workflow-level exception (e.g. NonDeterministicExecutionException) + // raised inside a unit. RunUnitAsync re-throws DurableExecutionException + // (other than ChildContextException which is captured into the slot) so the + // task faults with that exception. Take the first such failure: these are + // structural errors, not "unit failed gracefully" outcomes. + foreach (var t in inFlight) + { + if (t.IsFaulted && t.Exception is { } agg) + { + foreach (var inner in agg.InnerExceptions) + { + if (inner is DurableExecutionException dex && inner is not ChildContextException) + { + throw dex; + } + } + } + } + + // Re-throw any pending parent-token cancellation now that units have + // settled and the semaphore has been disposed cleanly. + cancellationToken.ThrowIfCancellationRequested(); + + // Build BatchItems for every unit in original order. 
+ var items = new List>(unitCount); + for (var i = 0; i < unitCount; i++) + { + var (unitName, _) = GetUnit(i); + if (dispatched[i]) + { + var outcome = slots[i]; + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = outcome.Status, + Result = outcome.Status == BatchItemStatus.Succeeded ? outcome.Result : default, + Error = outcome.Status == BatchItemStatus.Failed ? outcome.Error : null + }); + } + else + { + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = BatchItemStatus.Started, + Result = default, + Error = null + }); + } + } + + var completionReason = ComputeCompletionReason(items, unitCount); + var result = new BatchResult(items, completionReason); + + var failureException = completionReason == CompletionReason.FailureToleranceExceeded + ? BuildException(result) + : null; + + await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); + + if (failureException != null) + { + throw failureException; + } + + return result; + } + + private async Task RunUnitAsync( + int index, + UnitOutcome[] slots, + SemaphoreSlim? semaphore, + CancellationToken cancellationToken, + Action onComplete) + { + try + { + var (unitName, unitFunc) = GetUnit(index); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); + + var childOp = new ChildContextOperation( + childOpId, + unitName, + OperationId, + unitFunc, + new ChildContextConfig { SubType = ChildSubType }, + Serializer, + ChildContextFactory, + State, + Termination, + DurableExecutionArn, + Batcher); + + try + { + var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); + slots[index] = new UnitOutcome { Status = BatchItemStatus.Succeeded, Result = result }; + } + catch (ChildContextException ex) + { + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = ex }; + } + catch (DurableExecutionException) + { + // E.g. 
NonDeterministicExecutionException — these are not "unit + // failed gracefully" but workflow-level problems. Surface them: + // re-throw out of the operation without writing a slot (the + // orchestrator's outer flow handles it). + throw; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Parent-token cancellation: per cross-cutting decision Q10, OCE + // escapes unwrapped. Don't write a slot — Task.WhenAll observes + // this and the orchestrator re-throws after settling. + throw; + } + catch (OperationCanceledException ex) + { + // Unit-internal cancellation that is NOT tied to the parent token + // (e.g. the unit's own CancellationTokenSource fired). Treat it as + // a normal per-unit failure rather than killing the operation as + // cancelled. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + catch (Exception ex) + { + // Wrap unexpected exceptions as ChildContextException — they're + // per-unit failures from the user's POV. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + + onComplete(slots[index]); + } + finally + { + // Defensive: with this structure the semaphore is only disposed after + // Task.WhenAll(inFlight) has settled, so this Release should always + // succeed. ObjectDisposedException would indicate a bug elsewhere, but + // we tolerate it here so the task doesn't fault with a noise exception + // that masks the real one. + try + { + semaphore?.Release(); + } + catch (ObjectDisposedException) + { + } + } + } + + private static bool ShouldStopDispatching( + int succeeded, + int failed, + int totalUnits, + int? minSuccessful, + int? toleratedFailureCount, + double? 
toleratedFailurePercentage) + { + // Min-successful: short-circuit the moment we have enough wins. + if (minSuccessful is { } min && succeeded >= min) + return true; + + // Failure thresholds short-circuit on too many losses. + if (toleratedFailureCount is { } tfc && failed > tfc) + return true; + + if (toleratedFailurePercentage is { } tfp && totalUnits > 0) + { + var ratio = (double)failed / totalUnits; + if (ratio > tfp) return true; + } + + return false; + } + + private CompletionReason ComputeCompletionReason(IReadOnlyList> items, int totalCount) + { + var failed = 0; + var succeeded = 0; + var started = 0; + + foreach (var item in items) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded++; break; + case BatchItemStatus.Failed: failed++; break; + case BatchItemStatus.Started: started++; break; + } + } + + // Failure tolerance: only short-circuit-by-failure when at least one + // failure threshold is explicitly set. The factory CompletionConfig.AllSuccessful() + // sets ToleratedFailureCount = 0 to opt into fail-fast; an "empty" + // CompletionConfig (all properties null) is permissive. + if (_completionConfig.ToleratedFailureCount is { } tfc && failed > tfc) + return CompletionReason.FailureToleranceExceeded; + + if (_completionConfig.ToleratedFailurePercentage is { } tfp && totalCount > 0) + { + var ratio = (double)failed / totalCount; + if (ratio > tfp) return CompletionReason.FailureToleranceExceeded; + } + + // Min-successful satisfied (and we didn't run all units): MinSuccessfulReached. + if (_completionConfig.MinSuccessful is { } min && succeeded >= min && started > 0) + { + return CompletionReason.MinSuccessfulReached; + } + + // Every dispatched unit finished one way or the other (or all-completed + // without any failure criteria). 
+ return CompletionReason.AllCompleted; + } + + private DurableExecutionException BuildException(IBatchResult result) + { + var message = + $"{OperationNoun} operation failed: failure tolerance exceeded " + + $"({result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed)."; + return CreateException(message, result); + } + + private async Task CheckpointParentResultAsync( + BatchResult result, + CompletionReason completionReason, + DurableExecutionException? failureException, + CancellationToken cancellationToken) + { + var summary = new BatchSummary + { + CompletionReason = SerializeCompletionReason(completionReason), + Units = new List(result.All.Count) + }; + for (var i = 0; i < result.All.Count; i++) + { + var item = result.All[i]; + summary.Units.Add(new BatchUnitSummary + { + Index = item.Index, + Name = item.Name, + Status = SerializeStatus(item.Status) + }); + } + + var payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + var failed = failureException != null; + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = failed ? OperationAction.FAIL : OperationAction.SUCCEED, + SubType = ParentSubType, + Name = Name, + Payload = failed ? null : payload, + Error = failed ? BuildAggregateError(result, failureException!) : null + }, cancellationToken); + } + + private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwOnFailure) + { + var summary = ParseSummary(parent.ContextDetails?.Result); + + var items = new List>(UnitCount); + for (var i = 0; i < UnitCount; i++) + { + var (unitName, _) = GetUnit(i); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{i + 1}"); + var childOp = State.GetOperation(childOpId); + var summaryEntry = summary?.Units.FirstOrDefault(b => b.Index == i); + + BatchItemStatus status = summaryEntry != null + ? 
DeserializeStatus(summaryEntry.Status) + : InferStatusFromChildOp(childOp); + + // Prefer the name that was checkpointed at the moment the batch + // resolved. This is the only authoritative source for units reported + // as Started (no per-unit checkpoint exists to consult), and it lets + // us detect unit-name drift between deployments. + var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && unitName != null && checkpointedName != unitName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for {OperationNoun.ToLowerInvariant()} unit {i} of operation " + + $"'{Name ?? OperationId}': expected name '{unitName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of concurrent " + + $"units between deployments."); + } + var resolvedName = checkpointedName ?? unitName; + + T? unitResult = default; + DurableExecutionException? unitError = null; + + if (status == BatchItemStatus.Succeeded && childOp?.ContextDetails?.Result != null) + { + unitResult = DeserializeResult(childOp.ContextDetails.Result); + } + else if (status == BatchItemStatus.Failed && childOp?.ContextDetails?.Error != null) + { + var err = childOp.ContextDetails.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = childOp.SubType ?? ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + + items.Add(new BatchItem + { + Index = i, + Name = resolvedName, + Status = status, + Result = unitResult, + Error = unitError + }); + } + + var completionReason = summary != null + ? 
DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, UnitCount); + + var result = new BatchResult(items, completionReason); + + if (throwOnFailure && completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildException(result); + } + + return result; + } + + private static BatchItemStatus InferStatusFromChildOp(Operation? childOp) + { + if (childOp == null) return BatchItemStatus.Started; + return childOp.Status switch + { + OperationStatuses.Succeeded => BatchItemStatus.Succeeded, + OperationStatuses.Failed => BatchItemStatus.Failed, + _ => BatchItemStatus.Started + }; + } + + private SdkErrorObject BuildAggregateError(IBatchResult result, DurableExecutionException failureException) + { + return new SdkErrorObject + { + ErrorType = failureException.GetType().FullName, + ErrorMessage = + $"{OperationNoun} operation failed: {result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed." + }; + } + + private static BatchSummary? ParseSummary(string? payload) + { + if (string.IsNullOrEmpty(payload)) return null; + try + { + return JsonSerializer.Deserialize(payload, BatchJsonContext.Default.BatchSummary); + } + catch (JsonException) + { + // Tolerate older / corrupted payloads — fall back to inferring status + // from per-unit checkpoints. + return null; + } + } + + private static string SerializeStatus(BatchItemStatus status) => status switch + { + BatchItemStatus.Succeeded => "SUCCEEDED", + BatchItemStatus.Failed => "FAILED", + BatchItemStatus.Started => "STARTED", + _ => throw new ArgumentOutOfRangeException(nameof(status)) + }; + + private static BatchItemStatus DeserializeStatus(string? 
wire) => wire switch + { + "SUCCEEDED" => BatchItemStatus.Succeeded, + "FAILED" => BatchItemStatus.Failed, + "STARTED" => BatchItemStatus.Started, + _ => BatchItemStatus.Started + }; + + private static string SerializeCompletionReason(CompletionReason reason) => reason switch + { + CompletionReason.AllCompleted => "ALL_COMPLETED", + CompletionReason.MinSuccessfulReached => "MIN_SUCCESSFUL_REACHED", + CompletionReason.FailureToleranceExceeded => "FAILURE_TOLERANCE_EXCEEDED", + _ => throw new ArgumentOutOfRangeException(nameof(reason)) + }; + + private static CompletionReason DeserializeCompletionReason(string? wire) => wire switch + { + "ALL_COMPLETED" => CompletionReason.AllCompleted, + "MIN_SUCCESSFUL_REACHED" => CompletionReason.MinSuccessfulReached, + "FAILURE_TOLERANCE_EXCEEDED" => CompletionReason.FailureToleranceExceeded, + _ => CompletionReason.AllCompleted + }; + + private T DeserializeResult(string serialized) + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return Serializer.Deserialize(ms); + } + + /// + /// Internal scratch space tracking each unit's outcome as it lands in the + /// executor; copied into the user-facing once every + /// dispatched unit has settled. + /// + private struct UnitOutcome + { + public BatchItemStatus Status; + public T? Result; + public DurableExecutionException? Error; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs new file mode 100644 index 000000000..14df87c15 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs @@ -0,0 +1,75 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.Globalization; +using Amazon.Lambda; +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable map operation. 
Processes a collection in parallel, running the +/// user-supplied function once per item — each as a +/// <see cref="ChildContextOperation{T}"/>. All orchestration, completion, +/// checkpoint, and replay logic lives in <see cref="ConcurrentOperation{T}"/>; +/// this subclass supplies only the map-specific bits: how to turn an item index +/// into a (name, func) pair (the per-item callback receives the item, its +/// index, and the full source list), the Map sub-type labels, and the +/// <see cref="MapException"/> factory. +/// +internal sealed class MapOperation : ConcurrentOperation +{ + private readonly IReadOnlyList _items; + private readonly Func, Task> _func; + private readonly Func? _itemNamer; + + public MapOperation( + string operationId, + string? name, + string? parentId, + IReadOnlyList items, + Func, Task> func, + MapConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher) + { + _items = items; + _func = func; + _itemNamer = config.ItemNamer; + } + + protected override int UnitCount => _items.Count; + protected override string ParentSubType => OperationSubTypes.Map; + protected override string ChildSubType => OperationSubTypes.MapItem; + protected override string OperationNoun => "Map"; + protected override string UnitNounPlural => "items"; + + protected override (string? Name, Func> Func) GetUnit(int index) + { + var item = _items[index]; + // Default name is the index — matches the unnamed-branch convention in + // ParallelAsync. A custom ItemNamer can derive a readable name from the + // item's content. Child operation IDs are derived from the index, but replay + // compares names with the checkpointed summary, so ItemNamer must be deterministic. + var name = _itemNamer is not null + ? 
_itemNamer(item!, index) + : index.ToString(CultureInfo.InvariantCulture); + + return (name, ctx => _func(ctx, item, index, _items)); + } + + protected override DurableExecutionException CreateException(string message, IBatchResult result) + { + return new MapException(message) + { + Result = result, + CompletionReason = result.CompletionReason + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs deleted file mode 100644 index 9b830a59a..000000000 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelJsonContext.cs +++ /dev/null @@ -1,15 +0,0 @@ -using System.Text.Json.Serialization; - -namespace Amazon.Lambda.DurableExecution.Internal; - -/// -/// AOT-friendly for the internal -/// payload stored on a parallel parent's CONTEXT -/// checkpoint. Only this internal type — never user T — flows through here, so -/// the source-generated metadata is sufficient. -/// -[JsonSerializable(typeof(ParallelSummary))] -[JsonSerializable(typeof(ParallelBranchSummary))] -internal sealed partial class ParallelJsonContext : JsonSerializerContext -{ -} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs index 05511cf62..8eff97668 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -1,49 +1,22 @@ -using System.IO; -using System.Text; -using System.Text.Json; +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + using Amazon.Lambda; using Amazon.Lambda.Core; -using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; -using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; namespace Amazon.Lambda.DurableExecution.Internal; /// -/// Durable parallel operation. Runs N user-supplied branches concurrently -/// (each as a ) under a shared -/// and concurrency limit, persisting the -/// aggregate result so subsequent invocations replay it without re-executing. +/// Durable parallel operation. Runs N user-supplied branches concurrently, +/// each as a <see cref="ChildContextOperation{T}"/>. All orchestration, +/// completion, checkpoint, and replay logic lives in +/// <see cref="ConcurrentOperation{T}"/>; this subclass supplies only the +/// branch-specific bits (unit count, per-branch (name, func), sub-type +/// labels, and the failure-exception factory). /// -/// -/// Replay branches — example: await ctx.ParallelAsync(funcs, name: "fetch") -/// -/// Fresh: no prior state → sync-flush parent CONTEXT START → -/// dispatch branches respecting MaxConcurrency → wait for in-flight to -/// complete after CompletionConfig short-circuit → emit parent CONTEXT -/// SUCCEED with summary payload (). -/// SUCCEEDED: parent payload supplies the snapshot of per- -/// branch statuses + completion reason; per-branch results are -/// deserialised from the children's own CONTEXT checkpoints. -/// FAILED: same reconstruction; throws -/// carrying the rebuilt -/// . -/// STARTED / PENDING: re-execute (children replay from -/// their own checkpoints). -/// -/// Per-branch errors do NOT abort the parallel directly — the orchestrator -/// catches each branch's , records it as a -/// failed , and consults the -/// after every completion. Only when the -/// completion config marks the run as -/// does the parallel -/// throw. 
-/// -internal sealed class ParallelOperation : DurableOperation> +internal sealed class ParallelOperation : ConcurrentOperation { private readonly IReadOnlyList> _branches; - private readonly ParallelConfig _config; - private readonly ILambdaSerializer _serializer; - private readonly Func _childContextFactory; public ParallelOperation( string operationId, @@ -57,598 +30,30 @@ public ParallelOperation( TerminationManager termination, string durableExecutionArn, CheckpointBatcher? batcher = null) - : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher) { _branches = branches; - _config = config; - _serializer = serializer; - _childContextFactory = childContextFactory; - } - - protected override string OperationType => OperationTypes.Context; - - protected override async Task> StartAsync(CancellationToken cancellationToken) - { - // Sync-flush parent CONTEXT START. Mirrors ChildContextOperation: if a - // branch suspends (e.g., a Wait inside a branch), the service needs to - // know the parallel parent existed. - await EnqueueAsync(new SdkOperationUpdate - { - Id = OperationId, - Type = OperationTypes.Context, - Action = OperationAction.START, - SubType = OperationSubTypes.Parallel, - Name = Name - }, cancellationToken); - - return await ExecuteBranchesAsync(cancellationToken); - } - - protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) - { - switch (existing.Status) - { - case OperationStatuses.Succeeded: - return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); - - case OperationStatuses.Failed: - // Reconstruct so the caller (and ParallelException.Result) sees - // the per-branch outcomes; then throw. 
- var failed = ReconstructFromCheckpoints(existing, throwOnFailure: false); - throw BuildParallelException(failed); - - case OperationStatuses.Started: - case OperationStatuses.Pending: - // Re-run: branches replay from their own checkpoints. - return ExecuteBranchesAsync(cancellationToken); - - default: - throw new NonDeterministicExecutionException( - $"Parallel operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); - } - } - - private async Task> ExecuteBranchesAsync(CancellationToken cancellationToken) - { - cancellationToken.ThrowIfCancellationRequested(); - - var branchCount = _branches.Count; - var slots = new BranchOutcome[branchCount]; - var dispatched = new bool[branchCount]; - - var maxConcurrency = _config.MaxConcurrency ?? branchCount; - // Optimisation: when MaxConcurrency >= branchCount, skip the semaphore - // entirely. Behaviour is identical, allocations are lower. - var semaphore = (maxConcurrency >= branchCount) ? null : new SemaphoreSlim(maxConcurrency, maxConcurrency); - - var minSuccessful = _config.CompletionConfig.MinSuccessful; - var toleratedFailureCount = _config.CompletionConfig.ToleratedFailureCount; - var toleratedFailurePercentage = _config.CompletionConfig.ToleratedFailurePercentage; - - var succeeded = 0; - var failed = 0; - - var inFlight = new List(branchCount); - - // Branches run with the parent's token so cooperative cancellation - // still propagates into user code, but we must NOT abandon already- - // dispatched branches while they're still writing checkpoints — that - // would diverge between the original run and replay. The dispatch - // loop and Task.WhenAll below therefore await every in-flight task - // even when cancellation fires; the semaphore is disposed only after - // those branches have settled (success, failure, or cooperative OCE). 
- try - { - try - { - for (var i = 0; i < branchCount; i++) - { - // Volatile reads pair with the Interlocked.Increment writes - // in the onComplete callback. Reads are non-atomic across - // the two counters: at worst we observe slightly stale - // values and dispatch one extra branch before the next - // completion forces a re-check. That's acceptable — the - // post-loop ComputeCompletionReason is the source of truth. - var succSnap = Volatile.Read(ref succeeded); - var failSnap = Volatile.Read(ref failed); - if (ShouldStopDispatching(succSnap, failSnap, branchCount, - minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) - { - break; - } - - if (semaphore != null) - { - await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false); - // Re-check after acquiring: the wait may have unblocked - // because earlier branches finished and short-circuited - // the operation. - succSnap = Volatile.Read(ref succeeded); - failSnap = Volatile.Read(ref failed); - if (ShouldStopDispatching(succSnap, failSnap, branchCount, - minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) - { - semaphore.Release(); - break; - } - } - - var index = i; - dispatched[index] = true; - inFlight.Add(RunBranchAsync(index, slots, semaphore, cancellationToken, - onComplete: outcome => - { - if (outcome.Status == BatchItemStatus.Succeeded) - Interlocked.Increment(ref succeeded); - else if (outcome.Status == BatchItemStatus.Failed) - Interlocked.Increment(ref failed); - })); - } - } - finally - { - // CRITICAL: wait for every dispatched branch — even on the - // exceptional path (parent-token cancellation mid-dispatch, or - // a synchronous throw out of the loop) — before the semaphore - // is disposed. Otherwise surviving branches' Release() calls - // hit ObjectDisposedException, the tasks become unobserved, - // and they keep writing checkpoints out from under us. 
- // - // We deliberately DO NOT cancel already-running branches when - // a short-circuit fires — orphan branches that continue - // writing checkpoints would diverge between the original run - // and replay. Letting them finish guarantees determinism: all - // dispatched branches end up Succeeded or Failed. Only - // un-dispatched branches surface as Started. - if (inFlight.Count > 0) - { - try - { - await Task.WhenAll(inFlight).ConfigureAwait(false); - } - catch - { - // Swallow here — Task.WhenAll only surfaces the first - // exception, but every branch task is now in a terminal - // state and we want to inspect each one individually - // below to decide whether to surface a workflow-level - // error. The Task objects themselves still carry their - // exceptions, so this swallow does not orphan them. - } - } - } - } - finally - { - semaphore?.Dispose(); - } - - // Surface any workflow-level exception (e.g. NonDeterministicExecutionException) - // raised inside a branch. RunBranchAsync re-throws DurableExecutionException - // (other than ChildContextException which is captured into the slot) so the - // task faults with that exception. Take the first such failure: these are - // structural errors, not "branch failed gracefully" outcomes. - foreach (var t in inFlight) - { - if (t.IsFaulted && t.Exception is { } agg) - { - foreach (var inner in agg.InnerExceptions) - { - if (inner is DurableExecutionException dex && inner is not ChildContextException) - { - throw dex; - } - } - } - } - - // Re-throw any pending parent-token cancellation now that branches - // have settled and the semaphore has been disposed cleanly. - cancellationToken.ThrowIfCancellationRequested(); - - // Build BatchItems for every branch in original order. 
- var items = new List>(branchCount); - for (var i = 0; i < branchCount; i++) - { - if (dispatched[i]) - { - var outcome = slots[i]; - items.Add(new BatchItem - { - Index = i, - Name = _branches[i].Name, - Status = outcome.Status, - Result = outcome.Status == BatchItemStatus.Succeeded ? outcome.Result : default, - Error = outcome.Status == BatchItemStatus.Failed ? outcome.Error : null - }); - } - else - { - items.Add(new BatchItem - { - Index = i, - Name = _branches[i].Name, - Status = BatchItemStatus.Started, - Result = default, - Error = null - }); - } - } - - var completionReason = ComputeCompletionReason(items, branchCount); - var result = new BatchResult(items, completionReason); - - await CheckpointParentResultAsync(result, completionReason, cancellationToken); - - if (completionReason == CompletionReason.FailureToleranceExceeded) - { - throw BuildParallelException(result); - } - - return result; - } - - private async Task RunBranchAsync( - int index, - BranchOutcome[] slots, - SemaphoreSlim? semaphore, - CancellationToken cancellationToken, - Action onComplete) - { - try - { - var branch = _branches[index]; - var branchOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); - - var childOp = new ChildContextOperation( - branchOpId, - branch.Name, - OperationId, - branch.Func, - new ChildContextConfig { SubType = OperationSubTypes.ParallelBranch }, - _serializer, - _childContextFactory, - State, - Termination, - DurableExecutionArn, - Batcher); - - try - { - var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); - slots[index] = new BranchOutcome { Status = BatchItemStatus.Succeeded, Result = result }; - } - catch (ChildContextException ex) - { - slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = ex }; - } - catch (DurableExecutionException) - { - // E.g. NonDeterministicExecutionException — these are not - // "branch failed gracefully" but workflow-level problems. 
- // Surface them: re-throw out of the parallel without writing - // a slot (the orchestrator's outer flow handles it). - throw; - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - // Parent-token cancellation: per cross-cutting decision Q10, - // OCE escapes unwrapped. Don't write a slot — Task.WhenAll - // observes this and the orchestrator re-throws after settling. - throw; - } - catch (OperationCanceledException ex) - { - // Branch-internal cancellation that is NOT tied to the parent - // token (e.g. the branch's own CancellationTokenSource fired). - // Treat it as a normal per-branch failure rather than killing - // the parallel as cancelled. - var wrapped = new ChildContextException(ex.Message, ex) - { - SubType = OperationSubTypes.ParallelBranch, - ErrorType = ex.GetType().FullName - }; - slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; - } - catch (Exception ex) - { - // Wrap unexpected exceptions as ChildContextException — they're - // per-branch failures from the user's POV. - var wrapped = new ChildContextException(ex.Message, ex) - { - SubType = OperationSubTypes.ParallelBranch, - ErrorType = ex.GetType().FullName - }; - slots[index] = new BranchOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; - } - - onComplete(slots[index]); - } - finally - { - // Defensive: with the new structure the semaphore is only disposed - // after Task.WhenAll(inFlight) has settled, so this Release should - // always succeed. ObjectDisposedException would indicate a bug - // elsewhere, but we tolerate it here so the task doesn't fault - // with a noise exception that masks the real one. - try - { - semaphore?.Release(); - } - catch (ObjectDisposedException) - { - } - } - } - - private static bool ShouldStopDispatching( - int succeeded, - int failed, - int totalBranches, - int? minSuccessful, - int? toleratedFailureCount, - double? 
toleratedFailurePercentage) - { - // Min-successful: short-circuit the moment we have enough wins. - if (minSuccessful is { } min && succeeded >= min) - return true; - - // Failure thresholds short-circuit on too many losses. - if (toleratedFailureCount is { } tfc && failed > tfc) - return true; - - if (toleratedFailurePercentage is { } tfp && totalBranches > 0) - { - var ratio = (double)failed / totalBranches; - if (ratio > tfp) return true; - } - - return false; - } - - private CompletionReason ComputeCompletionReason(IReadOnlyList> items, int totalCount) - { - var failed = 0; - var succeeded = 0; - var started = 0; - - foreach (var item in items) - { - switch (item.Status) - { - case BatchItemStatus.Succeeded: succeeded++; break; - case BatchItemStatus.Failed: failed++; break; - case BatchItemStatus.Started: started++; break; - } - } - - // Failure tolerance: only short-circuit-by-failure when at least one - // failure threshold is explicitly set. The factory CompletionConfig.AllSuccessful() - // sets ToleratedFailureCount = 0 to opt into fail-fast; an "empty" - // CompletionConfig (all properties null) is permissive. - if (_config.CompletionConfig.ToleratedFailureCount is { } tfc && failed > tfc) - return CompletionReason.FailureToleranceExceeded; - - if (_config.CompletionConfig.ToleratedFailurePercentage is { } tfp && totalCount > 0) - { - var ratio = (double)failed / totalCount; - if (ratio > tfp) return CompletionReason.FailureToleranceExceeded; - } - - // Min-successful satisfied (and we didn't run all branches): MinSuccessfulReached. - if (_config.CompletionConfig.MinSuccessful is { } min && succeeded >= min && started > 0) - { - return CompletionReason.MinSuccessfulReached; - } - - // Every dispatched branch finished one way or the other (or all-completed - // without any failure criteria). 
- return CompletionReason.AllCompleted; - } - - private async Task CheckpointParentResultAsync( - BatchResult result, - CompletionReason completionReason, - CancellationToken cancellationToken) - { - var summary = new ParallelSummary - { - CompletionReason = SerializeCompletionReason(completionReason), - Branches = new List(result.All.Count) - }; - for (var i = 0; i < result.All.Count; i++) - { - var item = result.All[i]; - summary.Branches.Add(new ParallelBranchSummary - { - Index = item.Index, - Name = item.Name, - Status = SerializeStatus(item.Status) - }); - } - - var payload = JsonSerializer.Serialize(summary, ParallelJsonContext.Default.ParallelSummary); - var failed = completionReason == CompletionReason.FailureToleranceExceeded; - - await EnqueueAsync(new SdkOperationUpdate - { - Id = OperationId, - Type = OperationTypes.Context, - Action = failed ? OperationAction.FAIL : OperationAction.SUCCEED, - SubType = OperationSubTypes.Parallel, - Name = Name, - Payload = failed ? null : payload, - Error = failed ? BuildAggregateError(result) : null - }, cancellationToken); } - private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwOnFailure) - { - var summary = ParseSummary(parent.ContextDetails?.Result); - - var items = new List>(_branches.Count); - for (var i = 0; i < _branches.Count; i++) - { - var branchOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{i + 1}"); - var branchOp = State.GetOperation(branchOpId); - var summaryEntry = summary?.Branches.FirstOrDefault(b => b.Index == i); - - BatchItemStatus status = summaryEntry != null - ? DeserializeStatus(summaryEntry.Status) - : InferStatusFromBranchOp(branchOp); - - // Prefer the name that was checkpointed at the moment the batch - // resolved. This is the only authoritative source for branches - // reported as Started (no per-branch checkpoint exists to consult), - // and it lets us detect branch-name drift between deployments. 
- var currentName = _branches[i].Name; - var checkpointedName = summaryEntry?.Name; - if (checkpointedName != null && currentName != null && checkpointedName != currentName) - { - throw new NonDeterministicExecutionException( - $"Non-deterministic execution detected for parallel branch {i} of operation " + - $"'{Name ?? OperationId}': expected name '{currentName}' but found '{checkpointedName}' " + - $"from a previous invocation. Code must not change the order or name of parallel " + - $"branches between deployments."); - } - var itemName = checkpointedName ?? currentName; - - T? branchResult = default; - DurableExecutionException? branchError = null; - - if (status == BatchItemStatus.Succeeded && branchOp?.ContextDetails?.Result != null) - { - branchResult = DeserializeBranchResult(branchOp.ContextDetails.Result); - } - else if (status == BatchItemStatus.Failed && branchOp?.ContextDetails?.Error != null) - { - var err = branchOp.ContextDetails.Error; - branchError = new ChildContextException(err.ErrorMessage ?? "Branch failed") - { - SubType = branchOp.SubType ?? OperationSubTypes.ParallelBranch, - ErrorType = err.ErrorType, - ErrorData = err.ErrorData, - OriginalStackTrace = err.StackTrace - }; - } - - items.Add(new BatchItem - { - Index = i, - Name = itemName, - Status = status, - Result = branchResult, - Error = branchError - }); - } - - var completionReason = summary != null - ? 
DeserializeCompletionReason(summary.CompletionReason) - : ComputeCompletionReason(items, _branches.Count); - - var result = new BatchResult(items, completionReason); - - if (throwOnFailure && completionReason == CompletionReason.FailureToleranceExceeded) - { - throw BuildParallelException(result); - } - - return result; - } + protected override int UnitCount => _branches.Count; + protected override string ParentSubType => OperationSubTypes.Parallel; + protected override string ChildSubType => OperationSubTypes.ParallelBranch; + protected override string OperationNoun => "Parallel"; + protected override string UnitNounPlural => "branches"; - private static BatchItemStatus InferStatusFromBranchOp(Operation? branchOp) + protected override (string? Name, Func> Func) GetUnit(int index) { - if (branchOp == null) return BatchItemStatus.Started; - return branchOp.Status switch - { - OperationStatuses.Succeeded => BatchItemStatus.Succeeded, - OperationStatuses.Failed => BatchItemStatus.Failed, - _ => BatchItemStatus.Started - }; + var branch = _branches[index]; + return (branch.Name, branch.Func); } - private static ParallelException BuildParallelException(IBatchResult result) + protected override DurableExecutionException CreateException(string message, IBatchResult result) { - return new ParallelException( - $"Parallel operation failed: failure tolerance exceeded ({result.FailureCount} of {result.TotalCount} branches failed).") + return new ParallelException(message) { Result = result, CompletionReason = result.CompletionReason }; } - - private static SdkErrorObject BuildAggregateError(IBatchResult result) - { - return new SdkErrorObject - { - ErrorType = typeof(ParallelException).FullName, - ErrorMessage = $"Parallel operation failed: {result.FailureCount} of {result.TotalCount} branches failed." - }; - } - - private static ParallelSummary? ParseSummary(string? 
payload) - { - if (string.IsNullOrEmpty(payload)) return null; - try - { - return JsonSerializer.Deserialize(payload, ParallelJsonContext.Default.ParallelSummary); - } - catch (JsonException) - { - // Tolerate older / corrupted payloads — fall back to inferring status - // from per-branch checkpoints. - return null; - } - } - - private static string SerializeStatus(BatchItemStatus status) => status switch - { - BatchItemStatus.Succeeded => "SUCCEEDED", - BatchItemStatus.Failed => "FAILED", - BatchItemStatus.Started => "STARTED", - _ => throw new ArgumentOutOfRangeException(nameof(status)) - }; - - private static BatchItemStatus DeserializeStatus(string? wire) => wire switch - { - "SUCCEEDED" => BatchItemStatus.Succeeded, - "FAILED" => BatchItemStatus.Failed, - "STARTED" => BatchItemStatus.Started, - _ => BatchItemStatus.Started - }; - - private static string SerializeCompletionReason(CompletionReason reason) => reason switch - { - CompletionReason.AllCompleted => "ALL_COMPLETED", - CompletionReason.MinSuccessfulReached => "MIN_SUCCESSFUL_REACHED", - CompletionReason.FailureToleranceExceeded => "FAILURE_TOLERANCE_EXCEEDED", - _ => throw new ArgumentOutOfRangeException(nameof(reason)) - }; - - private static CompletionReason DeserializeCompletionReason(string? wire) => wire switch - { - "ALL_COMPLETED" => CompletionReason.AllCompleted, - "MIN_SUCCESSFUL_REACHED" => CompletionReason.MinSuccessfulReached, - "FAILURE_TOLERANCE_EXCEEDED" => CompletionReason.FailureToleranceExceeded, - _ => CompletionReason.AllCompleted - }; - - private T DeserializeBranchResult(string serialized) - { - var bytes = Encoding.UTF8.GetBytes(serialized); - using var ms = new MemoryStream(bytes); - return _serializer.Deserialize(ms); - } - - /// - /// Internal scratch space tracking each branch's outcome as it lands in - /// the executor; copied into the user-facing - /// once every dispatched branch has settled. 
- /// - private struct BranchOutcome - { - public BatchItemStatus Status; - public T? Result; - public DurableExecutionException? Error; - } } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs deleted file mode 100644 index 15b4e4f71..000000000 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelSummary.cs +++ /dev/null @@ -1,31 +0,0 @@ -using System.Text.Json.Serialization; - -namespace Amazon.Lambda.DurableExecution.Internal; - -/// -/// Internal payload shape stored on a parallel parent's CONTEXT checkpoint -/// (as ContextDetails.Result) and reconstructed on replay. Carries the -/// completion reason and the per-branch index → status map so the -/// can be rebuilt without depending on user T -/// shape — per-branch results live on the children's own checkpoints. -/// -internal sealed class ParallelSummary -{ - [JsonPropertyName("CompletionReason")] - public string? CompletionReason { get; set; } - - [JsonPropertyName("Branches")] - public IList Branches { get; set; } = new List(); -} - -internal sealed class ParallelBranchSummary -{ - [JsonPropertyName("Index")] - public int Index { get; set; } - - [JsonPropertyName("Name")] - public string? Name { get; set; } - - [JsonPropertyName("Status")] - public string? Status { get; set; } -} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs new file mode 100644 index 000000000..967e5d17c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs @@ -0,0 +1,75 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// . +/// +/// +/// Per-item checkpoint payloads are serialized via the +/// registered on +/// (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. 
+/// +public sealed class MapConfig +{ + private int? _maxConcurrency; + + /// <summary> + /// Maximum number of items processed concurrently. null (default) = + /// unlimited. Must be at least 1 when set. + /// </summary> + /// <exception cref="ArgumentOutOfRangeException"> + /// Thrown by the setter if the value is less than or equal to 0. + /// </exception> + public int? MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// <summary> + /// When the map operation is considered complete. Defaults to + /// <see cref="CompletionConfig.AllCompleted"/> — every item runs regardless + /// of per-item failures, which are surfaced via + /// <see cref="IBatchResult{T}"/> rather than thrown. + /// </summary> + /// <remarks> + /// This permissive default matches the Python and Java SDKs' map operation. + /// It differs intentionally from <see cref="ParallelConfig.CompletionConfig"/>, + /// which defaults to <see cref="CompletionConfig.AllSuccessful"/> (fail-fast). + /// For fail-fast map behavior — any item failure surfaces a + /// <see cref="MapException"/> when the result is awaited — set this to + /// <see cref="CompletionConfig.AllSuccessful"/>, or call + /// <see cref="IBatchResult{T}.ThrowIfError"/> on the result. + /// </remarks> + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); + + /// <summary> + /// How item branches are represented in the checkpoint graph. Defaults to + /// <see cref="NestingType.Nested"/>. + /// </summary> + /// <remarks> + /// <see cref="NestingType.Flat"/> is not yet supported in the .NET SDK and + /// will throw <see cref="NotSupportedException"/> when the map + /// operation is invoked. + /// </remarks> + public NestingType NestingType { get; set; } = NestingType.Nested; + + /// <summary> + /// Optional function to generate a custom name for each item's branch. + /// Receives the item and its zero-based index, and returns the branch name + /// surfaced in execution traces and on <see cref="IBatchItem{T}.Name"/>. + /// When null (default), branches are named by index ("0", + /// "1", ...), matching <c>ParallelAsync</c>. + /// </summary> + public Func? 
ItemNamer { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs index c81be9f3f..ebe99ba27 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -201,6 +201,12 @@ public static class OperationSubTypes /// Parallel branch (per-branch child-context) sub-type. public const string ParallelBranch = "ParallelBranch"; + + /// Map parent sub-type. + public const string Map = "Map"; + + /// Map item (per-item child-context) sub-type. + public const string MapItem = "MapItem"; } /// diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs new file mode 100644 index 000000000..06ab716c0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs @@ -0,0 +1,69 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFailureToleranceTest +{ + private readonly ITestOutputHelper _output; + public MapFailureToleranceTest(ITestOutputHelper output) => _output = output; + + /// + /// Five items, two fail, ToleratedFailureCount=1. The map must surface a + /// with reason + /// ; the workflow must + /// terminate FAILED. Validates the failure-tolerance short-circuit and that + /// MapException (not ParallelException) propagates as the + /// workflow's terminal error. 
+ /// + [Fact] + public async Task Map_FailureToleranceExceeded_FailsWorkflow() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFailureToleranceFunction"), + "mtol", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m3"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload to the Invoke caller — locate the + // execution by name to inspect its terminal status. + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + // MapException is the terminal error type the SDK throws when the + // failure-tolerance short-circuit fires. + var errorType = execution.Error.ErrorType ?? string.Empty; + var errorMessage = execution.Error.ErrorMessage ?? string.Empty; + Assert.True( + errorType.Contains("MapException", StringComparison.Ordinal) + || errorMessage.Contains("Map", StringComparison.OrdinalIgnoreCase), + $"Expected error to indicate MapException; got type='{errorType}' message='{errorMessage}'"); + + // History: parent CONTEXT and at least 2 failed item contexts visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3 + && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + Assert.True( + events.Count(e => e.EventType == EventType.ContextFailed) >= 2, + $"Expected >= 2 ContextFailed events; got {events.Count(e => e.EventType == EventType.ContextFailed)}"); + + // The parent context (named "tolerance") records the aggregate failure. + var parentFailed = events.FirstOrDefault(e => + e.EventType == EventType.ContextFailed && e.Name == "tolerance"); + Assert.NotNull(parentFailed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs new file mode 100644 index 000000000..737e70a2f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFirstSuccessfulTest +{ + private readonly ITestOutputHelper _output; + public MapFirstSuccessfulTest(ITestOutputHelper output) => _output = output; + + /// + /// Four items with staggered durable waits, FirstSuccessful: as soon + /// as one item completes, the map resolves. In-flight items remain in + /// rather than being cancelled. + /// Validates the cross-cutting decision: orphan units are NOT cancelled, and + /// short-circuit reports them as Started. 
+ /// + [Fact] + public async Task Map_FirstSuccessful_ShortCircuitsOnFirstWin() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFirstSuccessfulFunction"), + "mfirst", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m4"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Wait timer = 8s, plus invocation overhead. Generous timeout for CI variance. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32(); + var winnerName = doc.RootElement.GetProperty("WinnerName").GetString(); + var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + + // At least one item succeeded — the workflow short-circuited as soon as + // the first win materialised. The fastest item is index 1 (1s wait). + Assert.True(successCount >= 1, $"Expected >= 1 successful item, got {successCount}"); + Assert.True(winnerIndex >= 0 && winnerIndex < 4, + $"WinnerIndex should be a valid item index, got {winnerIndex}"); + Assert.NotNull(winnerName); + Assert.NotEqual("FailureToleranceExceeded", completionReason); + + // Service-side: the parent CONTEXT and at least the winning item CONTEXT + // succeeded. Other items' final state is timing-dependent (the + // orchestrator does not cancel in-flight units on short-circuit). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var parentSucceeded = events.FirstOrDefault(e => + e.EventType == EventType.ContextSucceeded && e.Name == "race"); + Assert.NotNull(parentSucceeded); + + // The winning item's CONTEXT SUCCEEDED is in the history. + Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs new file mode 100644 index 000000000..6ee451049 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs @@ -0,0 +1,75 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapHappyPathTest +{ + private readonly ITestOutputHelper _output; + public MapHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy-path map: three items each processed in a step, and the + /// workflow returns the joined results. Validates the parent CONTEXT and + /// per-item CONTEXT checkpoints all land in the service-side history with the + /// correct (ItemNamer-derived) names and ordering. 
+ /// + [Fact] + public async Task Map_AllItemsSucceed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapHappyPathFunction"), + "mhappy", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m1"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The user-visible payload contains all three item outputs in index + // order (the SDK preserves index order even when items race). + Assert.Contains("order-1-m1", responsePayload); + Assert.Contains("order-2-m1", responsePayload); + Assert.Contains("order-3-m1", responsePayload); + + // History is eventually consistent — wait until the parent CONTEXT and + // all three item CONTEXT checkpoints are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Parent + 3 items = 4 ContextStarted, 4 ContextSucceeded. + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded)); + + // The three items show up by their ItemNamer name on their own + // ContextStarted events. 
+ var startedNames = events + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Name) + .ToList(); + Assert.Contains("process_all", startedNames); + Assert.Contains("item-order-1", startedNames); + Assert.Contains("item-order-2", startedNames); + Assert.Contains("item-order-3", startedNames); + + // Each item ran one step => 3 StepSucceeded. + Assert.Equal(3, events.Count(e => e.EventType == EventType.StepSucceeded)); + + // No item failed. + Assert.Empty(events.Where(e => e.EventType == EventType.ContextFailed)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs new file mode 100644 index 000000000..7c55418e7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs @@ -0,0 +1,69 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapMaxConcurrencyTest +{ + private readonly ITestOutputHelper _output; + public MapMaxConcurrencyTest(ITestOutputHelper output) => _output = output; + + /// + /// 6 items, each with a 2-second durable wait, MaxConcurrency = 2. Validates + /// the semaphore actually throttles dispatch: timestamps must cluster into + /// waves rather than all six firing simultaneously. Timing tolerance is + /// intentionally generous to avoid CI flakiness; the load-bearing assertion + /// is "not all 6 ran at once". 
+ /// + [Fact] + public async Task Map_MaxConcurrency_ThrottlesItemDispatch() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapMaxConcurrencyFunction"), + "mmaxc", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m5"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 waves x 2s waits + invocation overhead. Allow generous headroom. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + Assert.Equal(6, successCount); + + var timestamps = doc.RootElement.GetProperty("Timestamps") + .EnumerateArray().Select(t => t.GetInt64()).ToList(); + Assert.Equal(6, timestamps.Count); + + var sorted = timestamps.OrderBy(t => t).ToList(); + var minTs = sorted[0]; + var relative = sorted.Select(t => t - minTs).ToList(); + _output.WriteLine($"Relative timestamps (ms): {string.Join(", ", relative)}"); + + // Tolerant clustering: with MaxConcurrency=2 and 2s waits, the first wave + // should hold ~2 items. Strict 3-wave clustering can be flaky under + // service jitter, so we assert the weaker (still meaningful) property: + // not all 6 items fired in the same wave. + var firstWave = relative.Where(r => r < 1500).Count(); + Assert.True(firstWave <= 3, + $"Expected MaxConcurrency=2 to limit the first wave to ~2 items; got {firstWave} within 1500ms of start. 
" + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + + // The full set must span at least one wave-gap (~2s) — proving items did + // NOT all run at once. + var total = sorted[^1] - sorted[0]; + Assert.True(total >= 1500, + $"Expected items to span >= 1500ms (proves throttling); got {total}ms. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs new file mode 100644 index 000000000..6a29c18df --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs @@ -0,0 +1,75 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapPartialFailureTest +{ + private readonly ITestOutputHelper _output; + public MapPartialFailureTest(ITestOutputHelper output) => _output = output; + + /// + /// Three items, one throws, two succeed — with NO config supplied. Map's + /// default CompletionConfig is AllCompleted() (permissive), + /// unlike Parallel's AllSuccessful(). This validates the headline + /// Map-vs-Parallel behavioral difference end-to-end: a partial failure does + /// NOT fail the workflow; it surfaces success/failure counts and per-item + /// errors through the service round-trip and back into the rebuilt + /// . 
+ /// + [Fact] + public async Task Map_PartialFailure_DefaultIsPermissive_ReportsCounts() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapPartialFailureFunction"), + "mpartial", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m2"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + // Permissive default means partial failure is NOT a workflow failure — + // the workflow accepted the failure and returned a result. + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32(); + var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString(); + + Assert.Equal(2, successCount); + Assert.Equal(1, failureCount); + Assert.NotNull(errorSummary); + Assert.Contains("intentional partial failure", errorSummary); + + // History: 1 parent + 3 items = 4 ContextStarted; 3 ContextSucceeded + // (parent + 2 ok items); 1 ContextFailed (the boom item). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false) + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List(); + + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded)); + Assert.Equal(1, events.Count(e => e.EventType == EventType.ContextFailed)); + + // The failing item's checkpoint preserves the exception message. Its + // branch name is the default index ("1", the middle item). + var failedEvent = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed); + Assert.NotNull(failedEvent); + Assert.Equal("1", failedEvent!.Name); + Assert.Contains("intentional partial failure", + failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs new file mode 100644 index 000000000..02b867958 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs @@ -0,0 +1,114 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public MapReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + /// + /// Each item's operation ID must equal SHA-256(parentOpId + "-" + (index+1)) + /// (matching OperationIdGenerator's CreateChild contract). Reproduced locally + /// because OperationIdGenerator is internal to the SDK. 
+ /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// Three map items, each containing a step + a durable wait (the wait forces + /// a suspend/resume cycle so the map actually replays). Verifies: + /// 1. The item operation IDs match the deterministic + /// SHA256("<parentId>-<n>") formula (the same one used by + /// OperationIdGenerator.CreateChild and the reference Java/JS/Python SDKs). + /// 2. Each item's user-visible step result is preserved across replay (the + /// GUID generated inside generate survives suspend/resume). + /// + [Fact] + public async Task Map_ItemOperationIds_AreDeterministic_AcrossReplay() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapReplayDeterminismFunction"), + "mreplay", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m6"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The map parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var expectedItemIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + + // Wait until each item's CONTEXT SUCCEEDED is visible AND each item's + // step/wait events are visible (they live under the item operation IDs). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Item operation IDs match the deterministic hash. + var itemStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedItemIds = itemStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedItemIds.Count); + foreach (var expected in expectedItemIds) + { + Assert.Contains(expected, observedItemIds); + } + + // 2. Each item's CONTEXT succeeded (parent named "fanout" excluded). + var itemSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, itemSucceededEvents.Count); + + // 3. Each item's "generate" step succeeded exactly once — proving replay + // returned the cached step result rather than re-executing. + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response contains the per-item step results + // (proving they survived replay). 
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..62712b6a4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs @@ -0,0 +1,55 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Five items, two throw. 
ToleratedFailureCount = 1 means a second failure + // exceeds tolerance and the map surfaces a MapException — terminating the + // workflow FAILED. + var items = new[] { "ok1", "bad1", "ok2", "bad2", "ok3" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item.StartsWith("bad")) + throw new InvalidOperationException($"{item} boom"); + return item; + }, + name: "tolerance", + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Should not reach here — the map must throw MapException. + return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install 
-y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..d083a054b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Four items, each waits a different (durable) duration. The shortest + // wait should win and short-circuit the map via FirstSuccessful. Wait + // durations are at least 1s (service timer granularity). The item value + // IS the wait-seconds; the result is the item's index. 
+ var waitSeconds = new[] { 8, 1, 5, 6 }; + + var batch = await context.MapAsync( + waitSeconds, + async (ctx, seconds, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(seconds), name: $"wait_{index}"); + return index; + }, + name: "race", + config: new MapConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + var winner = batch.Succeeded.FirstOrDefault(); + return new TestResult + { + Status = "completed", + WinnerIndex = winner?.Index ?? -1, + WinnerName = winner?.Name, + CompletionReason = batch.CompletionReason.ToString(), + SuccessCount = batch.SuccessCount, + StartedCount = batch.StartedCount + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile @@ -0,0 
+1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs new file mode 100644 index 000000000..14da119f8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs @@ -0,0 +1,45 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var orders = new[] { "order-1", "order-2", "order-3" }; + + // Each item is processed inside a step so the per-item child context + // owns a leaf operation. ItemNamer gives each item a readable branch + // name in the service-side history. 
+ var batch = await context.MapAsync( + orders, + async (ctx, orderId, index, all) => + await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return $"{orderId}-{input.OrderId}"; }, + name: "process"), + name: "process_all", + config: new MapConfig { ItemNamer = (item, index) => $"item-{item}" }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs new 
file mode 100644 index 000000000..0499a7a93 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 items, MaxConcurrency = 2. Each item does a 2-second durable wait + // then captures the post-wait wall-clock as a unix-ms timestamp. The + // expected outcome is 3 waves of 2 items; total elapsed ~6s. Use + // IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT durable + // and would skew this measurement under replay. + var items = new[] { 0, 1, 2, 3, 4, 5 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{index}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }, + name: "throttled", + config: new MapConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int SuccessCount { get; set; } + public long[]? Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs new file mode 100644 index 000000000..39676c3ed --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) 
+ { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items, the middle one throws. Map's DEFAULT CompletionConfig is + // AllCompleted() (permissive) — unlike Parallel's AllSuccessful() — so NO + // config is supplied here and the map must still drive every item to a + // terminal state without throwing. This is the key Map-vs-Parallel + // behavioral difference, validated end-to-end. + var items = new[] { "ok1", "boom", "ok2" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item == "boom") + throw new InvalidOperationException("intentional partial failure"); + return item; + }, + name: "partial"); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..9a75cbd5e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs @@ -0,0 +1,53 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items. Each item generates a fresh GUID inside a step, then does + // a durable wait. The wait forces a suspend/resume cycle, so the second + // invocation MUST replay the cached GUID rather than re-running the step. + // If replay determinism is broken, the GUID would change between the + // original execution and replay. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs new file mode 100644 index 000000000..e67345760 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs @@ -0,0 +1,688 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class MapOperationTests +{ + /// <summary>Reproduces the Id that <see cref="OperationIdGenerator"/> emits for the n-th root-level operation.</summary> + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// <summary>The hashed ID of the n-th child operation under <paramref name="parentOpId"/>.</summary> + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? 
initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FreshExecution_AllItemsSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var items = new[] { 10, 20, 30 }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => { await Task.Yield(); return item * 2; }, + name: "double_all"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 20, 40, 60 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 item CONTEXT STARTs + 3 item CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Map:START", contextActions[0]); + Assert.Equal("Map:SUCCEED", contextActions[^1]); + } + + [Fact] + public async Task MapAsync_PassesItemIndexAndFullList_ToCallback() + { + var (context, _, _, _) = CreateContext(); 
+ + var items = new[] { "a", "b", "c" }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.Yield(); + // Confirm the callback sees the item, its index, and the whole list. + Assert.Same(items, all); + Assert.Equal(items[index], item); + return $"{index}:{item}:{all.Count}"; + }); + + Assert.Equal(new[] { "0:a:3", "1:b:3", "2:c:3" }, result.GetResults()); + } + + [Fact] + public async Task MapAsync_PreservesIndexOrder_EvenWhenItemsCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 40, 10, 20 }, + async (ctx, delay, index, all) => { await Task.Delay(delay); return index + 1; }); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task MapAsync_ItemOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { "a", "b" }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstItemId = ChildIdAt(parentOpId, 1); + var secondItemId = ChildIdAt(parentOpId, 2); + + var itemStarts = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .ToArray(); + Assert.Equal(2, itemStarts.Length); + Assert.Contains(itemStarts, o => o.Id == firstItemId); + Assert.Contains(itemStarts, o => o.Id == secondItemId); + } + + [Fact] + public async Task MapAsync_DefaultNaming_UsesIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task MapAsync_ItemNamer_PropagatesNameToCheckpointAndItem() 
+ { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { "order-1", "order-2" }, + async (ctx, item, index, all) => { await Task.Yield(); return item.Length; }, + name: "process_orders", + config: new MapConfig { ItemNamer = (item, index) => $"Order-{item}" }); + + Assert.Equal("Order-order-1", result.All[0].Name); + Assert.Equal("Order-order-2", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var itemSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-1"); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-2"); + } + + [Fact] + public async Task MapAsync_EmptyCollection_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + Array.Empty(), + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Map:START", "Map:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — Map's permissive default vs fail-fast opt-in + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_AllCompletedDefault_PartialFailureDoesNotThrow() + { + // Map's default CompletionConfig is AllCompleted() (permissive), unlike + // Parallel's AllSuccessful(). A single item failure is captured rather + // than thrown. 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("oops"); + return item; + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task MapAsync_AllSuccessfulOptIn_OneFailureThrowsMapException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("item boom"); + return item; + }, + config: new MapConfig { CompletionConfig = CompletionConfig.AllSuccessful() })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + public async Task MapAsync_ThrowIfError_ThrowsUnderPermissiveDefault() + { + // The permissive default does not auto-throw; ThrowIfError is the + // explicit strict-success check. 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("boom"); + return item; + }); + + Assert.True(result.HasFailure); + var thrown = Assert.ThrowsAny(() => result.ThrowIfError()); + Assert.Contains("boom", thrown.Message); + } + + [Fact] + public async Task MapAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item != 3) throw new InvalidOperationException($"fail-{item}"); + return item; + }, + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first/min-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so dispatch order is deterministic: item 0 fires + // first and succeeds; items 1 and 2 are never dispatched and remain + // BatchItemStatus.Started. 
+ var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var result = await context.MapAsync( + new[] { 1, 2, 3, 4, 5 }, + async (ctx, item, index, all) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return item; + }, + config: new MapConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + } + + [Fact] + public async Task MapAsync_MaxConcurrencyAtLeastItemCount_RunsWithoutSemaphore() + { + // MaxConcurrency >= item count exercises the no-semaphore optimization + // path; behavior must be identical (all items still run). 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig { MaxConcurrency = 10 }); + + Assert.Equal(3, result.SuccessCount); + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + } + + [Fact] + public void MapConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new MapConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + [Fact] + public void MapConfig_DefaultCompletionConfig_IsAllCompleted() + { + // Guards the intentional divergence from ParallelConfig (AllSuccessful). + var config = new MapConfig(); + // AllCompleted() == empty CompletionConfig (no failure thresholds). + Assert.Null(config.CompletionConfig.ToleratedFailureCount); + Assert.Null(config.CompletionConfig.MinSuccessful); + Assert.Null(config.CompletionConfig.ToleratedFailurePercentage); + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NestingTypeFlat_ThrowsNotSupported() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig { NestingType = NestingType.Flat })); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NullItems_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync( + null!, + async (ctx, item, index, all) => { await Task.Yield(); return item; })); + } + + [Fact] + 
public async Task MapAsync_NullFunc_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync(new[] { 1 }, (Func, Task>)null!)); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "double_all", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "double_all"); + + // Cached results returned without re-executing the callback. 
+ Assert.Equal(0, calls); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Item 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "m"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayFailed_RebuildsResultAndThrows() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails + { + Error = new ErrorObject { ErrorMessage = "stored failure", ErrorType = "System.InvalidOperationException" } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + } + + [Fact] + public async Task 
MapAsync_ReplayWithDriftedItemName_ThrowsNonDeterministic() + { + // A checkpointed item name that differs from the current ItemNamer output + // indicates the item set was reordered/renamed between deployments. + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m", + // Namer now yields "renamed" instead of the checkpointed "alpha". + config: new MapConfig { ItemNamer = (item, index) => "renamed" })); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_TwoFreshRuns_ProduceIdenticalItemOperationIds() + { + // Item operation IDs are derived from the parent op ID + index, so two + // independent fresh runs of the same workflow shape must emit the same + // child IDs (the foundation of replay correctness). 
+ string[] IdsFromRun() + { + var (context, recorder, _, _) = CreateContext(); + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }).GetAwaiter().GetResult(); + recorder.Batcher.DrainAsync().GetAwaiter().GetResult(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .Select(o => o.Id) + .OrderBy(id => id) + .ToArray(); + } + + var first = IdsFromRun(); + var second = IdsFromRun(); + + Assert.Equal(3, first.Length); + Assert.Equal(first, second); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index 7c8c109fa..160a56e1f 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -464,7 +464,7 @@ public async Task ParallelAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() var b1 = ChildIdAt(parentOpId, 2); var summaryJson = """ - {"CompletionReason":"ALL_COMPLETED","Branches":[ + {"CompletionReason":"ALL_COMPLETED","Units":[ {"Index":0,"Name":"0","Status":"SUCCEEDED","OperationId":"placeholder0"}, {"Index":1,"Name":"1","Status":"SUCCEEDED","OperationId":"placeholder1"} ]} @@ -529,7 +529,7 @@ public async Task ParallelAsync_ReplayFailed_ThrowsParallelException() var b1 = ChildIdAt(parentOpId, 2); var summaryJson = """ - {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Branches":[ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ {"Index":0,"Name":"0","Status":"FAILED","OperationId":"placeholder0"}, {"Index":1,"Name":"1","Status":"FAILED","OperationId":"placeholder1"} ]} @@ -991,7 +991,7 @@ public async Task ParallelAsync_ReplayMixedStatus_PreservesStartedShortCircuited var b1 = ChildIdAt(parentOpId, 2); var summaryJson = """ - {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Branches":[ + 
{"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ {"Index":0,"Name":"0","Status":"SUCCEEDED"}, {"Index":1,"Name":"1","Status":"SUCCEEDED"}, {"Index":2,"Name":"2","Status":"STARTED"} @@ -1066,7 +1066,7 @@ public async Task ParallelAsync_ReplayUsesCheckpointedBranchName_NotCurrentName( var b0 = ChildIdAt(parentOpId, 1); var summaryJson = """ - {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Branches":[ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ {"Index":0,"Name":"alpha","Status":"SUCCEEDED"}, {"Index":1,"Name":"beta","Status":"STARTED"} ]} @@ -1120,7 +1120,7 @@ public async Task ParallelAsync_ReplayWithDriftedBranchName_ThrowsNonDeterminist var b0 = ChildIdAt(parentOpId, 1); var summaryJson = """ - {"CompletionReason":"ALL_COMPLETED","Branches":[ + {"CompletionReason":"ALL_COMPLETED","Units":[ {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} ]} """; diff --git a/MAP-IMPLEMENTATION-PLAN.md b/MAP-IMPLEMENTATION-PLAN.md new file mode 100644 index 000000000..ab6d6e915 --- /dev/null +++ b/MAP-IMPLEMENTATION-PLAN.md @@ -0,0 +1,234 @@ +# MapAsync Implementation Plan (.NET Durable Execution SDK) — Wave 2 + +Tracking: follow-up to `ParallelAsync` (DOTNET-8662). This document is the +agreed plan before any code is written. It captures the cross-SDK research, +the locked-in decisions, and the concrete file-by-file changes. + +--- + +## 1. Background & research summary + +`MapAsync` processes a collection in parallel with configurable concurrency. +It is the sibling of the already-shipped `ParallelAsync`. The design doc +(`Docs/durable-execution-design.md`) specifies the public surface: + +```csharp +Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? 
config = null, + CancellationToken cancellationToken = default); +``` + +### Cross-SDK findings (Python / JavaScript / Java) + +| Aspect | Python | JavaScript | Java | Conclusion for .NET | +|--------|--------|------------|------|---------------------| +| Map vs Parallel | Siblings over shared `ConcurrentExecutor` | Siblings over shared `executeItemsConcurrently` engine | Siblings over shared `ConcurrencyOperation` | **Extract a shared base; Map & Parallel are thin subclasses.** | +| Per-item callback | `(ctx, item, index, items)` | `(ctx, item, index, array)` | `(item, index, ctx)` | Our design doc uses **`(ctx, item, index, allItems)`** — matches Python/JS (context-first). ✅ | +| Item → branch | 1 item = 1 child context | 1 item = 1 child context | 1 item = 1 child context | Same — reuse `ChildContextOperation` per item. | +| `ItemBatcher` | Config dataclass, **never wired into execution** | **Does not exist** | **Does not exist** | **Remove entirely** (decision below). | +| Default `CompletionConfig` | `all_successful()` (Parallel), permissive (Map) | fail-fast (both) | `allCompleted()` (both) | **Map default = `AllCompleted()`** (Python/Java majority); Parallel stays `AllSuccessful()`. | +| `toleratedFailurePercentage` units | 0–100 | 0–1 | 0–1 | Ours is **0.0–1.0** (already validated in `CompletionConfig`). ✅ | +| Subtypes | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | Add `Map` / `MapItem` constants. | +| Naming | `map-item-{i}` or `item_namer(item,i)` | `map-item-{i}` or `itemNamer(item,i)` | `{name}-iteration-{i}` | `ItemNamer(item, index)`; default = index string (consistent with Parallel's branch naming). | +| Empty collection | empty result, `ALL_COMPLETED` | empty result, `ALL_COMPLETED` | empty result (not replayable) | Empty → empty `BatchResult`, `AllCompleted`. | + +### Locked-in decisions (from user) + +1. **Extract a shared `ConcurrentOperation` base class.** Parallel and Map + become thin subclasses. 
(All three reference SDKs do this.) +2. **Remove `ItemBatcher` entirely** — no reference SDK implements it. Strip it + from `MapConfig` AND from the design doc. +3. **`MapConfig.CompletionConfig` defaults to `AllCompleted()`** (permissive), + matching Python + Java Map. Parallel's `AllSuccessful()` default is correct + and stays as-is (matches Python + JS Parallel). + +### Decisions NOT revisited + +- **Parallel default `AllSuccessful()`** — confirmed correct (Python + JS + majority). Not changing. +- **Empty `CompletionConfig` = permissive in .NET** (vs JS's empty = fail-fast). + Deliberate per DESIGN-QUESTIONS.md Q3 / REVIEW.md. Our model uses explicit + named factories (`AllSuccessful()` = `{ToleratedFailureCount=0}`, + `AllCompleted()` = empty). Map's permissive default is the explicit + `AllCompleted()` factory, so it never depends on the empty-config edge case. +- **One `MapAsync` overload** (not the 4 in the stale DESIGN-QUESTIONS.md). The + shipped serializer model pulls `ILambdaSerializer` from + `ILambdaContext.Serializer` via `LambdaSerializerHelper.GetRequired`, so the + `ICheckpointSerializer` AOT overloads do not apply. The design doc's single + signature is authoritative. + +--- + +## 2. 
Reuse map (what Map borrows from Parallel) + +| Component | Action | +|-----------|--------| +| `DurableOperation` base | Reuse unchanged | +| `ExecutionState` (thread-safe, `_lock`-guarded) | Reuse unchanged — REVIEW.md race already fixed | +| `OperationIdGenerator` / `HashOperationId` | Reuse unchanged — child IDs derived as `Hash($"{OperationId}-{index+1}")` in the base | +| `ChildContextOperation` | Reuse unchanged — each item runs as one child context | +| `BatchResult` / `BatchItem` | Reuse unchanged | +| `IBatchResult` / `IBatchItem` / `BatchItemStatus` | Reuse unchanged | +| `CompletionConfig` / `CompletionReason` / `NestingType` | Reuse unchanged | +| `ParallelSummary` / `ParallelJsonContext` | Generalize into a shared `BatchSummary` (see Step 3) | + +--- + +## 3. Implementation steps (ordered) + +### Step 1 — Extract `ConcurrentOperation` base class +**New file:** `Internal/ConcurrentOperation.cs` + +Move the reusable core out of `Internal/ParallelOperation.cs` (currently +lines 70–637) into an abstract base `ConcurrentOperation<T> : DurableOperation<IBatchResult<T>>`: + +- `StartAsync` — sync-flush parent CONTEXT START (using `ParentSubType`), then `ExecuteItemsAsync`. +- `ReplayAsync` — the 4-way status dispatch (Succeeded → reconstruct; Failed → reconstruct + throw via `BuildException`; Started/Pending → re-execute; else `NonDeterministicExecutionException`). +- `ExecuteItemsAsync` — the full dispatch loop: `SemaphoreSlim` concurrency, the + orphan-task-safe `try/finally` that awaits all in-flight tasks before disposing + the semaphore, short-circuit checks, completion-reason computation, parent + checkpoint, throw-on-tolerance-exceeded. +- `RunUnitAsync(index, ...)` — wraps one unit in a `ChildContextOperation` + (child ID = `Hash($"{OperationId}-{index+1}")`, subtype = `ChildSubType`), + with the existing per-branch exception capture (ChildContextException → Failed + slot; structural DurableExecutionException → rethrow; OCE handling).
+- `ShouldStopDispatching`, `ComputeCompletionReason`, `BranchOutcome` struct, + wire (de)serialization helpers, `DeserializeResult`, `CheckpointParentResultAsync`, + `ReconstructFromCheckpoints` — all move down. + +**Abstract/virtual hooks subclasses implement:** +```csharp +protected abstract int UnitCount { get; } +protected abstract string ParentSubType { get; } // OperationSubTypes.Parallel / .Map +protected abstract string ChildSubType { get; } // .ParallelBranch / .MapItem +protected abstract (string? name, Func<IDurableContext, Task<T>> func) GetUnit(int index); +protected abstract DurableExecutionException BuildException(IBatchResult<T> result); +``` + +`ParallelOperation` then shrinks to: store `branches`, return +`OperationSubTypes.Parallel`/`ParallelBranch`, `GetUnit(i)` → `(branches[i].Name, branches[i].Func)`, +`BuildException` → `ParallelException`. **Existing 193 tests are the regression net.** + +### Step 2 — Operation subtype constants +**Edit:** `Operation.cs` → add to `OperationSubTypes`: +```csharp +public const string Map = "Map"; +public const string MapItem = "MapItem"; +``` + +### Step 3 — Generalize the checkpoint summary +**Edit:** `Internal/ParallelSummary.cs` → rename to shared `BatchSummary` / +`BatchUnitSummary` (or keep names, just broaden the doc comment). The shape +(`CompletionReason` + `[{Index, Name, Status}]`) is identical for both. +**Edit:** `Internal/ParallelJsonContext.cs` → rename to `BatchJsonContext` (one +shared source-gen context). Both subclasses use it via the base. Keeps a single +wire format and avoids drift. + +> Note: REVIEW.md issue #3 — `ParallelBranchSummary.OperationId` is dead. While +> generalizing, drop that field (smaller checkpoints) since reconstruction +> recomputes the ID by index. Confirm nothing still reads it before removing. + +### Step 4 — `MapConfig` + `MapException` +**New file:** `MapConfig.cs` — mirrors `ParallelConfig`: +- `int? MaxConcurrency` with `<= 0` rejection (same setter as ParallelConfig).
+- `CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted();` ← **the key difference**. +- `NestingType NestingType { get; set; } = NestingType.Nested;` (Flat throws `NotSupportedException` at run). +- `Func? ItemNamer { get; set; }` — receives `(item, index)`, returns the item's branch name. **No `ItemBatcher`.** +- XML doc frames the permissive default as Python/Java parity: "Map collects all results by default; pass `CompletionConfig.AllSuccessful()` for fail-fast." + +**Edit:** `DurableExecutionException.cs` — add `MapException : DurableExecutionException` +mirroring `ParallelException` (carries `IBatchResult? Result`, `CompletionReason`, +the three ctors). Lets `catch` distinguish Map from Parallel failures. + +### Step 5 — `MapOperation` +**New file:** `Internal/MapOperation.cs` — `: ConcurrentOperation`. +Holds `items`, `func` (`Func, Task>`), +and `ItemNamer`. Implements the hooks: +- `UnitCount => items.Count` +- `ParentSubType => OperationSubTypes.Map`, `ChildSubType => OperationSubTypes.MapItem` +- `GetUnit(i)` → name = `ItemNamer?.Invoke(items[i]!, i) ?? i.ToString(InvariantCulture)`; + func = `ctx => func(ctx, items[i], i, items)` +- `BuildException` → `MapException` + +~60 lines. + +### Step 6 — Wire into the context +**Edit:** `IDurableContext.cs` — add the single `MapAsync` overload +(exact design-doc signature) with XML docs mirroring the `ParallelAsync` style. + +**Edit:** `DurableContext.cs` — add `MapAsync` + private `RunMap` +(mirrors `RunParallel`, lines 206–240): null/empty-arg validation, `Flat` guard, +serializer fetch, construct `MapOperation`, `ExecuteAsync`. Empty `items` → empty +`BatchResult` with `AllCompleted` (handled naturally by the base when `UnitCount == 0`). + +### Step 7 — Tests +**New file:** `test/.../MapOperationTests.cs` — mirror `ParallelOperationTests.cs` +(same `CreateContext` harness with `TestLambdaContext` + `DefaultLambdaJsonSerializer` ++ `RecordingBatcher`). 
Cover: +- Happy path (all items succeed, results in index order). +- Per-item failure capture under default `AllCompleted()` → **no throw**, failure in `IBatchResult.Failed`. +- `AllSuccessful()` override → one failure throws `MapException`. +- `ItemNamer` produces expected `IBatchItem.Name`; default naming = index. +- Empty collection → empty result, `AllCompleted`, no parent throw. +- `MaxConcurrency` (incl. the `>= count` no-semaphore optimization). +- `FirstSuccessful()` / `MinSuccessful` short-circuit → unfinished items = `Started`. +- Replay determinism: two fresh runs → identical item operation IDs. +- Replay from parent=SUCCEEDED → reconstruct results from child checkpoints. +- Mixed-status replay (some SUCCEEDED, some STARTED in summary). + +**New (DONE):** integration `test/.../IntegrationTests/TestFunctions/Map*` + +matching `Map*Test.cs`, mirroring the `Parallel*` set: HappyPath, PartialFailure +(permissive-default, the headline Map-vs-Parallel difference), FailureTolerance +(asserts `MapException`), FirstSuccessful, MaxConcurrency, ReplayDeterminism. All +6 function projects and the IntegrationTests assembly compile; the tests deploy +real Lambdas and require live AWS credentials to run. + +Re-run the **full suite on net8.0 + net10.0** to confirm the Step 1 base +extraction did not regress Parallel. + +### Step 8 — Documentation cleanup +**Edit:** `Docs/durable-execution-design.md`: +- Remove all `ItemBatcher` / `Batcher` references: the `MapConfig` block + (~lines 1369–1399), the cross-SDK "Item batching" row (~line 2132), and any + pipeline example using a batcher. +- Correct the `MapConfig.CompletionConfig` default in the doc to `AllCompleted()`. +- Note the (intentional) Parallel `AllSuccessful` vs Map `AllCompleted` default split. + +**Edit (optional):** annotate `DESIGN-QUESTIONS.md` stale bits (the +`ICheckpointSerializer` 4-overload section and any `ItemBatcher` mention) so the +record stays accurate. + +--- + +## 4. 
Intentional divergences (documented, not bugs) + +1. **Map default `AllCompleted()` vs Parallel default `AllSuccessful()`** — each + follows its own reference-SDK majority (Map: Python+Java; Parallel: Python+JS). +2. **One `MapAsync` overload** — superseded the stale 4-overload AOT design. +3. **`MapException`** is its own type (not reused `ParallelException`) so callers + can pattern-match the operation that failed. +4. **No `ItemBatcher`** — does not exist in JS/Java; inert in Python. + +--- + +## 5. File change checklist + +**New:** +- `Internal/ConcurrentOperation.cs` +- `Internal/MapOperation.cs` +- `MapConfig.cs` +- `test/.../MapOperationTests.cs` +- `test/.../IntegrationTests/TestFunctions/Map*` (×~6) + +**Edited:** +- `Internal/ParallelOperation.cs` (slimmed to subclass) +- `Internal/ParallelSummary.cs` → shared `BatchSummary` +- `Internal/ParallelJsonContext.cs` → shared `BatchJsonContext` +- `Operation.cs` (+2 subtype constants) +- `DurableExecutionException.cs` (+`MapException`) +- `IDurableContext.cs` (+`MapAsync` overload + docs) +- `DurableContext.cs` (+`MapAsync` + `RunMap`) +- `Docs/durable-execution-design.md` (remove ItemBatcher, fix default) From 3e13b76a1842d73b5aee6a414969940d7203ad3c Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 16:00:01 -0400 Subject: [PATCH 06/21] change file --- .autover/changes/durable-mapasync.json | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .autover/changes/durable-mapasync.json diff --git a/.autover/changes/durable-mapasync.json b/.autover/changes/durable-mapasync.json new file mode 100644 index 000000000..412e09055 --- /dev/null +++ b/.autover/changes/durable-mapasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `MapAsync` to `IDurableContext` for processing a collection in parallel with one child context per item and automatic checkpointing. 
Supports configurable max concurrency, completion policy, and per-item naming via `MapConfig`, returning an `IBatchResult`." + ] + } + ] +} From b9fa51a7756290bfa48b60cd2a51642f76371dd1 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 17:14:40 -0400 Subject: [PATCH 07/21] Flat --- .../DurableContext.cs | 39 ++-- .../Internal/BatchJsonContext.cs | 1 + .../Internal/BatchSummary.cs | 27 ++- .../Internal/ChildContextOperation.cs | 92 ++++++--- .../Internal/ConcurrentOperation.cs | 89 +++++++- .../Internal/MapOperation.cs | 5 +- .../Internal/OperationIdGenerator.cs | 40 +++- .../Internal/ParallelOperation.cs | 5 +- .../MapConfig.cs | 6 +- .../NestingType.cs | 19 +- .../ParallelConfig.cs | 7 +- .../MapOperationTests.cs | 103 ++++++++- .../ParallelOperationTests.cs | 195 +++++++++++++++++- 13 files changed, 536 insertions(+), 92 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs index 6a271e670..54e30754a 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -219,12 +219,6 @@ private Task> RunParallel( } var effectiveConfig = config ?? new ParallelConfig(); - if (effectiveConfig.NestingType == NestingType.Flat) - { - throw new NotSupportedException( - "NestingType.Flat is not yet supported in the .NET Durable Execution SDK. " + - "Use NestingType.Nested (the default) for now."); - } var serializer = LambdaContext.Serializer ?? throw new InvalidOperationException( @@ -258,12 +252,6 @@ private Task> RunMap( if (func == null) throw new ArgumentNullException(nameof(func)); var effectiveConfig = config ?? new MapConfig(); - if (effectiveConfig.NestingType == NestingType.Flat) - { - throw new NotSupportedException( - "NestingType.Flat is not yet supported in the .NET Durable Execution SDK. 
" + - "Use NestingType.Nested (the default) for now."); - } var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); @@ -495,10 +483,31 @@ private Task RunInvoke( /// so its operation IDs are /// deterministically namespaced under the parent op ID. /// - private Func MakeChildFactory() + /// + /// Builds the factory each operation uses to create the inner + /// its user function runs against. + /// + /// + /// The delegate takes (operationId, reportedParentId, isVirtual): + /// + /// isVirtual == false (the default child-context case): the + /// inner context's ID space and reported parent both root at + /// operationId via ; + /// reportedParentId is ignored. + /// isVirtual == true (a branch): + /// inner-op IDs still root at operationId (so sibling branches + /// never collide), but inner ops report reportedParentId — the + /// parallel/map operation — as their parent, since the virtual branch + /// emits no CONTEXT checkpoint to reference. + /// + /// + private Func MakeChildFactory() { - return parentOpId => new DurableContext( - _state, _terminationManager, _idGenerator.CreateChild(parentOpId), + return (operationId, reportedParentId, isVirtual) => new DurableContext( + _state, _terminationManager, + isVirtual + ? 
_idGenerator.CreateVirtualChild(operationId, reportedParentId) + : _idGenerator.CreateChild(operationId), _durableExecutionArn, LambdaContext, _batcher); } } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs index d2bfeb32f..db97f02c1 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs @@ -10,6 +10,7 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// [JsonSerializable(typeof(BatchSummary))] [JsonSerializable(typeof(BatchUnitSummary))] +[JsonSerializable(typeof(ErrorObject))] internal sealed partial class BatchJsonContext : JsonSerializerContext { } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs index 1e58e9654..b118ce558 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs @@ -8,9 +8,17 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// Shared by both and /// : carries the completion reason and /// the per-unit index → status map so the can be -/// rebuilt without depending on user T shape — per-unit results live on the -/// children's own checkpoints. +/// rebuilt without depending on user T shape. /// +/// +/// Under per-unit results live on the children's +/// own CONTEXT checkpoints and only (plus +/// index/name) is recorded here. Under the +/// children emit no checkpoint, so each unit's serialized result +/// () or error +/// () is recorded inline here and read back +/// on replay. +/// internal sealed class BatchSummary { [JsonPropertyName("CompletionReason")] @@ -30,4 +38,19 @@ internal sealed class BatchUnitSummary [JsonPropertyName("Status")] public string? 
Status { get; set; } + + /// + /// Serialized per-unit result, recorded inline only for + /// succeeded units (where no child checkpoint + /// exists to read it from). null under . + /// + [JsonPropertyName("Result")] + public string? Result { get; set; } + + /// + /// Per-unit error, recorded inline only for + /// failed units. null under . + /// + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs index a0abbf99e..7c2427053 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -41,7 +41,8 @@ internal sealed class ChildContextOperation : DurableOperation private readonly Func> _func; private readonly ChildContextConfig? _config; private readonly ILambdaSerializer _serializer; - private readonly Func _childContextFactory; + private readonly Func _childContextFactory; + private readonly bool _isVirtual; public ChildContextOperation( string operationId, @@ -50,35 +51,45 @@ public ChildContextOperation( Func> func, ChildContextConfig? config, ILambdaSerializer serializer, - Func childContextFactory, + Func childContextFactory, ExecutionState state, TerminationManager termination, string durableExecutionArn, - CheckpointBatcher? batcher = null) + CheckpointBatcher? 
batcher = null, + bool isVirtual = false) : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) { _func = func; _config = config; _serializer = serializer; _childContextFactory = childContextFactory; + _isVirtual = isVirtual; } protected override string OperationType => OperationTypes.Context; protected override async Task StartAsync(CancellationToken cancellationToken) { - // Sync-flush CONTEXT START before user code so the service has a record - // of the parent context if the inner func suspends (e.g. a Wait inside - // the child terminates the workflow before SUCCEED is reached). - await EnqueueAsync(new SdkOperationUpdate + // Virtual (NestingType.Flat) branches emit no CONTEXT checkpoint of their + // own — the parallel/map orchestrator records their outcome inline on the + // parent payload. Inner operations still checkpoint (re-parented to the + // non-virtual ancestor via the virtual child generator's reported + // ParentId), so a suspend inside a virtual branch is still recoverable. + if (!_isVirtual) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.START, - SubType = _config?.SubType, - Name = Name - }, cancellationToken); + // Sync-flush CONTEXT START before user code so the service has a record + // of the parent context if the inner func suspends (e.g. a Wait inside + // the child terminates the workflow before SUCCEED is reached). 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = _config?.SubType, + Name = Name + }, cancellationToken); + } return await ExecuteFunc(cancellationToken); } @@ -114,7 +125,11 @@ private async Task ExecuteFunc(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - var childContext = _childContextFactory(OperationId); + // For a virtual (Flat) branch, inner operations report this branch's own + // ParentId — the non-virtual parallel/map ancestor — since the branch + // itself emits no CONTEXT checkpoint to reference. For a normal child + // context the reported parent is ignored (it roots at OperationId). + var childContext = _childContextFactory(OperationId, ParentId, _isVirtual); T result; try @@ -144,16 +159,22 @@ private async Task ExecuteFunc(CancellationToken cancellationToken) } catch (Exception ex) { - await EnqueueAsync(new SdkOperationUpdate + // Virtual branches suppress the FAIL checkpoint but still propagate + // the exception — the orchestrator records the failure inline on the + // parent payload. + if (!_isVirtual) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.FAIL, - SubType = _config?.SubType, - Name = Name, - Error = ToSdkError(ex) - }, cancellationToken); + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.FAIL, + SubType = _config?.SubType, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + } throw MapFailureException(new ChildContextException(ex.Message, ex) { @@ -163,16 +184,21 @@ await EnqueueAsync(new SdkOperationUpdate }); } - await EnqueueAsync(new SdkOperationUpdate + // Virtual branches suppress the SUCCEED checkpoint; the orchestrator + // serializes the result inline on the parent payload instead. 
+ if (!_isVirtual) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.SUCCEED, - SubType = _config?.SubType, - Name = Name, - Payload = SerializeResult(result) - }, cancellationToken); + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.SUCCEED, + SubType = _config?.SubType, + Name = Name, + Payload = SerializeResult(result) + }, cancellationToken); + } return result; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs index 9c28dc6f6..2a9c9bf37 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -48,11 +48,20 @@ internal abstract class ConcurrentOperation : DurableOperation + /// True for : per-unit child contexts emit no + /// CONTEXT checkpoint, so their results/errors are recorded inline on this + /// parent operation's payload and read back from + /// there on replay. + /// + private readonly bool _isVirtual; + /// Serializer used to deserialize per-unit child results on replay. protected readonly ILambdaSerializer Serializer; - /// Factory used to build each unit's inner child context. - protected readonly Func ChildContextFactory; + /// Factory used to build each unit's inner child context. Takes + /// (operationId, reportedParentId, isVirtual). + protected readonly Func ChildContextFactory; protected ConcurrentOperation( string operationId, @@ -61,17 +70,19 @@ protected ConcurrentOperation( CompletionConfig completionConfig, int? maxConcurrency, ILambdaSerializer serializer, - Func childContextFactory, + Func childContextFactory, ExecutionState state, TerminationManager termination, string durableExecutionArn, - CheckpointBatcher? batcher = null) + CheckpointBatcher? 
batcher = null, + bool isVirtual = false) : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) { _completionConfig = completionConfig; _maxConcurrency = maxConcurrency; Serializer = serializer; ChildContextFactory = childContextFactory; + _isVirtual = isVirtual; } protected override string OperationType => OperationTypes.Context; @@ -359,7 +370,8 @@ private async Task RunUnitAsync( State, Termination, DurableExecutionArn, - Batcher); + Batcher, + isVirtual: _isVirtual); try { @@ -516,17 +528,41 @@ private async Task CheckpointParentResultAsync( for (var i = 0; i < result.All.Count; i++) { var item = result.All[i]; - summary.Units.Add(new BatchUnitSummary + var unit = new BatchUnitSummary { Index = item.Index, Name = item.Name, Status = SerializeStatus(item.Status) - }); + }; + + // Flat (virtual) units emit no child checkpoint, so their per-unit + // result/error has nowhere to live except inline on this summary. + // Nested units leave these null — they're read from each child's own + // CONTEXT checkpoint on replay. + if (_isVirtual) + { + if (item.Status == BatchItemStatus.Succeeded) + { + unit.Result = SerializeResult(item.Result); + } + else if (item.Status == BatchItemStatus.Failed && item.Error != null) + { + unit.Error = ErrorObject.FromException(item.Error); + } + } + + summary.Units.Add(unit); } var payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); var failed = failureException != null; + // On FAIL, Nested operations omit the payload because replay rebuilds + // per-unit outcomes from the children's own checkpoints. Flat operations + // have no child checkpoints, so the summary (carrying inline results and + // errors) must be persisted even on FAIL for replay to reconstruct it. + var payloadOnFail = _isVirtual; + await EnqueueAsync(new SdkOperationUpdate { Id = OperationId, @@ -534,7 +570,7 @@ await EnqueueAsync(new SdkOperationUpdate Action = failed ? 
OperationAction.FAIL : OperationAction.SUCCEED, SubType = ParentSubType, Name = Name, - Payload = failed ? null : payload, + Payload = failed && !payloadOnFail ? null : payload, Error = failed ? BuildAggregateError(result, failureException!) : null }, cancellationToken); } @@ -573,7 +609,29 @@ private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwO T? unitResult = default; DurableExecutionException? unitError = null; - if (status == BatchItemStatus.Succeeded && childOp?.ContextDetails?.Result != null) + // Flat (virtual) units have no child checkpoint — their result/error + // was recorded inline on this summary. Nested units read from the + // child's own CONTEXT checkpoint. A unit is "inline" when the summary + // entry carries a Result/Error, which only Flat writes. + if (_isVirtual && summaryEntry != null) + { + if (status == BatchItemStatus.Succeeded && summaryEntry.Result != null) + { + unitResult = DeserializeResult(summaryEntry.Result); + } + else if (status == BatchItemStatus.Failed && summaryEntry.Error != null) + { + var err = summaryEntry.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + } + else if (status == BatchItemStatus.Succeeded && childOp?.ContextDetails?.Result != null) { unitResult = DeserializeResult(childOp.ContextDetails.Result); } @@ -688,6 +746,19 @@ private T DeserializeResult(string serialized) return Serializer.Deserialize(ms); } + /// + /// Serializes a per-unit result for inline storage in the + /// (Flat units only). Mirrors the SUCCEED-payload + /// serialization a Nested unit's would + /// have written to its own checkpoint. + /// + private string SerializeResult(T? 
value) + { + using var ms = new MemoryStream(); + Serializer.Serialize(value!, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + /// /// Internal scratch space tracking each unit's outcome as it lands in the /// executor; copied into the user-facing once every diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs index 14df87c15..ed23ba950 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs @@ -31,13 +31,14 @@ public MapOperation( Func, Task> func, MapConfig config, ILambdaSerializer serializer, - Func childContextFactory, + Func childContextFactory, ExecutionState state, TerminationManager termination, string durableExecutionArn, CheckpointBatcher? batcher = null) : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, - serializer, childContextFactory, state, termination, durableExecutionArn, batcher) + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) { _items = items; _func = func; diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs index bbfd3c59d..bd74e6da5 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs @@ -35,10 +35,33 @@ public OperationIdGenerator() /// hash("<parentHash>-1"), hash("<parentHash>-2"), etc. /// public OperationIdGenerator(string? 
parentId) + : this(idPrefix: parentId, reportedParentId: parentId) + { + } + + /// + /// Creates a child generator that decouples the hash prefix used to derive + /// inner-operation IDs from the reported on those + /// operations' wire OperationUpdate.ParentId. + /// + /// + /// Prefix hashed into inner-operation IDs (hash("<idPrefix>-1"), ...). + /// Always the owning context's own operation ID, so two sibling branches + /// never collide on inner IDs. + /// + /// + /// The parent operation ID stamped on inner operations. For a normal + /// (non-virtual) context this equals . For a + /// branch — a "virtual" context that emits no + /// CONTEXT checkpoint of its own — this is the nearest non-virtual ancestor + /// (the parallel/map operation), so inner operations re-parent past the + /// branch to an operation that actually exists in the checkpoint store. + /// + private OperationIdGenerator(string? idPrefix, string? reportedParentId) { _counter = 0; - ParentId = parentId; - _prefix = parentId != null ? parentId + "-" : string.Empty; + ParentId = reportedParentId; + _prefix = idPrefix != null ? idPrefix + "-" : string.Empty; } /// @@ -85,6 +108,19 @@ public OperationIdGenerator CreateChild(string operationId) return new OperationIdGenerator(operationId); } + /// + /// Creates a child generator for a branch — a + /// "virtual" context. Inner-operation IDs are still derived from + /// (so sibling branches don't collide), but + /// the IDs are reported under (the + /// nearest non-virtual ancestor) because the virtual branch emits no CONTEXT + /// checkpoint that inner operations could reference as their parent. + /// + public OperationIdGenerator CreateVirtualChild(string operationId, string? reportedParentId) + { + return new OperationIdGenerator(idPrefix: operationId, reportedParentId: reportedParentId); + } + /// /// Resets the counter (used for testing only). Not safe to call concurrently /// with ; tests must quiesce before resetting. 
diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs index 8eff97668..08b7d1781 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -25,13 +25,14 @@ public ParallelOperation( IReadOnlyList> branches, ParallelConfig config, ILambdaSerializer serializer, - Func childContextFactory, + Func childContextFactory, ExecutionState state, TerminationManager termination, string durableExecutionArn, CheckpointBatcher? batcher = null) : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, - serializer, childContextFactory, state, termination, durableExecutionArn, batcher) + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) { _branches = branches; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs index 967e5d17c..5b7c76e5f 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs @@ -58,9 +58,9 @@ public int? MaxConcurrency /// . /// /// - /// is not yet supported in the .NET SDK and - /// will throw when the map - /// operation is invoked. + /// Under each item runs in a virtual context + /// that emits no per-item CONTEXT checkpoint; per-item results and + /// errors are recorded inline on the map operation's payload instead. 
/// public NestingType NestingType { get; set; } = NestingType.Nested; diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs index ee2c15c96..a36c793e7 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs @@ -10,10 +10,9 @@ namespace Amazon.Lambda.DurableExecution; /// operation visible in execution traces. /// /// -/// is reserved for a forthcoming optimisation that uses -/// virtual contexts to reduce checkpoint volume by ~30%. The .NET SDK currently -/// throws when is -/// supplied; the enum value is kept stable so opting in becomes non-breaking. +/// uses virtual contexts to reduce checkpoint volume (no +/// per-branch CONTEXT operation): each branch's result or error is +/// recorded inline on the parent parallel/map operation's payload instead. /// /// public enum NestingType @@ -26,12 +25,12 @@ public enum NestingType Nested, /// - /// Branches use virtual contexts sharing the parent. Reduces checkpoint - /// cost at the expense of less granular execution traces. + /// Branches run in virtual contexts that emit no CONTEXT checkpoint + /// of their own — per-branch results/errors are recorded inline on the + /// parent operation's payload. Reduces checkpoint cost at the expense of + /// less granular execution traces. Branch operations inside a flat branch + /// (steps, waits) still checkpoint, re-parented to the parallel/map + /// operation. /// - /// - /// Not yet implemented in the .NET SDK; passing this value throws - /// . - /// Flat } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs index d40f09daf..bcc17f181 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs @@ -49,9 +49,10 @@ public int? 
MaxConcurrency /// . /// /// - /// is not yet supported in the .NET SDK and - /// will throw when the parallel - /// operation is invoked. + /// Under each branch runs in a virtual + /// context that emits no per-branch CONTEXT checkpoint; per-branch + /// results and errors are recorded inline on the parallel operation's + /// payload instead. /// public NestingType NestingType { get; set; } = NestingType.Nested; } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs index e67345760..0e796e7a0 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs @@ -385,15 +385,104 @@ public void MapConfig_DefaultCompletionConfig_IsAllCompleted() // ────────────────────────────────────────────────────────────────────── [Fact] - public async Task MapAsync_NestingTypeFlat_ThrowsNotSupported() + public async Task MapAsync_NestingTypeFlat_SuppressesPerItemContextOps() { - var (context, _, _, _) = CreateContext(); + var (context, recorder, _, _) = CreateContext(); - await Assert.ThrowsAsync(() => - context.MapAsync( - new[] { 1 }, - async (ctx, item, index, all) => { await Task.Yield(); return item; }, - config: new MapConfig { NestingType = NestingType.Flat })); + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item * 10; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Map CONTEXT ops still emitted; no per-item CONTEXT ops under Flat. 
+ var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Map") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "MapItem")); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_InnerOpsReparentToMapOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + await ctx.StepAsync(async (_) => { await Task.Yield(); return item * 10; }), + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var item0Id = ChildIdAt(parentOpId, 1); + var item1Id = ChildIdAt(parentOpId, 2); + var step0Id = ChildIdAt(item0Id, 1); + var step1Id = ChildIdAt(item1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. + var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // Inner steps re-parent to the MAP op (nearest non-virtual ancestor). 
+ Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"10"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"20"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "doubler", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { executed = true; await Task.Yield(); return item * 999; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); } // ────────────────────────────────────────────────────────────────────── diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index 160a56e1f..efc06655c 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -442,14 +442,201 @@ public void ParallelConfig_MaxConcurrency_OutOfRange_Throws() // ────────────────────────────────────────────────────────────────────── [Fact] - public async Task ParallelAsync_NestingTypeFlat_ThrowsNotSupported() + public async Task 
ParallelAsync_NestingTypeFlat_SuppressesPerBranchContextOps() { - var (context, _, _, _) = CreateContext(); + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); return 20; }, + async (_) => { await Task.Yield(); return 30; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Parallel CONTEXT ops are still emitted (the parent is never + // virtual)... + var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Parallel") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + // ...but NO per-branch CONTEXT ops are emitted under Flat. + var branchOps = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch") + .ToArray(); + Assert.Empty(branchOps); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_InnerOpsReparentToParallelOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync( + new Func>[] + { + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 1; }), + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 2; }), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var branch0Id = ChildIdAt(parentOpId, 1); + var branch1Id = ChildIdAt(parentOpId, 2); + + // Each branch's inner STEP is ID-derived from the branch op id (so the + // two branches' first steps don't collide)... 
+ var step0Id = ChildIdAt(branch0Id, 1); + var step1Id = ChildIdAt(branch1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. + var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // ...but each inner step re-parents to the PARALLEL op (the nearest + // non-virtual ancestor), NOT to the virtual branch (which emitted no + // checkpoint to reference). + Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_PartialFailure_SurfacesInlineErrors() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("flat boom"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + Assert.Contains("flat boom", result.GetErrors()[0].Message); - await Assert.ThrowsAsync(() => + await recorder.Batcher.DrainAsync(); + + // The parent SUCCEED payload carries the inline per-unit results/errors; + // no per-branch FAIL op was emitted. 
+ Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "ParallelBranch")); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + // Flat replay reads per-unit results from the inline summary payload — + // there are NO per-branch child CONTEXT ops in state. + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"100"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplayFailed_ThrowsWithInlineError() + { + var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED","Error":{"ErrorType":"System.InvalidOperationException","ErrorMessage":"flat branch 0 failed"}}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = 
new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => context.ParallelAsync( - new Func>[] { async (_) => { await Task.Yield(); return 1; } }, + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout", config: new ParallelConfig { NestingType = NestingType.Flat })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = (IBatchResult)ex.Result!; + Assert.Equal(1, typed.FailureCount); + Assert.Contains("flat branch 0 failed", typed.GetErrors()[0].Message); } // ────────────────────────────────────────────────────────────────────── From 5f547fb6087fa554e43454e38d40955b9b94765b Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 18:11:56 -0400 Subject: [PATCH 08/21] add it tests --- .../MapFlatNestingTest.cs | 126 ++++++++++++++++ .../ParallelFlatNestingTest.cs | 135 ++++++++++++++++++ .../MapFlatNestingFunction/Dockerfile | 7 + .../MapFlatNestingFunction/Function.cs | 57 ++++++++ .../MapFlatNestingFunction.csproj | 18 +++ .../ParallelFlatNestingFunction/Dockerfile | 7 + .../ParallelFlatNestingFunction/Function.cs | 61 ++++++++ .../ParallelFlatNestingFunction.csproj | 18 +++ 8 files changed, 429 insertions(+) create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs create mode 100644 
Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs new file mode 100644 index 000000000..b1c3f1e1a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs @@ -0,0 +1,126 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFlatNestingTest +{ + private readonly ITestOutputHelper _output; + public MapFlatNestingTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic operation ID the SDK assigns. Item op ids are + /// SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way under + /// the item op id. Reproduced locally because OperationIdGenerator is internal + /// to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end map: three items, each with a + /// step + a durable wait (the wait forces a suspend/resume cycle so the map + /// actually replays). 
Verifies the Flat-specific contract against the real + /// durable-execution service: + /// 1. NO per-item CONTEXT events are emitted — only the parent Map CONTEXT. + /// 2. Each item's inner step/wait ops RE-PARENT to the Map op (the nearest + /// non-virtual ancestor), since the virtual item emits no CONTEXT + /// checkpoint to reference as a parent. + /// 3. Inner-op ids are still derived from the item op id space. + /// 4. The per-item result survives replay (read back from the inline parent + /// payload, not a per-item checkpoint). + /// + [Fact] + public async Task Map_Flat_SuppressesItemContexts_AndReparentsInnerOps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFlatNestingFunction"), + "mflat", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "mf1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The map parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var itemOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each item's "generate" step is the 1st inner op under that item's own + // id space: SHA256("-1"). + var expectedStepIds = itemOpIds.Select(i => HashOpId($"{i}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three items' inner step + // + wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? 
new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Exactly ONE CONTEXT operation exists — the parent Map op. No per-item + // CONTEXT events under Flat. + var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && itemOpIds.Contains(e.Id))); + + // 2. Each item's "generate" step re-parents to the Map op (NOT to its + // virtual item op). + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + // 3. ...but the step ids are still derived from the per-item id space, so + // the three items' first steps are distinct and match the expected + // SHA256("-1") values. + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened with no per-item checkpoint. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response carries the joined per-item results. 
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs new file mode 100644 index 000000000..0f3450aa2 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs @@ -0,0 +1,135 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFlatNestingTest +{ + private readonly ITestOutputHelper _output; + public ParallelFlatNestingTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic operation ID the SDK assigns. Branch op ids + /// are SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way + /// under the branch op id. Reproduced locally because OperationIdGenerator is + /// internal to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end parallel: three branches, each + /// with a step + a durable wait (the wait forces a suspend/resume cycle so the + /// parallel actually replays). Verifies the Flat-specific contract against the + /// real durable-execution service: + /// 1. NO per-branch CONTEXT events are emitted — only the parent Parallel + /// CONTEXT. (Under Nested there would be 4 ContextStarted; under Flat, + /// exactly 1.) + /// 2. 
Each branch's inner step/wait ops RE-PARENT to the Parallel op (the + /// nearest non-virtual ancestor), since the virtual branch emits no + /// CONTEXT checkpoint to reference as a parent. + /// 3. Inner-op ids are still derived from the branch op id (so the two + /// branches' first steps don't collide), even though they report the + /// Parallel op as parent. + /// 4. The per-branch result survives replay (the GUID generated inside + /// generate is preserved across suspend/resume — read back from the + /// inline parent payload, not a per-branch checkpoint). + /// + [Fact] + public async Task Parallel_Flat_SuppressesBranchContexts_AndReparentsInnerOps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFlatNestingFunction"), + "pflat", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "pf1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var branchOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each branch's "generate" step is the 1st inner op under that branch's + // own id space: SHA256("-1"). + var expectedStepIds = branchOpIds.Select(b => HashOpId($"{b}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three branches' inner + // step + wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? 
new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Exactly ONE CONTEXT operation exists — the parent Parallel op. No + // per-branch CONTEXT events under Flat. + var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && branchOpIds.Contains(e.Id))); + + // 2. Each branch's "generate" step re-parents to the Parallel op (NOT to + // its virtual branch op). + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + // 3. ...but the step ids are still derived from the per-branch id space, + // so the three branches' first steps are distinct and match the expected + // SHA256("-1") values. + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. The "generate" step succeeded exactly once per branch — proving + // replay returned the cached result rather than re-executing. + Assert.Equal(3, generateSteps.Count); + + // 5. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened with no per-branch checkpoint. 
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 6. The user-visible response carries the joined per-branch results. + Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs new file mode 100644 index 000000000..9cd54aaba --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + 
DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items run under NestingType.Flat. Each item generates a fresh + // GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-item result — and under Flat that result lives inline on the parent + // Map payload, not on a per-item CONTEXT checkpoint (none are emitted). + // If Flat replay is broken, the GUID would change between the original + // execution and replay, or the inner step/wait ops would reference a + // non-existent item parent. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout", + config: new MapConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs new file mode 100644 index 000000000..dfbd6a345 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); 
+ using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches run under NestingType.Flat. Each branch generates a + // fresh GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-branch result — and under Flat that result lives inline on the + // parent Parallel payload, not on a per-branch CONTEXT checkpoint (none + // are emitted). If Flat replay is broken, the GUID would change between + // the original execution and replay, or the inner step/wait ops would + // reference a non-existent branch parent. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + From 26aabce3b2dff30423129906c85c000d396d622e Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 18:50:41 -0400 Subject: [PATCH 09/21] claude and update to net10 --- .../Amazon.Lambda.DurableExecution/CLAUDE.md | 151 ++++++++++++++++++ .../MapFlatNestingFunction.csproj | 2 +- .../ParallelFlatNestingFunction.csproj | 2 +- 3 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md new file mode 100644 index 000000000..b825300bd --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md @@ -0,0 +1,151 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What this is + +`Amazon.Lambda.DurableExecution` is the .NET SDK (preview, 0.x) for resilient, long-running AWS Lambda +workflows that checkpoint progress after each step and resume after failures or waits. A workflow can run +for up to ~1 year (the WAIT cap is 31,622,400 seconds) and is only billed for active compute. 
The SDK is +client-side glue: the *durable execution service* (part of Lambda) owns the checkpoint store, fires timers, +and re-invokes the function; this library re-derives in-memory workflow position from the checkpoint history +the service sends on each invocation. See sibling SDKs (Python/JS/Java) listed in `README.md` for the shared +model — this SDK deliberately mirrors their semantics. + +## Build & test + +Targets `net8.0;net10.0` (`DefaultPackageTargets` in `buildtools/common.props`). `TreatWarningsAsErrors` is on +everywhere, and the main library is `IsTrimmable` with the trim analyzer enabled — keep new code AOT/trim-clean. + +```bash +# Build the library (run from this directory) +dotnet build + +# Unit tests (fast, no AWS). Project: Libraries/test/Amazon.Lambda.DurableExecution.Tests +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj + +# A single test +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj \ + --filter "FullyQualifiedName~StepOperationTests" + +# Coverage report (requires reportgenerator tool) +../../test/Amazon.Lambda.DurableExecution.Tests/coverage.sh +``` + +Unit tests reach `internal` types via `InternalsVisibleTo` (declared in the `.csproj`). They use +`Amazon.Lambda.TestUtilities` (`TestLambdaContext`) and the real `SourceGeneratorLambdaJsonSerializer` — +set `TestLambdaContext.Serializer` so `LambdaSerializerHelper.GetRequired` finds one. + +### Integration tests (expensive, real AWS) + +`Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests` deploys real Lambdas. Each test builds a +`TestFunctions//` project into a container image via **`dotnet publish` + `docker build`**, pushes to ECR, +creates an IAM role + Lambda (`DurableFunctionDeployment`), invokes it, and tears everything down on dispose. +Requires Docker, AWS creds (us-east-1), and is slow. 
Every behavior in `docs/` should have a paired +integration test under that project. Prefix AWS commands with `unset AWS_PROFILE` to use `[default]` creds. + +**Run integration tests against `net10.0`.** The project multi-targets `net8.0;net10.0`; `dotnet test` +without a framework spins up one testhost per TFM and runs them concurrently, which races two processes on +the same `TestFunctions//` build dir. Pin the framework: + +```bash +dotnet test ../../test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj \ + -f net10.0 --filter "FullyQualifiedName~MultipleStepsTest" +``` + +## Architecture: the replay model + +This is the part you must understand before changing anything. Read these together: +`DurableFunction.cs`, `DurableExecutionHandler.cs`, `DurableContext.cs`, `Internal/DurableOperation.cs`, +`Internal/ExecutionState.cs`, `Internal/OperationIdGenerator.cs`, `Internal/TerminationManager.cs`. + +**Entry point.** The user's Lambda handler delegates to `DurableFunction.WrapAsync`, which: +hydrates `ExecutionState` from `invocationInput.InitialExecutionState` (paging the service via `NextMarker`), +extracts the user payload from the `EXECUTION`-type op, builds a `CheckpointBatcher` + `DurableContext`, runs +the workflow through `DurableExecutionHandler.RunAsync`, drains checkpoints, and maps the result to a +`DurableExecutionInvocationOutput` with status **Succeeded / Failed / Pending**. + +**Each operation runs the same workflow code every invocation.** There is no persisted program counter. +On re-invocation the user function executes from the top again; each durable call (`StepAsync`, `WaitAsync`, +etc.) looks up its own checkpoint and either replays the cached result or runs fresh. This is why workflow +code **must be deterministic** — same operations, same order, same names across deployments. + +**Deterministic operation IDs** (`OperationIdGenerator`). 
Each durable call gets an ID = SHA-256 of +`"-"`, where the counter is per-context and pre-incremented. The same workflow position +yields the same opaque ID across replays, so a checkpoint correlates to a call by *position*, not by name — +renaming a step does **not** break replay (the human name rides separately on `OperationUpdate.Name`). +Reordering or adding/removing calls *does* break it. `ValidateReplayConsistency` enforces this and throws +`NonDeterministicExecutionException` on type/name drift. + +**Suspension is implemented by never completing a Task** (`TerminationManager` + `DurableExecutionHandler`). +When an op must suspend (wait timer, scheduled retry, pending callback/invoke) it calls +`Termination.SuspendAndAwait()`, which trips a one-shot signal and returns a Task that *never resolves*. +`RunAsync` runs the user code via `Task.Run` and races it against `TerminationTask` with `Task.WhenAny`: +- user task wins → **Succeeded** (or **Failed** if it threw) +- termination wins → **Pending**; the abandoned user task is GC'd, checkpoints flush, the service fires the + timer and re-invokes. On replay the suspended op sees its now-terminal checkpoint and returns normally. + +**Operation classes** (`Internal/*Operation.cs`) all extend `DurableOperation`. The base's +`ExecuteAsync` does: `ValidateReplayConsistency` → `TrackReplay` → look up checkpoint → dispatch to +`StartAsync` (no prior checkpoint) or `ReplayAsync` (checkpoint exists). `StepOperation` is the canonical +example — read its class doc comment for the full status decision table (Succeeded→cached, Failed→rethrow, +Pending→re-suspend if retry timer hasn't fired, Started→crash-recovery under `AtMostOncePerRetry`, +Ready→run next attempt). `DurableContext` is a thin dispatcher: it allocates the op ID, pulls the serializer +off `ILambdaContext.Serializer`, constructs the right `*Operation`, and calls `ExecuteAsync`. + +**Checkpointing** (`CheckpointBatcher`). 
Outbound `OperationUpdate`s (START/SUCCEED/FAIL/RETRY) are enqueued +to a background channel worker that batches and flushes them via `LambdaDurableServiceClient` (which wraps +the `AWSSDK.Lambda` `Checkpoint`/`GetExecutionState` calls). `EnqueueAsync` awaits its batch's flush +(sync semantics); fire-and-forget callers (e.g. the START checkpoint under the default +`AtLeastOncePerRetry`) don't await but must observe the Task's exception. Flush errors become a terminal +error rethrown by the next `EnqueueAsync`/`DrainAsync`. `DurableFunction.IsTerminalCheckpointError` +classifies SDK errors on the final drain: 4xx (except 429 and stale-token) → **Failed** envelope; 429/5xx/ +network → let it escape so Lambda retries the whole invocation. + +**Replay-mode tracking** (`ExecutionState`). `IsReplaying` starts true iff any completed non-`EXECUTION` op +exists; `TrackReplay` decrements as each is visited and flips to false once the workflow catches up to the +frontier. `ReplayAwareLogger` uses this to suppress log lines emitted during replay so a 30-step workflow +re-invoked 30 times logs each line once — **always use `ctx.Logger`**, never `Console.WriteLine`. +`ExecutionState` is lock-guarded because the batcher worker thread and concurrent parallel/map branches all +touch it. + +### Operations surface (`IDurableContext`) + +`StepAsync` (checkpointed code + retries), `WaitAsync` (1s–~1yr timer), `RunInChildContextAsync` (isolated +sub-workflow checkpointed as one `CONTEXT` op), `CreateCallbackAsync` / `WaitForCallbackAsync` (external +events; `WaitForCallback` is *composed* from child-context + callback + submitter step — see +`DurableContext.RunWaitForCallback`), `InvokeAsync` (durable-to-durable chained invoke, qualified ARN +required), and `ParallelAsync` / `MapAsync` (concurrent branches → `IBatchResult`). + +**Nesting (`NestingType`)** matters for parallel/map. `Nested` (default) gives each branch a full `CONTEXT` +checkpoint. 
`Flat` runs branches in *virtual* contexts that emit no `CONTEXT` op — inner ops re-parent to the +parallel/map op via `OperationIdGenerator.CreateVirtualChild(operationId, reportedParentId)`, trading trace +granularity for fewer checkpoints. The `idPrefix` vs `reportedParentId` split is the subtle part: inner IDs +always derive from the branch's own op ID (so siblings never collide), but are *reported* under the nearest +non-virtual ancestor (so they reference a parent that actually exists in the checkpoint store). + +### Wire format (`Operation.cs`) + +`Operation` and its `*Details` types mirror the service envelope JSON exactly (`[JsonPropertyName]`). +String constants live in `OperationTypes` (STEP/WAIT/CALLBACK/CHAINED_INVOKE/CONTEXT/EXECUTION), +`OperationStatuses` (STARTED/SUCCEEDED/FAILED/PENDING/READY/CANCELLED/STOPPED/TIMED_OUT), and +`OperationSubTypes` (PascalCase finer classifier). Plural type names (`OperationTypes`, not `OperationType`) +intentionally avoid collision with `AWSSDK.Lambda` model enums. + +## Conventions + +- **Programming model:** preview supports only the *executable* model — `Main` builds a `LambdaBootstrap` + with a handler wrapper and an `ILambdaSerializer`. The serializer is read off `ILambdaContext.Serializer` + (a preview API; the project-wide `AWSLAMBDA001` suppression in the `.csproj` is intentional for that + reason). All step/result/payload (de)serialization flows through that one registered serializer, so AOT + and reflection callers share a single code path — there is no per-call `JsonSerializerContext` argument. +- **Errors:** durable exceptions carry `ErrorType`/`ErrorData`/`OriginalStackTrace` so a failure can be + reconstructed on replay when the live exception object is gone. 
`StepException`, `ChildContextException`, + `CallbackFailedException`/`CallbackTimeoutException`/`CallbackSubmitterException`, `ParallelException`, + `MapException`, and `NonDeterministicExecutionException` all derive from `DurableExecutionException`. + When adding error-mapping logic, handle *both* the fresh path (`InnerException` is the live exception) and + the replay path (`InnerException` is null, `ErrorType` carries the type string) — see + `DurableContext.MapWaitForCallbackException` for the pattern. +- **Public config types** (`StepConfig`, `WaitForCallbackConfig`, `ParallelConfig`, `MapConfig`, + `CompletionConfig`, etc.) are nullable optional args; resolve to an effective config inside the dispatcher. +- Inclusive language is enforced repo-wide (see the user's global rules): no master/slave, whitelist/blacklist. diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj index 6f5f657e4..f8bf7fd0c 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj @@ -1,7 +1,7 @@ - net8.0 + net10.0 Exe true bootstrap diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj index 6f5f657e4..f8bf7fd0c 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj +++ 
b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj @@ -1,7 +1,7 @@ - net8.0 + net10.0 Exe true bootstrap From 54a24e400c2c8e97c110a8fdbd0fe4340f2a1217 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 5 Jun 2026 18:59:19 -0400 Subject: [PATCH 10/21] Add autover change file for NestingType.Flat --- .../changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json diff --git a/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json new file mode 100644 index 000000000..42a1cec69 --- /dev/null +++ b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Minor", + "ChangelogMessages": [ + "Implement NestingType.Flat for ParallelAsync and MapAsync (previously threw NotSupportedException). Under Flat, each branch/item runs in a virtual context that emits no per-branch CONTEXT checkpoint; per-branch results and errors are recorded inline on the parent operation's payload, reducing checkpoint volume. Operations inside a flat branch (steps, waits) still checkpoint, re-parented to the parallel/map operation. NestingType.Nested remains the default." 
+ ] + } + ] +} From d54da3a3904dc12578b76a4a0b6017dfc12ea19d Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 11:19:34 -0400 Subject: [PATCH 11/21] feat(durable): add overflow threshold constants --- .../Internal/DurableConstants.cs | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs new file mode 100644 index 000000000..0aed925b2 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Size limits for durable-execution payload overflow handling. These are the +/// SDK's chosen overflow *trigger* thresholds for cross-SDK parity (Python/Java +/// use the same 256 KB), not the AWSSDK.Lambda hard field caps (those are 6 MB). +/// +internal static class DurableConstants +{ + /// + /// Serialized-payload byte length above which a concurrent/child-context + /// operation switches to the ReplayChildren overflow strategy: + /// strip the inline result from the checkpoint and reconstruct on replay by + /// re-executing the unit/child bodies. 256 KB (262,144 bytes). + /// + internal const int MaxOperationCheckpointBytes = 256 * 1024; + + /// + /// Serialized final-result byte length above which the orchestration response + /// must be checkpointed rather than returned inline (Lambda response limit, + /// minus a small envelope margin). Reserved for the final-response overflow + /// work (separate plan); defined here so all overflow limits live together. 
+ /// + internal const int MaxLambdaResponseBytes = 6 * 1024 * 1024 - 50; +} From 3bb75fa7eb38de5a8ebb036b58beb6af4c6c700a Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 11:21:54 -0400 Subject: [PATCH 12/21] feat(durable): plumb inbound ContextDetails.ReplayChildren --- .../Operation.cs | 8 ++++++++ .../Services/LambdaDurableServiceClient.cs | 6 +++++- .../LambdaDurableServiceClientTests.cs | 20 +++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs index ebe99ba27..c6fddcf92 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -143,6 +143,14 @@ public sealed class ContextDetails /// Error from the child context, if any. [JsonPropertyName("Error")] public ErrorObject? Error { get; set; } + + /// + /// When true on a completed CONTEXT operation, the operation's result + /// was too large to checkpoint inline; per-unit/child state is reconstructed + /// on replay by re-executing the children rather than read from this payload. + /// + [JsonPropertyName("ReplayChildren")] + public bool? ReplayChildren { get; set; } } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs index a38dda31b..d787a529b 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs @@ -161,7 +161,8 @@ private static Operation MapFromSdkOperation(SdkOperation sdkOp) ContextDetails = sdkOp.ContextDetails != null ? 
new ContextDetails { Result = sdkOp.ContextDetails.Result, - Error = MapError(sdkOp.ContextDetails.Error) + Error = MapError(sdkOp.ContextDetails.Error), + ReplayChildren = sdkOp.ContextDetails.ReplayChildren } : null, CallbackDetails = sdkOp.CallbackDetails != null ? new CallbackDetails { @@ -177,6 +178,9 @@ private static Operation MapFromSdkOperation(SdkOperation sdkOp) }; } + /// Test-only access to . + internal static Operation MapFromSdkOperationForTest(SdkOperation sdkOp) => MapFromSdkOperation(sdkOp); + /// /// Maps an SDK into the /// internal . Carries every field the wire object diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs index ab649f150..a49b8488e 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs @@ -382,6 +382,26 @@ public async Task GetExecutionStateAsync_MapFromSdkOperation_RoundTripsAllErrorF Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, invError.StackTrace!); } + [Fact] + public void MapFromSdkOperation_CopiesReplayChildren() + { + var sdkOp = new Amazon.Lambda.Model.Operation + { + Id = "ctx-1", + Type = "CONTEXT", + Status = "SUCCEEDED", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Result = "{}", + ReplayChildren = true + } + }; + + var mapped = LambdaDurableServiceClient.MapFromSdkOperationForTest(sdkOp); + + Assert.True(mapped.ContextDetails!.ReplayChildren); + } + [Fact] public async Task CheckpointAsync_ReturnsNewToken() { From 0b04a95da500084c933c88e3245aed4447d6f911 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 11:31:31 -0400 Subject: [PATCH 13/21] feat(durable): strip Flat batch summary + set ReplayChildren on overflow --- .../Internal/ConcurrentOperation.cs | 67 ++++++++++++------- 
.../ParallelOperationTests.cs | 36 ++++++++++ 2 files changed, 77 insertions(+), 26 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs index 2a9c9bf37..542f49f57 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -6,6 +6,7 @@ using System.Text.Json; using Amazon.Lambda; using Amazon.Lambda.Core; +using SdkContextOptions = Amazon.Lambda.Model.ContextOptions; using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; @@ -520,47 +521,58 @@ private async Task CheckpointParentResultAsync( DurableExecutionException? failureException, CancellationToken cancellationToken) { - var summary = new BatchSummary + // Local builder: includeInline=true writes per-unit Result/Error inline + // (Flat only); includeInline=false writes the minimal index/name/status + // map (the shape Nested always uses, and the Flat overflow fallback). + BatchSummary BuildSummary(bool includeInline) { - CompletionReason = SerializeCompletionReason(completionReason), - Units = new List(result.All.Count) - }; - for (var i = 0; i < result.All.Count; i++) - { - var item = result.All[i]; - var unit = new BatchUnitSummary + var s = new BatchSummary { - Index = item.Index, - Name = item.Name, - Status = SerializeStatus(item.Status) + CompletionReason = SerializeCompletionReason(completionReason), + Units = new List(result.All.Count) }; - - // Flat (virtual) units emit no child checkpoint, so their per-unit - // result/error has nowhere to live except inline on this summary. - // Nested units leave these null — they're read from each child's own - // CONTEXT checkpoint on replay. 
- if (_isVirtual) + for (var i = 0; i < result.All.Count; i++) { - if (item.Status == BatchItemStatus.Succeeded) + var item = result.All[i]; + var unit = new BatchUnitSummary { - unit.Result = SerializeResult(item.Result); - } - else if (item.Status == BatchItemStatus.Failed && item.Error != null) + Index = item.Index, + Name = item.Name, + Status = SerializeStatus(item.Status) + }; + if (includeInline && _isVirtual) { - unit.Error = ErrorObject.FromException(item.Error); + if (item.Status == BatchItemStatus.Succeeded) + unit.Result = SerializeResult(item.Result); + else if (item.Status == BatchItemStatus.Failed && item.Error != null) + unit.Error = ErrorObject.FromException(item.Error); } + s.Units.Add(unit); } - - summary.Units.Add(unit); + return s; } + var summary = BuildSummary(includeInline: true); var payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + + // Flat overflow: the inline per-unit results pushed the summary over the + // checkpoint limit. Re-emit a stripped summary (statuses only) and flag + // ReplayChildren so replay reconstructs the values by re-executing units. + var overflow = _isVirtual + && Encoding.UTF8.GetByteCount(payload) > DurableConstants.MaxOperationCheckpointBytes; + if (overflow) + { + summary = BuildSummary(includeInline: false); + payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + } + var failed = failureException != null; // On FAIL, Nested operations omit the payload because replay rebuilds // per-unit outcomes from the children's own checkpoints. Flat operations // have no child checkpoints, so the summary (carrying inline results and - // errors) must be persisted even on FAIL for replay to reconstruct it. + // errors, or the stripped status map under overflow) must be persisted + // even on FAIL for replay to reconstruct it. 
var payloadOnFail = _isVirtual; await EnqueueAsync(new SdkOperationUpdate @@ -571,7 +583,10 @@ await EnqueueAsync(new SdkOperationUpdate SubType = ParentSubType, Name = Name, Payload = failed && !payloadOnFail ? null : payload, - Error = failed ? BuildAggregateError(result, failureException!) : null + Error = failed ? BuildAggregateError(result, failureException!) : null, + ContextOptions = overflow + ? new SdkContextOptions { ReplayChildren = true } + : null }, cancellationToken); } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index efc06655c..1b75ac5b5 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -547,6 +547,42 @@ public async Task ParallelAsync_NestingTypeFlat_PartialFailure_SurfacesInlineErr o.Type == "CONTEXT" && o.SubType == "ParallelBranch")); } + [Fact] + public async Task ParallelAsync_Flat_ResultOverThreshold_StripsInlineResultsAndSetsReplayChildren() + { + var (context, recorder, _, _) = CreateContext(); + + // Each branch returns a ~200 KB string; the summary with both inline + // exceeds the 256 KB checkpoint threshold. + var big = new string('x', 200 * 1024); + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return big; }, + async (_) => { await Task.Yield(); return big; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // In-memory result for the current invoke still carries the full values. 
+ Assert.Equal(2, result.SuccessCount); + Assert.All(result.GetResults(), r => Assert.Equal(big, r)); + + await recorder.Batcher.DrainAsync(); + + var parentSucceed = recorder.Flushed.Single(o => + o.Type == "CONTEXT" && o.SubType == "Parallel" && o.Action == "SUCCEED"); + + // Overflow: ReplayChildren flag set, payload stripped under the threshold. + Assert.NotNull(parentSucceed.ContextOptions); + Assert.True(parentSucceed.ContextOptions.ReplayChildren); + Assert.True(System.Text.Encoding.UTF8.GetByteCount(parentSucceed.Payload) + <= Amazon.Lambda.DurableExecution.Internal.DurableConstants.MaxOperationCheckpointBytes); + // Stripped summary keeps statuses but not the big inline results. + Assert.DoesNotContain(big, parentSucceed.Payload); + Assert.Contains("SUCCEEDED", parentSucceed.Payload); + } + [Fact] public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() { From 29078298b810dc3d505b579d47cfbaf69ed94588 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 11:51:16 -0400 Subject: [PATCH 14/21] feat(durable): re-execute units on ReplayChildren overflow replay --- .../Internal/ConcurrentOperation.cs | 35 +++++++++++- .../ParallelOperationTests.cs | 56 +++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs index 542f49f57..dc0a554e3 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -138,8 +138,22 @@ await EnqueueAsync(new SdkOperationUpdate protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) { + // Overflow replay: the parent was checkpointed with a stripped summary and + // ReplayChildren=true because the inline results exceeded the checkpoint + // limit. 
Re-execute the unit bodies to recover their result VALUES, but + // keep the frozen statuses/completion-reason authoritative (read from the + // summary inside ExecuteUnitsAsync). The parent is already terminal, so do + // NOT re-emit its checkpoint. + var replayChildren = existing.ContextDetails?.ReplayChildren == true + && (existing.Status == OperationStatuses.Succeeded + || existing.Status == OperationStatuses.Failed); + switch (existing.Status) { + case OperationStatuses.Succeeded when replayChildren: + case OperationStatuses.Failed when replayChildren: + return ExecuteUnitsAsync(cancellationToken, frozen: existing); + case OperationStatuses.Succeeded: return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); @@ -160,10 +174,18 @@ protected override Task> ReplayAsync(Operation existing, Cancell } } - private async Task> ExecuteUnitsAsync(CancellationToken cancellationToken) + private async Task> ExecuteUnitsAsync( + CancellationToken cancellationToken, + Operation? frozen = null) { cancellationToken.ThrowIfCancellationRequested(); + // Overflow replay: when re-executing solely to recover stripped result + // VALUES, the per-unit statuses and completion reason are authoritative + // from the frozen summary — re-deriving them would reintroduce + // completion-order non-determinism the summary was written to prevent. + var frozenSummary = frozen != null ? ParseSummary(frozen.ContextDetails?.Result) : null; + var unitCount = UnitCount; var slots = new UnitOutcome[unitCount]; var dispatched = new bool[unitCount]; @@ -331,14 +353,21 @@ private async Task> ExecuteUnitsAsync(CancellationToken cancella } } - var completionReason = ComputeCompletionReason(items, unitCount); + var completionReason = frozenSummary != null + ? 
DeserializeCompletionReason(frozenSummary.CompletionReason) + : ComputeCompletionReason(items, unitCount); var result = new BatchResult(items, completionReason); var failureException = completionReason == CompletionReason.FailureToleranceExceeded ? BuildException(result) : null; - await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); + // Overflow replay re-executes only to recover stripped values; the parent + // checkpoint already exists and is terminal, so do NOT re-emit it. + if (frozen == null) + { + await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); + } if (failureException != null) { diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index 1b75ac5b5..b07bd8f8c 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -583,6 +583,62 @@ public async Task ParallelAsync_Flat_ResultOverThreshold_StripsInlineResultsAndS Assert.Contains("SUCCEEDED", parentSucceed.Payload); } + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_ReExecutesBodiesWithoutRecheckpointing() + { + var parentOpId = IdAt(1); + + // Stripped summary: statuses present, NO inline Result values. 
+ var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 200; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // Bodies re-executed (values recovered), statuses/reason from frozen summary. + Assert.Equal(2, executions); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // The parent is already terminal in state — replay must NOT re-emit a + // parent CONTEXT SUCCEED/FAIL. 
+ Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + [Fact] public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() { From 94c1ac74d946373f524a909317a5bfef884c7926 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:06:45 -0400 Subject: [PATCH 15/21] feat(durable): gate overflow-replay re-execution by frozen unit status --- .../Internal/ConcurrentOperation.cs | 265 ++++++++++++------ .../ParallelOperationTests.cs | 134 +++++++++ 2 files changed, 311 insertions(+), 88 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs index dc0a554e3..83edc1632 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -140,10 +140,11 @@ protected override Task> ReplayAsync(Operation existing, Cancell { // Overflow replay: the parent was checkpointed with a stripped summary and // ReplayChildren=true because the inline results exceeded the checkpoint - // limit. Re-execute the unit bodies to recover their result VALUES, but - // keep the frozen statuses/completion-reason authoritative (read from the - // summary inside ExecuteUnitsAsync). The parent is already terminal, so do - // NOT re-emit its checkpoint. + // limit. Re-execute ONLY the units the frozen summary marks SUCCEEDED or + // FAILED to recover their stripped result VALUE / Error; units marked + // STARTED (short-circuited, never dispatched) are skipped. Per-unit status + // and completion reason stay authoritative from the frozen summary, and the + // parent — already terminal — is NOT re-checkpointed. 
var replayChildren = existing.ContextDetails?.ReplayChildren == true && (existing.Status == OperationStatuses.Succeeded || existing.Status == OperationStatuses.Failed); @@ -152,7 +153,7 @@ protected override Task> ReplayAsync(Operation existing, Cancell { case OperationStatuses.Succeeded when replayChildren: case OperationStatuses.Failed when replayChildren: - return ExecuteUnitsAsync(cancellationToken, frozen: existing); + return ReplayChildrenAsync(existing, cancellationToken); case OperationStatuses.Succeeded: return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); @@ -174,18 +175,10 @@ protected override Task> ReplayAsync(Operation existing, Cancell } } - private async Task> ExecuteUnitsAsync( - CancellationToken cancellationToken, - Operation? frozen = null) + private async Task> ExecuteUnitsAsync(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - // Overflow replay: when re-executing solely to recover stripped result - // VALUES, the per-unit statuses and completion reason are authoritative - // from the frozen summary — re-deriving them would reintroduce - // completion-order non-determinism the summary was written to prevent. - var frozenSummary = frozen != null ? ParseSummary(frozen.ContextDetails?.Result) : null; - var unitCount = UnitCount; var slots = new UnitOutcome[unitCount]; var dispatched = new bool[unitCount]; @@ -353,21 +346,14 @@ private async Task> ExecuteUnitsAsync( } } - var completionReason = frozenSummary != null - ? DeserializeCompletionReason(frozenSummary.CompletionReason) - : ComputeCompletionReason(items, unitCount); + var completionReason = ComputeCompletionReason(items, unitCount); var result = new BatchResult(items, completionReason); var failureException = completionReason == CompletionReason.FailureToleranceExceeded ? 
BuildException(result) : null; - // Overflow replay re-executes only to recover stripped values; the parent - // checkpoint already exists and is terminal, so do NOT re-emit it. - if (frozen == null) - { - await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); - } + await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); if (failureException != null) { @@ -377,81 +363,107 @@ private async Task> ExecuteUnitsAsync( return result; } - private async Task RunUnitAsync( - int index, - UnitOutcome[] slots, - SemaphoreSlim? semaphore, - CancellationToken cancellationToken, - Action onComplete) + /// + /// Overflow-replay path. The parent was checkpointed with a stripped summary + /// (per-unit Index/Name/Status retained; Result/Error dropped) and + /// ReplayChildren=true. Re-executes ONLY the units the frozen summary + /// marks SUCCEEDED or FAILED — to recover their stripped result value / error + /// — and skips units marked STARTED so their bodies do not re-run. Per-unit + /// status and the completion reason come from the frozen summary (authoritative), + /// not from this run's outcomes; the parent is NOT re-checkpointed. 
+ /// + private async Task> ReplayChildrenAsync(Operation frozen, CancellationToken cancellationToken) { - try + cancellationToken.ThrowIfCancellationRequested(); + + var summary = ParseSummary(frozen.ContextDetails?.Result); + var unitCount = UnitCount; + + var items = new List>(unitCount); + for (var i = 0; i < unitCount; i++) { - var (unitName, unitFunc) = GetUnit(index); - var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); - - var childOp = new ChildContextOperation( - childOpId, - unitName, - OperationId, - unitFunc, - new ChildContextConfig { SubType = ChildSubType }, - Serializer, - ChildContextFactory, - State, - Termination, - DurableExecutionArn, - Batcher, - isVirtual: _isVirtual); + var (unitName, _) = GetUnit(i); + var summaryEntry = summary?.Units.FirstOrDefault(b => b.Index == i); - try - { - var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); - slots[index] = new UnitOutcome { Status = BatchItemStatus.Succeeded, Result = result }; - } - catch (ChildContextException ex) - { - slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = ex }; - } - catch (DurableExecutionException) - { - // E.g. NonDeterministicExecutionException — these are not "unit - // failed gracefully" but workflow-level problems. Surface them: - // re-throw out of the operation without writing a slot (the - // orchestrator's outer flow handles it). - throw; - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + // Frozen per-unit status is authoritative. + var status = summaryEntry != null + ? DeserializeStatus(summaryEntry.Status) + : BatchItemStatus.Started; + + // Same unit-name drift check as ReconstructFromCheckpoints: code must + // not change the order or name of concurrent units between deployments. 
+ var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && unitName != null && checkpointedName != unitName) { - // Parent-token cancellation: per cross-cutting decision Q10, OCE - // escapes unwrapped. Don't write a slot — Task.WhenAll observes - // this and the orchestrator re-throws after settling. - throw; + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for {OperationNoun.ToLowerInvariant()} unit {i} of operation " + + $"'{Name ?? OperationId}': expected name '{unitName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of concurrent " + + $"units between deployments."); } - catch (OperationCanceledException ex) + var resolvedName = checkpointedName ?? unitName; + + T? unitResult = default; + DurableExecutionException? unitError = null; + + // Re-execute only completed units to recover the stripped value/error. + // STARTED units were short-circuited (never dispatched) originally — + // do NOT run their bodies, so there are no spurious side effects. + if (status == BatchItemStatus.Succeeded || status == BatchItemStatus.Failed) { - // Unit-internal cancellation that is NOT tied to the parent token - // (e.g. the unit's own CancellationTokenSource fired). Treat it as - // a normal per-unit failure rather than killing the operation as - // cancelled. - var wrapped = new ChildContextException(ex.Message, ex) + var outcome = await RunSingleUnitAsync(i, cancellationToken).ConfigureAwait(false); + if (status == BatchItemStatus.Succeeded) { - SubType = ChildSubType, - ErrorType = ex.GetType().FullName - }; - slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; - } - catch (Exception ex) - { - // Wrap unexpected exceptions as ChildContextException — they're - // per-unit failures from the user's POV. 
- var wrapped = new ChildContextException(ex.Message, ex) + unitResult = outcome.Result; + } + else { - SubType = ChildSubType, - ErrorType = ex.GetType().FullName - }; - slots[index] = new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + // Frozen status is authoritative. If a unit frozen as Failed + // re-executes to success here (non-deterministic body), it stays + // Failed but Error stays null — the original error was stripped on + // overflow and only returns if the body re-throws. Recovering a + // frozen-Succeeded unit's value is the common, supported case. + unitError = outcome.Error; + } } + items.Add(new BatchItem + { + Index = i, + Name = resolvedName, + Status = status, + Result = unitResult, + Error = unitError + }); + } + + // Completion reason is pinned from the frozen summary; fall back to + // recomputing only if the summary is absent/corrupt. + var completionReason = summary != null + ? DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, unitCount); + + var result = new BatchResult(items, completionReason); + + // No re-checkpoint: the parent is already terminal in state. + if (completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildException(result); + } + + return result; + } + + private async Task RunUnitAsync( + int index, + UnitOutcome[] slots, + SemaphoreSlim? semaphore, + CancellationToken cancellationToken, + Action onComplete) + { + try + { + slots[index] = await RunSingleUnitAsync(index, cancellationToken).ConfigureAwait(false); onComplete(slots[index]); } finally @@ -471,6 +483,83 @@ private async Task RunUnitAsync( } } + /// + /// Builds and runs a single unit's and + /// maps the result/exception to a . Shared by the + /// concurrent dispatch loop () and the overflow + /// ReplayChildren path (). Per-unit graceful + /// failures are captured as ; workflow-level + /// and parent-token-cancellation exceptions propagate. 
+ /// + private async Task RunSingleUnitAsync(int index, CancellationToken cancellationToken) + { + var (unitName, unitFunc) = GetUnit(index); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); + + var childOp = new ChildContextOperation( + childOpId, + unitName, + OperationId, + unitFunc, + new ChildContextConfig { SubType = ChildSubType }, + Serializer, + ChildContextFactory, + State, + Termination, + DurableExecutionArn, + Batcher, + isVirtual: _isVirtual); + + try + { + var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); + return new UnitOutcome { Status = BatchItemStatus.Succeeded, Result = result }; + } + catch (ChildContextException ex) + { + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = ex }; + } + catch (DurableExecutionException) + { + // E.g. NonDeterministicExecutionException — these are not "unit + // failed gracefully" but workflow-level problems. Surface them: + // re-throw out of the operation (the orchestrator's outer flow + // handles it). + throw; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Parent-token cancellation: per cross-cutting decision Q10, OCE + // escapes unwrapped. Don't write a slot — Task.WhenAll observes + // this and the orchestrator re-throws after settling. + throw; + } + catch (OperationCanceledException ex) + { + // Unit-internal cancellation that is NOT tied to the parent token + // (e.g. the unit's own CancellationTokenSource fired). Treat it as + // a normal per-unit failure rather than killing the operation as + // cancelled. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + catch (Exception ex) + { + // Wrap unexpected exceptions as ChildContextException — they're + // per-unit failures from the user's POV. 
+ var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + } + private static bool ShouldStopDispatching( int succeeded, int failed, diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs index b07bd8f8c..d2b902a3e 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -639,6 +639,140 @@ public async Task ParallelAsync_Flat_ReplayChildren_ReExecutesBodiesWithoutReche o.Type == "CONTEXT" && o.SubType == "Parallel"); } + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_SkipsStartedUnits_ReExecutesCompletedOnly() + { + var parentOpId = IdAt(1); + + // Stripped summary: two units short-circuited the run with MinSuccessful=2 + // (SUCCEEDED, SUCCEEDED), the third was never dispatched (STARTED). On + // overflow replay only the two completed units re-execute; the started + // unit's body must NOT run. 
+ var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var startedBodyRan = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 200; }, + async (_) => { startedBodyRan = true; Interlocked.Increment(ref executions); await Task.Yield(); return 300; }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = new CompletionConfig { MinSuccessful = 2 } + }); + + // Only the two SUCCEEDED unit bodies re-execute; the STARTED unit is skipped. + Assert.Equal(2, executions); + Assert.False(startedBodyRan); + + // Per-item statuses come from the frozen summary. + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + + // Recovered values for the two succeeded units. 
+ Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_ReExecutesFailedUnit_RecoversError() + { + var parentOpId = IdAt(1); + + // Stripped summary: one SUCCEEDED, one FAILED. Errors were stripped on + // overflow, so re-execution recovers them. Tolerated-failure config keeps + // the run from throwing. + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"FAILED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); throw new InvalidOperationException("flat boom"); }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Both bodies re-execute to recover the value and the error. 
+ Assert.Equal(2, executions); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Failed, result.All[1].Status); + Assert.Equal(100, result.All[0].Result); + Assert.NotNull(result.All[1].Error); + Assert.Contains("flat boom", result.All[1].Error!.Message); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + [Fact] public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() { From db05bca04d886dc0af9d263e70012bac2ebb3801 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:15:16 -0400 Subject: [PATCH 16/21] feat(durable): ChildContext single-child overflow via ReplayChildren --- .../Internal/ChildContextOperation.cs | 32 ++++++++++- .../ChildContextOperationTests.cs | 54 +++++++++++++++++++ 2 files changed, 84 insertions(+), 2 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs index 7c2427053..de8b9129e 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -5,6 +5,7 @@ using System.Text; using Amazon.Lambda; using Amazon.Lambda.Core; +using SdkContextOptions = Amazon.Lambda.Model.ContextOptions; using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; @@ -43,6 +44,7 @@ internal sealed class ChildContextOperation : DurableOperation private readonly ILambdaSerializer _serializer; private readonly Func _childContextFactory; private readonly bool _isVirtual; + private bool _suppressSuccessCheckpoint; public ChildContextOperation( string operationId, @@ -99,6 +101,14 @@ protected override Task 
ReplayAsync(Operation existing, CancellationToken can switch (existing.Status) { case OperationStatuses.Succeeded: + // Overflow: the result was too large to checkpoint inline + // (ReplayChildren=true, empty payload). Re-run the body to recover + // the value; the body's inner ops replay from their own + // checkpoints. Do NOT re-emit the (already terminal) SUCCEED. + if (existing.ContextDetails?.ReplayChildren == true) + { + return ExecuteFuncNoCheckpoint(cancellationToken); + } // Side-effecting code runs at most once: replay returns the // cached result without invoking the user func. return Task.FromResult(DeserializeResult(existing.ContextDetails?.Result)); @@ -121,6 +131,12 @@ protected override Task ReplayAsync(Operation existing, CancellationToken can } } + private Task ExecuteFuncNoCheckpoint(CancellationToken cancellationToken) + { + _suppressSuccessCheckpoint = true; + return ExecuteFunc(cancellationToken); + } + private async Task ExecuteFunc(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); @@ -186,8 +202,17 @@ await EnqueueAsync(new SdkOperationUpdate // Virtual branches suppress the SUCCEED checkpoint; the orchestrator // serializes the result inline on the parent payload instead. - if (!_isVirtual) + // _suppressSuccessCheckpoint is set on overflow replay re-execution: the + // child is already terminal in the store, so we re-run only to recover the + // in-memory value and must NOT re-emit a SUCCEED. + if (!_isVirtual && !_suppressSuccessCheckpoint) { + var serialized = SerializeResult(result); + // Overflow: result too large to checkpoint inline. Emit an empty + // payload + ReplayChildren so replay re-executes this body to recover + // the value (mirrors the concurrent-operation overflow strategy). 
+ var overflow = Encoding.UTF8.GetByteCount(serialized) > DurableConstants.MaxOperationCheckpointBytes; + await EnqueueAsync(new SdkOperationUpdate { Id = OperationId, @@ -196,7 +221,10 @@ await EnqueueAsync(new SdkOperationUpdate Action = OperationAction.SUCCEED, SubType = _config?.SubType, Name = Name, - Payload = SerializeResult(result) + Payload = overflow ? string.Empty : serialized, + ContextOptions = overflow + ? new SdkContextOptions { ReplayChildren = true } + : null }, cancellationToken); } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs index 3aa182248..f11254182 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -501,6 +501,60 @@ await Assert.ThrowsAsync(() => name: "phase")); } + [Fact] + public async Task RunInChildContextAsync_ResultOverThreshold_EmitsEmptyPayloadAndReplayChildren() + { + var (context, recorder, _, _) = CreateContext(); + var big = new string('y', 300 * 1024); + + var result = await context.RunInChildContextAsync( + async (_) => { await Task.Yield(); return big; }, + name: "phase"); + + Assert.Equal(big, result); // in-memory value intact for this invoke + + await recorder.Batcher.DrainAsync(); + + var succeed = recorder.Flushed.Single(o => + o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal(string.Empty, succeed.Payload); + Assert.NotNull(succeed.ContextOptions); + Assert.True(succeed.ContextOptions.ReplayChildren); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayChildren_ReExecutesBodyWithoutRecheckpoint() + { + var childOpId = IdAt(1); // first root-level op + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = childOpId, + Type = OperationTypes.Context, + Status = 
OperationStatuses.Succeeded, + Name = "phase", + ContextDetails = new ContextDetails { Result = "", ReplayChildren = true } + } + } + }); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (_) => { executed = true; await Task.Yield(); return "rebuilt"; }, + name: "phase"); + + Assert.True(executed); + Assert.Equal("rebuilt", result); + + await recorder.Batcher.DrainAsync(); + // Already-terminal child must not be re-checkpointed. + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + } + [Fact] public async Task RunInChildContextAsync_SubTypeAndName_PropagateToCheckpoint() { From c995c15b9b57d4be117d6d493db279cda5a42426 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:16:13 -0400 Subject: [PATCH 17/21] feat(durable): suppress terminal re-checkpoint on ChildContext overflow replay --- .../Internal/ChildContextOperation.cs | 16 +++++---- .../ChildContextOperationTests.cs | 34 +++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs index de8b9129e..46f7768e0 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -44,7 +44,8 @@ internal sealed class ChildContextOperation : DurableOperation private readonly ILambdaSerializer _serializer; private readonly Func _childContextFactory; private readonly bool _isVirtual; - private bool _suppressSuccessCheckpoint; + // Set once on overflow-replay re-execution; never reset. 
+ private bool _suppressTerminalCheckpoint; public ChildContextOperation( string operationId, @@ -133,7 +134,7 @@ protected override Task ReplayAsync(Operation existing, CancellationToken can private Task ExecuteFuncNoCheckpoint(CancellationToken cancellationToken) { - _suppressSuccessCheckpoint = true; + _suppressTerminalCheckpoint = true; return ExecuteFunc(cancellationToken); } @@ -177,8 +178,11 @@ private async Task ExecuteFunc(CancellationToken cancellationToken) { // Virtual branches suppress the FAIL checkpoint but still propagate // the exception — the orchestrator records the failure inline on the - // parent payload. - if (!_isVirtual) + // parent payload. Overflow-replay re-execution also suppresses it: the + // op is already terminal (SUCCEEDED) in the store, so re-emitting a + // FAIL would corrupt that record (mirrors ReplayChildrenAsync, which + // never re-checkpoints). The exception still propagates below. + if (!_isVirtual && !_suppressTerminalCheckpoint) { await EnqueueAsync(new SdkOperationUpdate { @@ -202,10 +206,10 @@ await EnqueueAsync(new SdkOperationUpdate // Virtual branches suppress the SUCCEED checkpoint; the orchestrator // serializes the result inline on the parent payload instead. - // _suppressSuccessCheckpoint is set on overflow replay re-execution: the + // _suppressTerminalCheckpoint is set on overflow replay re-execution: the // child is already terminal in the store, so we re-run only to recover the // in-memory value and must NOT re-emit a SUCCEED. - if (!_isVirtual && !_suppressSuccessCheckpoint) + if (!_isVirtual && !_suppressTerminalCheckpoint) { var serialized = SerializeResult(result); // Overflow: result too large to checkpoint inline. 
Emit an empty diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs index f11254182..137f83ad8 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -537,6 +537,7 @@ public async Task RunInChildContextAsync_ReplayChildren_ReExecutesBodyWithoutRec Type = OperationTypes.Context, Status = OperationStatuses.Succeeded, Name = "phase", + // Result == "" matches the overflow emission (string.Empty). ContextDetails = new ContextDetails { Result = "", ReplayChildren = true } } } @@ -555,6 +556,39 @@ public async Task RunInChildContextAsync_ReplayChildren_ReExecutesBodyWithoutRec Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); } + [Fact] + public async Task RunInChildContextAsync_ReplayChildren_BodyThrows_DoesNotEmitFailCheckpoint() + { + var childOpId = IdAt(1); // first root-level op + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = childOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "phase", + // Result == "" matches the overflow emission (string.Empty). + ContextDetails = new ContextDetails { Result = "", ReplayChildren = true } + } + } + }); + + // The op is already terminal (SUCCEEDED). If the overflow re-run body + // throws, the recovery path must NOT re-checkpoint a CONTEXT FAIL over + // the already-SUCCEEDED record — but the exception still propagates. 
+ await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.Yield(); throw new InvalidOperationException("nondeterministic re-run"); }, + name: "phase")); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "FAIL"); + } + [Fact] public async Task RunInChildContextAsync_SubTypeAndName_PropagateToCheckpoint() { From fa570820e0f8df9b7fab3ebdb245d68f09e7fb76 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:31:08 -0400 Subject: [PATCH 18/21] feat(durable): enforce CheckpointBatcher byte cap --- .../Internal/CheckpointBatcher.cs | 99 ++++++++++++++----- .../Internal/CheckpointBatcherConfig.cs | 14 ++- .../CheckpointBatcherTests.cs | 52 ++++++++++ 3 files changed, 133 insertions(+), 32 deletions(-) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs index 800d55bcf..022190da5 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs @@ -33,6 +33,34 @@ internal sealed class CheckpointBatcher : IAsyncDisposable private Exception? _terminalError; private int _disposed; + // Per-update wire-footprint estimate constants. Deliberate over-estimates: + // flushing slightly early is safe, flushing late risks a request-too-large. + private const int PerOpEnvelopeOverheadBytes = 512; + private const int StackFrameOverheadBytes = 8; + + /// + /// Cheap UTF-8 byte estimate of one update's wire footprint — variable string + /// fields plus a fixed envelope. No JSON is produced (AOT-safe). Payload is + /// counted at 2x because it is already-serialized JSON re-escaped as a string + /// value, which roughly doubles for escape-heavy content. 
+ /// + private static int EstimateUpdateBytes(SdkOperationUpdate u) + { + var size = PerOpEnvelopeOverheadBytes; + if (u.Payload != null) size += System.Text.Encoding.UTF8.GetByteCount(u.Payload) * 2; + size += ByteCount(u.Id) + ByteCount(u.ParentId) + ByteCount(u.Name); + if (u.Error != null) + { + size += ByteCount(u.Error.ErrorType) + ByteCount(u.Error.ErrorMessage) + ByteCount(u.Error.ErrorData); + if (u.Error.StackTrace != null) + foreach (var line in u.Error.StackTrace) + size += ByteCount(line) + StackFrameOverheadBytes; + } + return size; + } + + private static int ByteCount(string? s) => s == null ? 0 : System.Text.Encoding.UTF8.GetByteCount(s); + public CheckpointBatcher( string? initialCheckpointToken, Func, CancellationToken, Task> flushAsync, @@ -113,25 +141,43 @@ public async ValueTask DisposeAsync() private async Task RunWorkerAsync(CancellationToken shutdownToken) { - // TODO: also enforce _config.MaxBatchBytes here. Today we only cap by - // operation count; an item whose serialized size pushes the batch over - // ~750 KB will be sent and rejected service-side. See CheckpointBatcherConfig. - var batch = new List(_config.MaxBatchOperations); + // Both caps are enforced: before adding an item that would push the batch + // over MaxBatchOperations OR MaxBatchBytes, the current batch is flushed. + // A lone item already over the byte cap is sent by itself (never loops). + // The byte accumulator is seeded with a fixed reserve covering the request + // prefix (checkpoint token + ARN + array framing) that the per-update + // estimate does not include. 
+ const int RequestEnvelopeReserveBytes = 4 * 1024; + var batch = new PendingBatch(_config.MaxBatchOperations); + + async Task AddItemAsync(BatchItem item) + { + var itemBytes = EstimateUpdateBytes(item.Update); + if (batch.Count > 0 && + (batch.Count + 1 > _config.MaxBatchOperations || + RequestEnvelopeReserveBytes + batch.Bytes + itemBytes > _config.MaxBatchBytes)) + { + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + + batch.Add(item); + + // Lone item already over the cap: send it alone, do not loop. + if (batch.Count == 1 && + RequestEnvelopeReserveBytes + batch.Bytes > _config.MaxBatchBytes) + { + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + } try { while (await _channel.Reader.WaitToReadAsync(shutdownToken).ConfigureAwait(false)) { - // Drain everything currently queued. while (_channel.Reader.TryRead(out var item)) - { - batch.Add(item); - if (batch.Count >= _config.MaxBatchOperations) - { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); - batch.Clear(); - } - } + await AddItemAsync(item).ConfigureAwait(false); // Optionally wait for late arrivals to coalesce into one batch. 
if (_config.FlushInterval > TimeSpan.Zero && batch.Count > 0) @@ -143,14 +189,7 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) while (await _channel.Reader.WaitToReadAsync(windowCts.Token).ConfigureAwait(false)) { while (_channel.Reader.TryRead(out var item)) - { - batch.Add(item); - if (batch.Count >= _config.MaxBatchOperations) - { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); - batch.Clear(); - } - } + await AddItemAsync(item).ConfigureAwait(false); } } catch (OperationCanceledException) when (!shutdownToken.IsCancellationRequested) @@ -161,7 +200,7 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) if (batch.Count > 0) { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); batch.Clear(); } } @@ -179,9 +218,9 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) } finally { - // Anything left in the channel after the worker exits — fail it. + // Anything left in the batch/channel after the worker exits — fail it. var failure = Volatile.Read(ref _terminalError) ?? new ObjectDisposedException(nameof(CheckpointBatcher)); - foreach (var leftover in batch) + foreach (var leftover in batch.Items) leftover.Completion.TrySetException(failure); while (_channel.Reader.TryRead(out var item)) item.Completion.TrySetException(failure); @@ -214,5 +253,17 @@ private async Task FlushBatchAsync(IReadOnlyList batch, CancellationT } } + /// Accumulates a batch plus its estimated byte footprint so the two + /// never drift across the worker's add/flush/clear sites. 
+ private sealed class PendingBatch + { + public readonly List Items; + public long Bytes; + public PendingBatch(int capacity) { Items = new List(capacity); } + public int Count => Items.Count; + public void Add(BatchItem item) { Items.Add(item); Bytes += EstimateUpdateBytes(item.Update); } + public void Clear() { Items.Clear(); Bytes = 0; } + } + private readonly record struct BatchItem(SdkOperationUpdate Update, TaskCompletionSource Completion); } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs index 88913e868..81dc85d45 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs @@ -22,15 +22,13 @@ internal sealed class CheckpointBatcherConfig public int MaxBatchOperations { get; init; } = 200; /// - /// Maximum batch size in bytes. Service-side limit is ~750 KB. + /// Maximum batch size in bytes. Service-side request limit is ~750 KB. /// /// - /// TODO: not enforced today. The worker only checks ; - /// a single oversized item (or a batch whose serialized size exceeds 750 KB) - /// will be sent to the service and rejected there. Wire this in alongside - /// the async-flush operations (Map / Parallel / child-context) since those - /// are the scenarios that can actually fill a batch — today every batch is - /// 1 item with = Zero, so the gap is latent. + /// Enforced by the worker: it flushes the current batch before adding an item + /// that would push the estimated request size over this cap, and sends a lone + /// item that already exceeds the cap by itself. The per-update estimate plus a + /// fixed request-prefix reserve approximate the real wire size conservatively. 
/// - internal int MaxBatchBytes { get; init; } = 750 * 1024; + public int MaxBatchBytes { get; init; } = 750 * 1024; } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs index effeb5804..d5e91ec37 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs @@ -172,6 +172,58 @@ public async Task EnqueueAsync_AfterDispose_Throws() await Assert.ThrowsAnyAsync(() => batcher.EnqueueAsync(Update("0-step"))); } + private static SdkOperationUpdate UpdateWithPayload(string id, int payloadBytes) => new() + { + Id = id, + Type = "CONTEXT", + Action = "SUCCEED", + Payload = new string('p', payloadBytes) + }; + + [Fact] + public async Task EnqueueAsync_ByteCap_SplitsBatchesByBytes() + { + var batchByteTotals = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + long sum = 0; + foreach (var o in ops) sum += o.Payload?.Length ?? 0; + batchByteTotals.Add(sum); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig + { + MaxBatchBytes = 10 * 1024, + FlushInterval = TimeSpan.FromMilliseconds(100) + }); + + // Three 6 KB payloads: at most one fits per 10 KB batch with overhead. 
+ var tasks = Enumerable.Range(0, 3) + .Select(i => batcher.EnqueueAsync(UpdateWithPayload($"{i}", 6 * 1024))) + .ToArray(); + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.True(batchByteTotals.Count >= 2, "expected the byte cap to split into multiple batches"); + Assert.All(batchByteTotals, total => Assert.True(total <= 10 * 1024)); + } + + [Fact] + public async Task EnqueueAsync_SingleOversizedItem_SentAloneNoLoop() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => { batches.Add(ops.Count); return Task.FromResult(token); }, + new CheckpointBatcherConfig { MaxBatchBytes = 4 * 1024 }); + + await batcher.EnqueueAsync(UpdateWithPayload("huge", 50 * 1024)); + await batcher.DrainAsync(); + + Assert.Single(batches); + Assert.Equal(1, batches[0]); + } + [Fact] public async Task CheckpointToken_UpdatesAfterEachFlush() { From 4d7c9e0d0d30c4994e531f5b2d8da9ee01d203ed Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:37:50 -0400 Subject: [PATCH 19/21] docs(durable): document ChildContext overflow replay branch --- .../Internal/ChildContextOperation.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs index 46f7768e0..c7472bcbf 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -25,6 +25,10 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// and throw . /// SUCCEEDED: return cached deserialized result; user func is /// NOT re-executed. 
+/// SUCCEEDED (overflow): ReplayChildren=true + empty +/// payload (the result was too large to checkpoint inline) → re-run the +/// user func to recover the large result value; terminal checkpoints +/// (SUCCEED/FAIL) are suppressed since the op is already terminal. /// FAILED: throw with the /// recorded error; if is /// set, the mapped exception is thrown instead. From 38585c416f08f2e1b56549b12368573276a49d7f Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:39:24 -0400 Subject: [PATCH 20/21] docs(durable): note payload-size bound makes byte estimate int-safe --- .../Internal/CheckpointBatcher.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs index 022190da5..1937f6312 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs @@ -47,6 +47,8 @@ internal sealed class CheckpointBatcher : IAsyncDisposable private static int EstimateUpdateBytes(SdkOperationUpdate u) { var size = PerOpEnvelopeOverheadBytes; + // int arithmetic is safe: payloads are bounded by the 6MB Lambda + // invocation-payload cap, so the 2x multiply can never overflow a 32-bit int. 
if (u.Payload != null) size += System.Text.Encoding.UTF8.GetByteCount(u.Payload) * 2; size += ByteCount(u.Id) + ByteCount(u.ParentId) + ByteCount(u.Name); if (u.Error != null) From df9ac4d0ac1a436953a5ae3bf80dd727477a4dec Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Mon, 8 Jun 2026 12:46:29 -0400 Subject: [PATCH 21/21] test(durable): integration test for large-payload overflow replay --- .../ParallelFlatOverflowTest.cs | 161 ++++++++++++++++++ .../ParallelFlatOverflowFunction/Dockerfile | 7 + .../ParallelFlatOverflowFunction/Function.cs | 103 +++++++++++ .../ParallelFlatOverflowFunction.csproj | 18 ++ 4 files changed, 289 insertions(+) create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs new file mode 100644 index 000000000..21db02c6f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs @@ -0,0 +1,161 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFlatOverflowTest +{ + private readonly ITestOutputHelper _output; + public ParallelFlatOverflowTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic 
operation ID the SDK assigns. Branch op ids + /// are SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way + /// under the branch op id. Reproduced locally because OperationIdGenerator is + /// internal to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end exercise of the LARGE-PAYLOAD OVERFLOW + ReplayChildren replay path + /// for a parallel. + /// + /// Three branches each return a deterministic ~150 KB string (~450 KB aggregate), + /// which exceeds the 256 KB checkpoint threshold, so the parallel OVERFLOWS: the SDK + /// checkpoints a STRIPPED summary (no inline results) and sets + /// ContextOptions.ReplayChildren=true on the parent CONTEXT op. + /// + /// The workflow is shaped to actually drive the RECOVERY path (ReplayChildrenAsync): + /// - invoke 1: branches suspend on their in-branch waits -> PENDING. + /// - invoke 2: the parallel re-runs the branches, overflow-checkpoints the parent + /// as SUCCEEDED + ReplayChildren, then suspends on the post-parallel + /// "post-overflow" wait (so the parallel does NOT also return in this invoke). + /// - invoke 3: re-enters the already-terminal SUCCEEDED + ReplayChildren parallel, + /// routing through ReplayChildrenAsync to RE-EXECUTE the branch bodies and + /// recover the stripped values (reading per-unit Status/CompletionReason from the + /// frozen summary, never re-checkpointing). The final result is computed from + /// those recovered values. + /// + /// This test proves the whole path works against the real durable-execution service: + /// 1. The execution SUCCEEDED — proving the overflow checkpoint was accepted AND + /// ReplayChildrenAsync correctly reconstructed the aggregate result. 
(If the + /// ReplayChildren recovery path were broken, reconstruction would fail and the + /// execution would FAIL/TIME_OUT.) + /// 2. Exactly ONE parent CONTEXT op exists — Flat emits no per-branch CONTEXT. + /// 3. The three "generate" steps succeeded and re-parent to the Parallel op. + /// 4. There were >= 3 InvocationCompleted events (initial PENDING + the resume that + /// overflow-checkpoints the parallel + the post-overflow resume that runs + /// ReplayChildrenAsync) — proving the parallel was re-entered while terminal, so + /// the ReplayChildren recovery path really ran. + /// 5. The FINAL execution result (read via GetExecutionAsync after SUCCEEDED, not + /// the first PENDING invoke response) reports the recovered per-branch lengths + /// ("153600" x3) and first chars ("abc") — proving the large deterministic + /// values were recovered EXACTLY by ReplayChildrenAsync, not lost or defaulted. + /// + [Fact] + public async Task Parallel_Flat_Overflow_ReplaysChildren_AndRecoversLargeResults() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFlatOverflowFunction"), + "pflow", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "po1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // SUCCEEDED alone proves the >256 KB overflow checkpoint was accepted and that + // ReplayChildrenAsync (re-entered on the post-overflow resume) reconstructed the + // result. A broken overflow recovery would FAIL or TIME_OUT here. 
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var branchOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each branch's "generate" step is the 1st inner op under that branch's own id + // space: SHA256(branchOpId + "-1"). + var expectedStepIds = branchOpIds.Select(b => HashOpId($"{b}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three branches' inner step + + // wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 2. Exactly ONE CONTEXT operation exists — the parent Parallel op. No + // per-branch CONTEXT events under Flat (even on the overflow path). + var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && branchOpIds.Contains(e.Id))); + + // 3. Each branch's "generate" step re-parents to the Parallel op (NOT to its + // virtual branch op), and the three step ids match the per-branch id space.
+ var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. There are at least 3 invocations: the initial PENDING, the resume that + // overflow-checkpoints the parallel and suspends on the post-overflow wait, and + // the post-overflow resume that re-enters the already-terminal parallel and runs + // ReplayChildrenAsync. >= 3 proves the parallel was re-entered while terminal, so + // the ReplayChildren recovery path really ran (>= 2 alone would only prove a + // single suspend/resume cycle). + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 3, + $"Expected >= 3 InvocationCompleted events (initial + overflow-checkpoint resume + post-overflow ReplayChildren resume), got {invocations.Count}"); + + // 5. The FINAL execution result (NOT the first invoke response, which is PENDING + // because the branch waits suspend it) reports the recovered per-branch metadata. + // Each branch produced a 150 KB (153600-byte) string built from its branch char, + // so a correct ReplayChildrenAsync recovery yields lengths "153600,153600,153600" + // and first chars "abc". This proves the large values were recovered EXACTLY by + // the ReplayChildren path, not lost or defaulted. 
+ var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Result); + Assert.Contains("\"Lengths\":\"153600,153600,153600\"", execution.Result, StringComparison.OrdinalIgnoreCase); + Assert.Contains("\"FirstChars\":\"abc\"", execution.Result, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs new file mode 100644 index 000000000..77b8e7b4d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs @@ -0,0 +1,103 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + // Each branch produces a ~150 KB string. Three branches => ~450 KB of inline + // results, comfortably over the 256 KB checkpoint threshold. This forces the + // FLAT parallel aggregate to OVERFLOW: the SDK checkpoints a stripped summary + // (no inline results) and sets ContextOptions.ReplayChildren=true on the parent + // CONTEXT op, keeping the full result in memory for the current invoke. 
+ private const int BranchPayloadSize = 150 * 1024; // 153600 bytes + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches run under NestingType.Flat. Each branch generates a LARGE + // (~150 KB) string inside a step, then does an in-branch durable wait. The + // combined ~450 KB aggregate exceeds the 256 KB threshold, so the parallel + // OVERFLOWS: the SDK checkpoints a stripped summary (no inline per-branch + // results) + ReplayChildren=true on the parent CONTEXT op. + // + // To actually exercise the RECOVERY path (ReplayChildrenAsync), the + // already-overflowed parallel must be re-entered on a FRESH invoke while it is + // already terminal (SUCCEEDED + ReplayChildren). The in-branch waits alone are + // NOT enough: the resume invoke that overflow-checkpoints the parallel also + // immediately returns SUCCEEDED, so the parallel goes STARTED -> SUCCEEDED in a + // single invoke and ReplayChildrenAsync is never hit. So we add a durable wait + // AFTER ParallelAsync returns (the "post-overflow" wait below): the overflow + // invoke suspends on that wait, and the NEXT invoke re-enters the already- + // terminal overflowed parallel and routes through ReplayChildrenAsync to + // RE-EXECUTE the branch bodies and recover the stripped values (reading per-unit + // Status/CompletionReason from the frozen summary, never re-checkpointing). + // + // The branch values are built DETERMINISTICALLY from the branch character + // (NOT Guid/random/DateTime). 
This is critical: the value produced on the + // original execution must be IDENTICAL to the value produced on replay + // re-execution, so the test can prove the large values were recovered exactly + // rather than lost or defaulted. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", ctx => BranchAsync(ctx, 'a')), + new DurableBranch("b", ctx => BranchAsync(ctx, 'b')), + new DurableBranch("c", ctx => BranchAsync(ctx, 'c')), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // Force another invocation so the already-overflowed parallel is re-entered + // (already SUCCEEDED + ReplayChildren) and replayed via ReplayChildrenAsync, + // which re-executes the branch bodies to recover the stripped >256 KB results. + await context.WaitAsync(TimeSpan.FromSeconds(1), name: "post-overflow"); + + // Compute the verifiable metadata AFTER the post-overflow wait: on the final + // invoke these results come from ReplayChildrenAsync's re-execution, which is + // exactly the recovery we want to prove survives. + var results = batch.GetResults().ToList(); + + // Keep the returned payload SMALL (well under the 6 MB Lambda response + // limit): do NOT echo the ~450 KB back. Instead return verifiable metadata + // proving the large values were recovered on replay: + // - Lengths: comma-joined per-branch result LENGTHS (e.g. "153600,153600,153600") + // - FirstChars: the first character of each recovered branch result, in order + // (e.g. "abc") — confirms each branch's deterministic content survived. + var lengths = string.Join(",", results.Select(r => r.Length)); + var firstChars = string.Concat(results.Select(r => r.Length > 0 ? 
r[0] : '?')); + + return new TestResult { Status = "completed", Lengths = lengths, FirstChars = firstChars }; + } + + private static async Task BranchAsync(IDurableContext ctx, char branchChar) + { + // Deterministic large payload: same branchChar => same string on original + // execution and on replay re-execution. ~150 KB per branch. + var large = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return new string(branchChar, BranchPayloadSize); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the (overflowed) parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return large; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Lengths { get; set; } public string? FirstChars { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + +