diff --git a/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json new file mode 100644 index 000000000..42a1cec69 --- /dev/null +++ b/.autover/changes/1086291e-5286-4ea4-b9c1-af4eb1d0314d.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Minor", + "ChangelogMessages": [ + "Implement NestingType.Flat for ParallelAsync and MapAsync (previously threw NotSupportedException). Under Flat, each branch/item runs in a virtual context that emits no per-branch CONTEXT checkpoint; per-branch results and errors are recorded inline on the parent operation's payload, reducing checkpoint volume. Operations inside a flat branch (steps, waits) still checkpoint, re-parented to the parallel/map operation. NestingType.Nested remains the default." + ] + } + ] +} diff --git a/.autover/changes/durable-mapasync.json b/.autover/changes/durable-mapasync.json new file mode 100644 index 000000000..412e09055 --- /dev/null +++ b/.autover/changes/durable-mapasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `MapAsync` to `IDurableContext` for processing a collection in parallel with one child context per item and automatic checkpointing. Supports configurable max concurrency, completion policy, and per-item naming via `MapConfig`, returning an `IBatchResult`." + ] + } + ] +} diff --git a/.autover/changes/durable-parallelasync.json b/.autover/changes/durable-parallelasync.json new file mode 100644 index 000000000..2adf78331 --- /dev/null +++ b/.autover/changes/durable-parallelasync.json @@ -0,0 +1,11 @@ +{ + "Projects": [ + { + "Name": "Amazon.Lambda.DurableExecution", + "Type": "Patch", + "ChangelogMessages": [ + "Add `ParallelAsync` to `IDurableContext` for running multiple workflow branches concurrently with automatic checkpointing. 
Supports configurable max concurrency, failure tolerance, and first-successful completion via `ParallelConfig`, returning an `IBatchResult`." + ] + } + ] +} diff --git a/Docs/durable-execution-design.md b/Docs/durable-execution-design.md index 59ced6a15..5fc728c98 100644 --- a/Docs/durable-execution-design.md +++ b/Docs/durable-execution-design.md @@ -559,7 +559,7 @@ For better observability, you can name individual branches (matching the JS SDK ```csharp // Named branches for easier debugging and testing var results = await context.ParallelAsync( - new NamedBranch[] + new DurableBranch[] { new("fetch_user", async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId))), new("fetch_orders", async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId))), @@ -1357,22 +1357,21 @@ public class MapConfig public int? MaxConcurrency { get; set; } /// - /// When to consider the operation complete. + /// When to consider the operation complete. Defaults to AllCompleted() — + /// every item runs regardless of per-item failures, which surface via + /// IBatchResult<T>.Failed rather than throwing. This permissive default + /// matches the Python and Java SDKs' map operation. It differs intentionally + /// from ParallelConfig.CompletionConfig, which defaults to AllSuccessful() + /// (fail-fast). For fail-fast map behavior, set this to + /// CompletionConfig.AllSuccessful() or call IBatchResult<T>.ThrowIfError(). /// - public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); /// /// How item branches are represented in the checkpoint graph. /// public NestingType NestingType { get; set; } = NestingType.Nested; - /// - /// Optional batching configuration for grouping items before processing. - /// When set, items are grouped into batches and each batch is processed as a unit. 
- /// Reduces checkpoint overhead for large collections. - /// - public ItemBatcher? Batcher { get; set; } - /// /// Optional function to generate a custom name for each item's branch. /// Improves observability in execution traces. Receives the item and its index. @@ -1381,23 +1380,6 @@ public class MapConfig public Func? ItemNamer { get; set; } } -/// -/// Groups items into batches for map operations to reduce checkpoint overhead. -/// At least one of MaxItemsPerBatch or MaxBytesPerBatch must be set. -/// -public class ItemBatcher -{ - /// - /// Maximum number of items per batch. Null = no count limit. - /// - public int? MaxItemsPerBatch { get; set; } - - /// - /// Maximum serialized size (bytes) per batch. Null = no size limit. - /// - public int? MaxBytesPerBatch { get; set; } -} - /// /// Defines completion criteria for parallel/map operations. /// @@ -1405,6 +1387,13 @@ public class CompletionConfig { public int? MinSuccessful { get; set; } public int? ToleratedFailureCount { get; set; } + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based threshold. Validated by the + /// setter; out-of-range values throw . + /// public double? ToleratedFailurePercentage { get; set; } public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; @@ -2122,7 +2111,6 @@ All four SDKs expose the same core operations. 
The differences are naming conven | Jitter strategy | `JitterStrategy` enum on `Exponential()` | `jitter_strategy` on `RetryStrategyConfig` | `jitter` on `createRetryStrategy()` | | Retry presets | `RetryStrategy.None/Default/Transient` | `RetryPresets.none()/default()/transient()` | `retryPresets.default/linear/noRetry` | | Nesting type | `NestingType` on `ParallelConfig`/`MapConfig` | `NestingType` on parallel/map config | `NestingType` on parallel/map config | -| Item batching | `ItemBatcher` on `MapConfig` | `ItemBatcher` on `MapConfig` | *(checkpoint manager handles batching)* | | Item namer | `ItemNamer` on `MapConfig` | Item naming function on `MapConfig` | `itemNamer` on `MapConfig` | | Error mapping | `ErrorMapping` on `ChildContextConfig` | *(typed exception wrapping)* | `errorMapping` on child context config | | Message-based retry filter | `retryableMessagePatterns` (regex) | `retryable_errors` (regex) | `retryableErrors` (RegExp[]) | diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs new file mode 100644 index 000000000..fdba62d64 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/BatchItemStatus.cs @@ -0,0 +1,31 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Status of an individual item in a . +/// +/// +/// Mirrors the wire-state of the per-branch checkpoint at the moment the batch +/// resolved. Items that finished produce or +/// ; items that were not dispatched because a +/// short-circuit fired are reported as +/// . +/// +public enum BatchItemStatus +{ + /// + /// The branch ran to completion and produced a result. + /// + Succeeded, + + /// + /// The branch ran to completion and threw. + /// + Failed, + + /// + /// The branch was not dispatched before the batch's + /// resolved (e.g., short-circuited + /// before this branch was started), or no per-branch checkpoint exists on replay. 
+ /// + Started +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md new file mode 100644 index 000000000..b825300bd --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CLAUDE.md @@ -0,0 +1,151 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What this is + +`Amazon.Lambda.DurableExecution` is the .NET SDK (preview, 0.x) for resilient, long-running AWS Lambda +workflows that checkpoint progress after each step and resume after failures or waits. A workflow can run +for up to ~1 year (the WAIT cap is 31,622,400 seconds) and is only billed for active compute. The SDK is +client-side glue: the *durable execution service* (part of Lambda) owns the checkpoint store, fires timers, +and re-invokes the function; this library re-derives in-memory workflow position from the checkpoint history +the service sends on each invocation. See sibling SDKs (Python/JS/Java) listed in `README.md` for the shared +model — this SDK deliberately mirrors their semantics. + +## Build & test + +Targets `net8.0;net10.0` (`DefaultPackageTargets` in `buildtools/common.props`). `TreatWarningsAsErrors` is on +everywhere, and the main library is `IsTrimmable` with the trim analyzer enabled — keep new code AOT/trim-clean. + +```bash +# Build the library (run from this directory) +dotnet build + +# Unit tests (fast, no AWS). 
Project: Libraries/test/Amazon.Lambda.DurableExecution.Tests +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj + +# A single test +dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj \ + --filter "FullyQualifiedName~StepOperationTests" + +# Coverage report (requires reportgenerator tool) +../../test/Amazon.Lambda.DurableExecution.Tests/coverage.sh +``` + +Unit tests reach `internal` types via `InternalsVisibleTo` (declared in the `.csproj`). They use +`Amazon.Lambda.TestUtilities` (`TestLambdaContext`) and the real `SourceGeneratorLambdaJsonSerializer` — +set `TestLambdaContext.Serializer` so `LambdaSerializerHelper.GetRequired` finds one. + +### Integration tests (expensive, real AWS) + +`Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests` deploys real Lambdas. Each test builds a +`TestFunctions//` project into a container image via **`dotnet publish` + `docker build`**, pushes to ECR, +creates an IAM role + Lambda (`DurableFunctionDeployment`), invokes it, and tears everything down on dispose. +Requires Docker, AWS creds (us-east-1), and is slow. Every behavior in `docs/` should have a paired +integration test under that project. Prefix AWS commands with `unset AWS_PROFILE` to use `[default]` creds. + +**Run integration tests against `net10.0`.** The project multi-targets `net8.0;net10.0`; `dotnet test` +without a framework spins up one testhost per TFW and runs them concurrently, which races two processes on +the same `TestFunctions//` build dir. Pin the framework: + +```bash +dotnet test ../../test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj \ + -f net10.0 --filter "FullyQualifiedName~MultipleStepsTest" +``` + +## Architecture: the replay model + +This is the part you must understand before changing anything. 
Read these together: +`DurableFunction.cs`, `DurableExecutionHandler.cs`, `DurableContext.cs`, `Internal/DurableOperation.cs`, +`Internal/ExecutionState.cs`, `Internal/OperationIdGenerator.cs`, `Internal/TerminationManager.cs`. + +**Entry point.** The user's Lambda handler delegates to `DurableFunction.WrapAsync`, which: +hydrates `ExecutionState` from `invocationInput.InitialExecutionState` (paging the service via `NextMarker`), +extracts the user payload from the `EXECUTION`-type op, builds a `CheckpointBatcher` + `DurableContext`, runs +the workflow through `DurableExecutionHandler.RunAsync`, drains checkpoints, and maps the result to a +`DurableExecutionInvocationOutput` with status **Succeeded / Failed / Pending**. + +**Each operation runs the same workflow code every invocation.** There is no persisted program counter. +On re-invocation the user function executes from the top again; each durable call (`StepAsync`, `WaitAsync`, +etc.) looks up its own checkpoint and either replays the cached result or runs fresh. This is why workflow +code **must be deterministic** — same operations, same order, same names across deployments. + +**Deterministic operation IDs** (`OperationIdGenerator`). Each durable call gets an ID = SHA-256 of +`"-"`, where the counter is per-context and pre-incremented. The same workflow position +yields the same opaque ID across replays, so a checkpoint correlates to a call by *position*, not by name — +renaming a step does **not** break replay (the human name rides separately on `OperationUpdate.Name`). +Reordering or adding/removing calls *does* break it. `ValidateReplayConsistency` enforces this and throws +`NonDeterministicExecutionException` on type/name drift. + +**Suspension is implemented by never completing a Task** (`TerminationManager` + `DurableExecutionHandler`). 
+When an op must suspend (wait timer, scheduled retry, pending callback/invoke) it calls +`Termination.SuspendAndAwait()`, which trips a one-shot signal and returns a Task that *never resolves*. +`RunAsync` runs the user code via `Task.Run` and races it against `TerminationTask` with `Task.WhenAny`: +- user task wins → **Succeeded** (or **Failed** if it threw) +- termination wins → **Pending**; the abandoned user task is GC'd, checkpoints flush, the service fires the + timer and re-invokes. On replay the suspended op sees its now-terminal checkpoint and returns normally. + +**Operation classes** (`Internal/*Operation.cs`) all extend `DurableOperation`. The base's +`ExecuteAsync` does: `ValidateReplayConsistency` → `TrackReplay` → look up checkpoint → dispatch to +`StartAsync` (no prior checkpoint) or `ReplayAsync` (checkpoint exists). `StepOperation` is the canonical +example — read its class doc comment for the full status decision table (Succeeded→cached, Failed→rethrow, +Pending→re-suspend if retry timer hasn't fired, Started→crash-recovery under `AtMostOncePerRetry`, +Ready→run next attempt). `DurableContext` is a thin dispatcher: it allocates the op ID, pulls the serializer +off `ILambdaContext.Serializer`, constructs the right `*Operation`, and calls `ExecuteAsync`. + +**Checkpointing** (`CheckpointBatcher`). Outbound `OperationUpdate`s (START/SUCCEED/FAIL/RETRY) are enqueued +to a background channel worker that batches and flushes them via `LambdaDurableServiceClient` (which wraps +the `AWSSDK.Lambda` `Checkpoint`/`GetExecutionState` calls). `EnqueueAsync` awaits its batch's flush +(sync semantics); fire-and-forget callers (e.g. the START checkpoint under the default +`AtLeastOncePerRetry`) don't await but must observe the Task's exception. Flush errors become a terminal +error rethrown by the next `EnqueueAsync`/`DrainAsync`. 
`DurableFunction.IsTerminalCheckpointError` +classifies SDK errors on the final drain: 4xx (except 429 and stale-token) → **Failed** envelope; 429/5xx/ +network → let it escape so Lambda retries the whole invocation. + +**Replay-mode tracking** (`ExecutionState`). `IsReplaying` starts true iff any completed non-`EXECUTION` op +exists; `TrackReplay` decrements as each is visited and flips to false once the workflow catches up to the +frontier. `ReplayAwareLogger` uses this to suppress log lines emitted during replay so a 30-step workflow +re-invoked 30 times logs each line once — **always use `ctx.Logger`**, never `Console.WriteLine`. +`ExecutionState` is lock-guarded because the batcher worker thread and concurrent parallel/map branches all +touch it. + +### Operations surface (`IDurableContext`) + +`StepAsync` (checkpointed code + retries), `WaitAsync` (1s–~1yr timer), `RunInChildContextAsync` (isolated +sub-workflow checkpointed as one `CONTEXT` op), `CreateCallbackAsync` / `WaitForCallbackAsync` (external +events; `WaitForCallback` is *composed* from child-context + callback + submitter step — see +`DurableContext.RunWaitForCallback`), `InvokeAsync` (durable-to-durable chained invoke, qualified ARN +required), and `ParallelAsync` / `MapAsync` (concurrent branches → `IBatchResult`). + +**Nesting (`NestingType`)** matters for parallel/map. `Nested` (default) gives each branch a full `CONTEXT` +checkpoint. `Flat` runs branches in *virtual* contexts that emit no `CONTEXT` op — inner ops re-parent to the +parallel/map op via `OperationIdGenerator.CreateVirtualChild(operationId, reportedParentId)`, trading trace +granularity for fewer checkpoints. The `idPrefix` vs `reportedParentId` split is the subtle part: inner IDs +always derive from the branch's own op ID (so siblings never collide), but are *reported* under the nearest +non-virtual ancestor (so they reference a parent that actually exists in the checkpoint store). 
+ +### Wire format (`Operation.cs`) + +`Operation` and its `*Details` types mirror the service envelope JSON exactly (`[JsonPropertyName]`). +String constants live in `OperationTypes` (STEP/WAIT/CALLBACK/CHAINED_INVOKE/CONTEXT/EXECUTION), +`OperationStatuses` (STARTED/SUCCEEDED/FAILED/PENDING/READY/CANCELLED/STOPPED/TIMED_OUT), and +`OperationSubTypes` (PascalCase finer classifier). Plural type names (`OperationTypes`, not `OperationType`) +intentionally avoid collision with `AWSSDK.Lambda` model enums. + +## Conventions + +- **Programming model:** preview supports only the *executable* model — `Main` builds a `LambdaBootstrap` + with a handler wrapper and an `ILambdaSerializer`. The serializer is read off `ILambdaContext.Serializer` + (a preview API; the project-wide `AWSLAMBDA001` suppression in the `.csproj` is intentional for that + reason). All step/result/payload (de)serialization flows through that one registered serializer, so AOT + and reflection callers share a single code path — there is no per-call `JsonSerializerContext` argument. +- **Errors:** durable exceptions carry `ErrorType`/`ErrorData`/`OriginalStackTrace` so a failure can be + reconstructed on replay when the live exception object is gone. `StepException`, `ChildContextException`, + `CallbackFailedException`/`CallbackTimeoutException`/`CallbackSubmitterException`, `ParallelException`, + `MapException`, and `NonDeterministicExecutionException` all derive from `DurableExecutionException`. + When adding error-mapping logic, handle *both* the fresh path (`InnerException` is the live exception) and + the replay path (`InnerException` is null, `ErrorType` carries the type string) — see + `DurableContext.MapWaitForCallbackException` for the pattern. +- **Public config types** (`StepConfig`, `WaitForCallbackConfig`, `ParallelConfig`, `MapConfig`, + `CompletionConfig`, etc.) are nullable optional args; resolve to an effective config inside the dispatcher. 
+- Inclusive language is enforced repo-wide (see the user's global rules): no master/slave, whitelist/blacklist. diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs new file mode 100644 index 000000000..b31873f67 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionConfig.cs @@ -0,0 +1,111 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Defines completion criteria for parallel/map operations. +/// +/// +/// Construct via the static factories (, +/// , ) or set the +/// individual properties directly. Multiple criteria combine: the operation +/// resolves as soon as any criterion is met (success short-circuit) or violated +/// (failure short-circuit). +/// +public sealed class CompletionConfig +{ + private int? _minSuccessful; + private int? _toleratedFailureCount; + private double? _toleratedFailurePercentage; + + /// + /// Minimum number of items required + /// before the operation resolves successfully. null = no minimum. + /// + /// + /// Thrown by the setter if the value is less than 1. A minimum of + /// zero (or negative) would resolve the operation immediately without + /// dispatching any branch. + /// + public int? MinSuccessful + { + get => _minSuccessful; + set + { + if (value is { } v && v < 1) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MinSuccessful must be at least 1."); + } + _minSuccessful = value; + } + } + + /// + /// Maximum tolerated count. When the + /// failure count strictly exceeds this value, the operation resolves + /// with . + /// null = no count-based failure threshold. + /// + /// + /// Thrown by the setter if the value is negative. A negative tolerance + /// would fail the operation immediately without dispatching any branch. + /// + public int? 
ToleratedFailureCount + { + get => _toleratedFailureCount; + set + { + if (value is { } v && v < 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailureCount must be zero or greater."); + } + _toleratedFailureCount = value; + } + } + + /// + /// Maximum tolerated failure ratio, expressed as a value in the range + /// 0.0 to 1.0 (inclusive). For example, 0.25 means + /// "tolerate up to 25% failures; fail when the failure ratio strictly + /// exceeds 25%". null = no ratio-based failure threshold. + /// + /// + /// Thrown by the setter if the value is NaN or outside [0.0, 1.0]. + /// + public double? ToleratedFailurePercentage + { + get => _toleratedFailurePercentage; + set + { + if (value is { } v && !(v >= 0.0 && v <= 1.0)) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "ToleratedFailurePercentage must be a ratio in [0.0, 1.0]."); + } + _toleratedFailurePercentage = value; + } + } + + /// + /// All items must succeed. Equivalent to + /// = 0. The default for + /// . + /// + public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 }; + + /// + /// Run every branch regardless of failures; surface failures per-item via + /// . Resolution does not auto-throw — + /// the caller can inspect the result and call + /// if they want strict-success + /// behavior. + /// + public static CompletionConfig AllCompleted() => new(); + + /// + /// Resolve once at least one branch has succeeded. Branches that were not + /// dispatched before the completion criteria were met are reported as + /// . 
+ /// + public static CompletionConfig FirstSuccessful() => new() { MinSuccessful = 1 }; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs new file mode 100644 index 000000000..ed40a1fc8 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/CompletionReason.cs @@ -0,0 +1,29 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Why a batch operation ( +/// or MapAsync) resolved. +/// +public enum CompletionReason +{ + /// + /// Every branch finished — no short-circuit + /// was triggered. Branches may be a mix of + /// and . + /// + AllCompleted, + + /// + /// branches succeeded; remaining + /// branches were left in . + /// + MinSuccessfulReached, + + /// + /// or + /// was exceeded. + /// The batch is considered failed and surfaces a + /// when awaited. + /// + FailureToleranceExceeded +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs new file mode 100644 index 000000000..c6e1cb6f0 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableBranch.cs @@ -0,0 +1,13 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// A named branch for +/// . +/// Names appear in execution traces and on the wire OperationUpdate.Name +/// field, and surface on . +/// +/// The branch's result type. +/// Human-readable branch name. Required. +/// The user function executed inside the branch's +/// child context. 
+public sealed record DurableBranch(string Name, Func> Func); diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs index ee5b1d1e6..54e30754a 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableContext.cs @@ -144,16 +144,8 @@ private Task RunChildContext( var operationId = _idGenerator.NextId(); - // Capture this DurableContext's collaborators; the child shares state, - // termination, batcher, ARN, and Lambda context — but uses a child - // OperationIdGenerator so its operation IDs are deterministically - // namespaced under the parent op ID. - IDurableContext ChildFactory(string parentOpId) => new DurableContext( - _state, _terminationManager, _idGenerator.CreateChild(parentOpId), - _durableExecutionArn, LambdaContext, _batcher); - var op = new ChildContextOperation( - operationId, name, _idGenerator.ParentId, func, config, serializer, ChildFactory, + operationId, name, _idGenerator.ParentId, func, config, serializer, MakeChildFactory(), _state, _terminationManager, _durableExecutionArn, _batcher); return op.ExecuteAsync(cancellationToken); } @@ -178,6 +170,98 @@ private Task> RunCallback( return op.ExecuteAsync(cancellationToken); } + public Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default) + => RunParallel(WrapToDurableBranches(branches), name, config, cancellationToken); + + public Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? 
config = null, + CancellationToken cancellationToken = default) + => RunParallel(branches, name, config, cancellationToken); + + private static IReadOnlyList> WrapToDurableBranches( + IReadOnlyList>> branches) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + + var result = new DurableBranch[branches.Count]; + for (var i = 0; i < branches.Count; i++) + { + var func = branches[i]; + if (func == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + // Default name is the index — surfaces in execution traces and on + // IBatchItem.Name. Users wanting custom names use the + // DurableBranch overload. + result[i] = new DurableBranch(i.ToString(System.Globalization.CultureInfo.InvariantCulture), func); + } + return result; + } + + private Task> RunParallel( + IReadOnlyList> branches, + string? name, + ParallelConfig? config, + CancellationToken cancellationToken) + { + if (branches == null) throw new ArgumentNullException(nameof(branches)); + for (var i = 0; i < branches.Count; i++) + { + if (branches[i] == null) + throw new ArgumentException($"Branch at index {i} is null.", nameof(branches)); + if (branches[i].Func == null) + throw new ArgumentException($"Branch at index {i} has a null Func.", nameof(branches)); + } + + var effectiveConfig = config ?? new ParallelConfig(); + + // Resolve the serializer through the shared helper, exactly as RunMap + // does, so ParallelAsync and MapAsync report a missing serializer the + // same way and the guidance text lives in one place instead of being + // duplicated inline here. 
+ var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new Internal.ParallelOperation( + operationId, name, _idGenerator.ParentId, branches, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + + public Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default) + => RunMap(items, func, name, config, cancellationToken); + + private Task> RunMap( + IReadOnlyList items, + Func, Task> func, + string? name, + MapConfig? config, + CancellationToken cancellationToken) + { + if (items == null) throw new ArgumentNullException(nameof(items)); + if (func == null) throw new ArgumentNullException(nameof(func)); + + var effectiveConfig = config ?? new MapConfig(); + + var serializer = LambdaSerializerHelper.GetRequired(LambdaContext); + + var operationId = _idGenerator.NextId(); + var op = new Internal.MapOperation( + operationId, name, _idGenerator.ParentId, items, func, effectiveConfig, serializer, MakeChildFactory(), + _state, _terminationManager, _durableExecutionArn, _batcher); + return op.ExecuteAsync(cancellationToken); + } + public Task WaitForCallbackAsync( Func submitter, string? name = null, @@ -390,6 +474,42 @@ private Task RunInvoke( _state, _terminationManager, _durableExecutionArn, _batcher); return op.ExecuteAsync(cancellationToken); } + + /// + /// Builds the factory used by (and + /// each branch) to construct + /// the inner . The child shares state, + /// termination, batcher, ARN, and Lambda context — but uses a child + /// so its operation IDs are + /// deterministically namespaced under the parent op ID. 
+ /// + /// + /// Builds the factory each operation uses to create the inner + /// its user function runs against. + /// + /// + /// The delegate takes (operationId, reportedParentId, isVirtual): + /// + /// isVirtual == false (the default child-context case): the + /// inner context's ID space and reported parent both root at + /// operationId via ; + /// reportedParentId is ignored. + /// isVirtual == true (a branch): + /// inner-op IDs still root at operationId (so sibling branches + /// never collide), but inner ops report reportedParentId — the + /// parallel/map operation — as their parent, since the virtual branch + /// emits no CONTEXT checkpoint to reference. + /// + /// + private Func MakeChildFactory() + { + return (operationId, reportedParentId, isVirtual) => new DurableContext( + _state, _terminationManager, + isVirtual + ? _idGenerator.CreateVirtualChild(operationId, reportedParentId) + : _idGenerator.CreateChild(operationId), + _durableExecutionArn, LambdaContext, _batcher); + } } internal sealed class WaitForCallbackContext : IWaitForCallbackContext diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs index 7f8707966..e4748b381 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/DurableExecutionException.cs @@ -98,3 +98,69 @@ public ChildContextException(string message) : base(message) { } /// Creates a wrapping an inner exception. public ChildContextException(string message, Exception innerException) : base(message, innerException) { } } + +/// +/// Thrown when a parallel operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-branch outcomes. +/// +/// +/// This is the base type for parallel failures. 
Subclasses may be added in +/// future releases (for example, a dedicated +/// ParallelFailureToleranceExceededException); catching +/// remains forward-compatible. +/// +public class ParallelException : DurableExecutionException +{ + /// + /// The aggregate result of the parallel operation. Type-erased — cast to + /// IBatchResult<T> if the per-branch result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the parallel operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public ParallelException() { } + /// Creates a with the given message. + public ParallelException(string message) : base(message) { } + /// Creates a wrapping an inner exception. + public ParallelException(string message, Exception innerException) : base(message, innerException) { } +} + +/// +/// Thrown when a map operation resolves with +/// . The aggregate +/// is preserved on so callers +/// can inspect per-item outcomes. +/// +/// +/// This is the base type for map failures. Subclasses may be added in future +/// releases; catching remains forward-compatible. +/// A dedicated type (rather than reusing ) lets +/// callers pattern-match which concurrent operation failed. +/// +public class MapException : DurableExecutionException +{ + /// + /// The aggregate result of the map operation. Type-erased — cast to + /// IBatchResult<T> if the per-item result type is known. + /// + public IBatchResult? Result { get; init; } + + /// + /// Why the map operation resolved. + /// + public CompletionReason CompletionReason { get; init; } + + /// Creates an empty . + public MapException() { } + /// Creates a with the given message. + public MapException(string message) : base(message) { } + /// Creates a wrapping an inner exception. 
+ public MapException(string message, Exception innerException) : base(message, innerException) { } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs new file mode 100644 index 000000000..62814fd62 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchItem.cs @@ -0,0 +1,38 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// One item inside an IBatchResult<T> — the outcome of a single +/// branch (parallel) or item (map). +/// +/// The branch/item result type. +public interface IBatchItem +{ + /// + /// Zero-based position in the original branches/items list. Stable across + /// replays. + /// + int Index { get; } + + /// + /// Optional human-readable name for this branch/item. + /// Surfaces on the wire OperationUpdate.Name field for observability. + /// + string? Name { get; } + + /// + /// Status of this item at the moment the batch resolved. + /// + BatchItemStatus Status { get; } + + /// + /// The branch/item result. Populated only when Status is + /// BatchItemStatus.Succeeded. + /// + T? Result { get; } + + /// + /// The branch/item failure. Populated only when Status is + /// BatchItemStatus.Failed. + /// + DurableExecutionException? Error { get; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs new file mode 100644 index 000000000..90d7e14b7 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IBatchResult.cs @@ -0,0 +1,90 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Non-generic marker for IBatchResult<T>. Used by +/// ParallelException and MapException so callers can hold a reference to +/// the aggregate result without knowing the per-branch type at compile time. +/// +public interface IBatchResult +{ + /// + /// Why the batch resolved. + /// + CompletionReason CompletionReason { get; } + + /// True if any item is in BatchItemStatus.Failed. + bool HasFailure { get; } + + /// Number of items in BatchItemStatus.Succeeded. + int SuccessCount { get; } + + /// Number of items in BatchItemStatus.Failed.
+ int FailureCount { get; } + + /// Number of items in BatchItemStatus.Started. + int StartedCount { get; } + + /// Total number of items. + int TotalCount { get; } +} + +/// +/// Result of a parallel or map operation. Aggregates the per-branch/per-item +/// outcomes, completion bookkeeping, and convenience accessors. +/// +/// The per-branch/per-item result type. +/// +/// The result is reconstructed from checkpoints — the aggregate is never serialized as a +/// single blob in user T. Per-unit results live on the children's own CONTEXT checkpoints +/// (NestingType.Nested) or inline on the parent operation's payload (NestingType.Flat); this type assembles them. +/// +public interface IBatchResult : IBatchResult +{ + /// + /// All items, in original index order. + /// + IReadOnlyList> All { get; } + + /// + /// Items whose Status is + /// BatchItemStatus.Succeeded, in original index order. + /// + IReadOnlyList> Succeeded { get; } + + /// + /// Items whose Status is + /// BatchItemStatus.Failed, in original index order. + /// + IReadOnlyList> Failed { get; } + + /// + /// Items that were not dispatched when the batch resolved (a + /// CompletionConfig short-circuit fired before they were started), + /// in original index order. + /// + IReadOnlyList> Started { get; } + + /// + /// Returns the results of every successful item, in original index order. + /// + /// + /// Items in Failed or Started are skipped — this + /// method never throws on partial-failure batches. Use + /// ThrowIfError if you want a strict-success accessor. + /// + IReadOnlyList GetResults(); + + /// + /// Returns the errors for every failed item, in original index order. + /// + IReadOnlyList GetErrors(); + + /// + /// Throws the first failed item's Error if any + /// item failed; no-op otherwise. + /// + /// + /// The first failed item's error.
+ /// + void ThrowIfError(); +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs index 5904f84e4..a031120fd 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/IDurableContext.cs @@ -177,6 +177,75 @@ Task InvokeAsync( string? name = null, InvokeConfig? config = null, CancellationToken cancellationToken = default); + + /// + /// Execute multiple branches concurrently. Each branch runs inside its own + /// child context; per-branch results are aggregated into an + /// . Branches are dispatched up to + /// ; the aggregate resolves + /// according to . + /// + /// + /// On per-branch failure (a branch's user function throws), the failure is + /// captured on the corresponding instead of + /// aborting the parallel. The parallel only throws + /// when + /// criteria are violated. Use + /// for explicit strict-success + /// semantics. Per-branch results are serialized to checkpoints using the + /// registered on + /// (typically configured via + /// LambdaBootstrapBuilder.Create(handler, serializer)). + /// + Task> ParallelAsync( + IReadOnlyList>> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Execute multiple named branches concurrently. Names appear in execution + /// traces and on . + /// + /// + /// Per-branch results are serialized to checkpoints using the + /// registered on + /// . + /// + Task> ParallelAsync( + IReadOnlyList> branches, + string? name = null, + ParallelConfig? config = null, + CancellationToken cancellationToken = default); + + /// + /// Process a collection of items concurrently, running + /// once per item. Each item runs inside its own child context; per-item + /// results are aggregated into an . Items + /// are dispatched up to ; the aggregate + /// resolves according to . 
+ /// + /// + /// The per-item function receives the durable context, the item, its + /// zero-based index, and the full source list (matching the Python and + /// JavaScript SDKs). On per-item failure (the user function throws), the + /// failure is captured on the corresponding + /// instead of aborting the map. By default + /// () every item runs and failures + /// surface via ; the map throws + /// only when + /// criteria are violated. Use + /// for explicit + /// strict-success semantics. Per-item results are serialized to checkpoints + /// using the registered on + /// . + /// + Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? config = null, + CancellationToken cancellationToken = default); } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs new file mode 100644 index 000000000..5c9dda77c --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchItem.cs @@ -0,0 +1,15 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation produced by +/// when assembling the +/// . +/// +internal sealed class BatchItem : IBatchItem +{ + public required int Index { get; init; } + public required string? Name { get; init; } + public required BatchItemStatus Status { get; init; } + public T? Result { get; init; } + public DurableExecutionException? 
Error { get; init; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs new file mode 100644 index 000000000..db97f02c1 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchJsonContext.cs @@ -0,0 +1,16 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// AOT-friendly for the internal +/// payload stored on a concurrent operation's parent +/// CONTEXT checkpoint (parallel or map). Only this internal type — never user T — +/// flows through here, so the source-generated metadata is sufficient. +/// +[JsonSerializable(typeof(BatchSummary))] +[JsonSerializable(typeof(BatchUnitSummary))] +[JsonSerializable(typeof(ErrorObject))] +internal sealed partial class BatchJsonContext : JsonSerializerContext +{ +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs new file mode 100644 index 000000000..362303a0e --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchResult.cs @@ -0,0 +1,80 @@ +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Default implementation. Computes derived views +/// ( / / ) +/// eagerly so consumers don't pay for re-filtering on every access. 
+/// +internal sealed class BatchResult : IBatchResult +{ + public BatchResult(IReadOnlyList> all, CompletionReason completionReason) + { + All = all; + CompletionReason = completionReason; + + var succeeded = new List>(); + var failed = new List>(); + var started = new List>(); + + foreach (var item in all) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded.Add(item); break; + case BatchItemStatus.Failed: failed.Add(item); break; + case BatchItemStatus.Started: started.Add(item); break; + } + } + + Succeeded = succeeded; + Failed = failed; + Started = started; + } + + public IReadOnlyList> All { get; } + public IReadOnlyList> Succeeded { get; } + public IReadOnlyList> Failed { get; } + public IReadOnlyList> Started { get; } + public CompletionReason CompletionReason { get; } + + public bool HasFailure => Failed.Count > 0; + + public int SuccessCount => Succeeded.Count; + public int FailureCount => Failed.Count; + public int StartedCount => Started.Count; + public int TotalCount => All.Count; + + public IReadOnlyList GetResults() + { + var list = new List(Succeeded.Count); + foreach (var item in Succeeded) + { + // Result is non-null on success items by construction; the BCL-typed + // index is preserved by walking Succeeded (already in original order). + list.Add(item.Result!); + } + return list; + } + + public IReadOnlyList GetErrors() + { + var list = new List(Failed.Count); + foreach (var item in Failed) + { + // Error is non-null on failure items by construction. 
+ list.Add(item.Error!); + } + return list; + } + + public void ThrowIfError() + { + foreach (var item in All) + { + if (item.Status == BatchItemStatus.Failed && item.Error != null) + { + throw item.Error; + } + } + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs new file mode 100644 index 000000000..b118ce558 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/BatchSummary.cs @@ -0,0 +1,56 @@ +using System.Text.Json.Serialization; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Internal payload shape stored on a concurrent operation's parent CONTEXT +/// checkpoint (as ContextDetails.Result) and reconstructed on replay. +/// Shared by both and +/// : carries the completion reason and +/// the per-unit index → status map so the can be +/// rebuilt without depending on user T shape. +/// +/// +/// Under per-unit results live on the children's +/// own CONTEXT checkpoints and only (plus +/// index/name) is recorded here. Under the +/// children emit no checkpoint, so each unit's serialized result +/// () or error +/// () is recorded inline here and read back +/// on replay. +/// +internal sealed class BatchSummary +{ + [JsonPropertyName("CompletionReason")] + public string? CompletionReason { get; set; } + + [JsonPropertyName("Units")] + public IList Units { get; set; } = new List(); +} + +internal sealed class BatchUnitSummary +{ + [JsonPropertyName("Index")] + public int Index { get; set; } + + [JsonPropertyName("Name")] + public string? Name { get; set; } + + [JsonPropertyName("Status")] + public string? Status { get; set; } + + /// + /// Serialized per-unit result, recorded inline only for + /// succeeded units (where no child checkpoint + /// exists to read it from). null under . + /// + [JsonPropertyName("Result")] + public string? 
Result { get; set; } + + /// + /// Per-unit error, recorded inline only for + /// failed units. null under . + /// + [JsonPropertyName("Error")] + public ErrorObject? Error { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs index 800d55bcf..1937f6312 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcher.cs @@ -33,6 +33,36 @@ internal sealed class CheckpointBatcher : IAsyncDisposable private Exception? _terminalError; private int _disposed; + // Per-update wire-footprint estimate constants. Deliberate over-estimates: + // flushing slightly early is safe, flushing late risks a request-too-large. + private const int PerOpEnvelopeOverheadBytes = 512; + private const int StackFrameOverheadBytes = 8; + + /// + /// Cheap UTF-8 byte estimate of one update's wire footprint — variable string + /// fields plus a fixed envelope. No JSON is produced (AOT-safe). Payload is + /// counted at 2x because it is already-serialized JSON re-escaped as a string + /// value, which roughly doubles for escape-heavy content. + /// + private static int EstimateUpdateBytes(SdkOperationUpdate u) + { + var size = PerOpEnvelopeOverheadBytes; + // int arithmetic is safe: payloads are bounded by the 6MB Lambda + // invocation-payload cap, so the 2x multiply can never overflow a 32-bit int. 
+ if (u.Payload != null) size += System.Text.Encoding.UTF8.GetByteCount(u.Payload) * 2; + size += ByteCount(u.Id) + ByteCount(u.ParentId) + ByteCount(u.Name); + if (u.Error != null) + { + size += ByteCount(u.Error.ErrorType) + ByteCount(u.Error.ErrorMessage) + ByteCount(u.Error.ErrorData); + if (u.Error.StackTrace != null) + foreach (var line in u.Error.StackTrace) + size += ByteCount(line) + StackFrameOverheadBytes; + } + return size; + } + + private static int ByteCount(string? s) => s == null ? 0 : System.Text.Encoding.UTF8.GetByteCount(s); + public CheckpointBatcher( string? initialCheckpointToken, Func, CancellationToken, Task> flushAsync, @@ -113,25 +143,43 @@ public async ValueTask DisposeAsync() private async Task RunWorkerAsync(CancellationToken shutdownToken) { - // TODO: also enforce _config.MaxBatchBytes here. Today we only cap by - // operation count; an item whose serialized size pushes the batch over - // ~750 KB will be sent and rejected service-side. See CheckpointBatcherConfig. - var batch = new List(_config.MaxBatchOperations); + // Both caps are enforced: before adding an item that would push the batch + // over MaxBatchOperations OR MaxBatchBytes, the current batch is flushed. + // A lone item already over the byte cap is sent by itself (never loops). + // The byte accumulator is seeded with a fixed reserve covering the request + // prefix (checkpoint token + ARN + array framing) that the per-update + // estimate does not include. 
+ const int RequestEnvelopeReserveBytes = 4 * 1024; + var batch = new PendingBatch(_config.MaxBatchOperations); + + async Task AddItemAsync(BatchItem item) + { + var itemBytes = EstimateUpdateBytes(item.Update); + if (batch.Count > 0 && + (batch.Count + 1 > _config.MaxBatchOperations || + RequestEnvelopeReserveBytes + batch.Bytes + itemBytes > _config.MaxBatchBytes)) + { + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + + batch.Add(item); + + // Lone item already over the cap: send it alone, do not loop. + if (batch.Count == 1 && + RequestEnvelopeReserveBytes + batch.Bytes > _config.MaxBatchBytes) + { + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); + batch.Clear(); + } + } try { while (await _channel.Reader.WaitToReadAsync(shutdownToken).ConfigureAwait(false)) { - // Drain everything currently queued. while (_channel.Reader.TryRead(out var item)) - { - batch.Add(item); - if (batch.Count >= _config.MaxBatchOperations) - { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); - batch.Clear(); - } - } + await AddItemAsync(item).ConfigureAwait(false); // Optionally wait for late arrivals to coalesce into one batch. 
if (_config.FlushInterval > TimeSpan.Zero && batch.Count > 0) @@ -143,14 +191,7 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) while (await _channel.Reader.WaitToReadAsync(windowCts.Token).ConfigureAwait(false)) { while (_channel.Reader.TryRead(out var item)) - { - batch.Add(item); - if (batch.Count >= _config.MaxBatchOperations) - { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); - batch.Clear(); - } - } + await AddItemAsync(item).ConfigureAwait(false); } } catch (OperationCanceledException) when (!shutdownToken.IsCancellationRequested) @@ -161,7 +202,7 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) if (batch.Count > 0) { - await FlushBatchAsync(batch, shutdownToken).ConfigureAwait(false); + await FlushBatchAsync(batch.Items, shutdownToken).ConfigureAwait(false); batch.Clear(); } } @@ -179,9 +220,9 @@ private async Task RunWorkerAsync(CancellationToken shutdownToken) } finally { - // Anything left in the channel after the worker exits — fail it. + // Anything left in the batch/channel after the worker exits — fail it. var failure = Volatile.Read(ref _terminalError) ?? new ObjectDisposedException(nameof(CheckpointBatcher)); - foreach (var leftover in batch) + foreach (var leftover in batch.Items) leftover.Completion.TrySetException(failure); while (_channel.Reader.TryRead(out var item)) item.Completion.TrySetException(failure); @@ -214,5 +255,17 @@ private async Task FlushBatchAsync(IReadOnlyList batch, CancellationT } } + /// Accumulates a batch plus its estimated byte footprint so the two + /// never drift across the worker's add/flush/clear sites. 
+ private sealed class PendingBatch + { + public readonly List Items; + public long Bytes; + public PendingBatch(int capacity) { Items = new List(capacity); } + public int Count => Items.Count; + public void Add(BatchItem item) { Items.Add(item); Bytes += EstimateUpdateBytes(item.Update); } + public void Clear() { Items.Clear(); Bytes = 0; } + } + private readonly record struct BatchItem(SdkOperationUpdate Update, TaskCompletionSource Completion); } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs index 88913e868..81dc85d45 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/CheckpointBatcherConfig.cs @@ -22,15 +22,13 @@ internal sealed class CheckpointBatcherConfig public int MaxBatchOperations { get; init; } = 200; /// - /// Maximum batch size in bytes. Service-side limit is ~750 KB. + /// Maximum batch size in bytes. Service-side request limit is ~750 KB. /// /// - /// TODO: not enforced today. The worker only checks ; - /// a single oversized item (or a batch whose serialized size exceeds 750 KB) - /// will be sent to the service and rejected there. Wire this in alongside - /// the async-flush operations (Map / Parallel / child-context) since those - /// are the scenarios that can actually fill a batch — today every batch is - /// 1 item with = Zero, so the gap is latent. + /// Enforced by the worker: it flushes the current batch before adding an item + /// that would push the estimated request size over this cap, and sends a lone + /// item that already exceeds the cap by itself. The per-update estimate plus a + /// fixed request-prefix reserve approximate the real wire size conservatively. 
/// - internal int MaxBatchBytes { get; init; } = 750 * 1024; + public int MaxBatchBytes { get; init; } = 750 * 1024; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs index a0abbf99e..c7472bcbf 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ChildContextOperation.cs @@ -5,6 +5,7 @@ using System.Text; using Amazon.Lambda; using Amazon.Lambda.Core; +using SdkContextOptions = Amazon.Lambda.Model.ContextOptions; using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; @@ -24,6 +25,10 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// and throw . /// SUCCEEDED: return cached deserialized result; user func is /// NOT re-executed. +/// SUCCEEDED (overflow): ReplayChildren=true + empty +/// payload (the result was too large to checkpoint inline) → re-run the +/// user func to recover the large result value; terminal checkpoints +/// (SUCCEED/FAIL) are suppressed since the op is already terminal. /// FAILED: throw with the /// recorded error; if is /// set, the mapped exception is thrown instead. @@ -41,7 +46,10 @@ internal sealed class ChildContextOperation : DurableOperation private readonly Func> _func; private readonly ChildContextConfig? _config; private readonly ILambdaSerializer _serializer; - private readonly Func _childContextFactory; + private readonly Func _childContextFactory; + private readonly bool _isVirtual; + // Set once on overflow-replay re-execution; never reset. + private bool _suppressTerminalCheckpoint; public ChildContextOperation( string operationId, @@ -50,35 +58,45 @@ public ChildContextOperation( Func> func, ChildContextConfig? 
config, ILambdaSerializer serializer, - Func childContextFactory, + Func childContextFactory, ExecutionState state, TerminationManager termination, string durableExecutionArn, - CheckpointBatcher? batcher = null) + CheckpointBatcher? batcher = null, + bool isVirtual = false) : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) { _func = func; _config = config; _serializer = serializer; _childContextFactory = childContextFactory; + _isVirtual = isVirtual; } protected override string OperationType => OperationTypes.Context; protected override async Task StartAsync(CancellationToken cancellationToken) { - // Sync-flush CONTEXT START before user code so the service has a record - // of the parent context if the inner func suspends (e.g. a Wait inside - // the child terminates the workflow before SUCCEED is reached). - await EnqueueAsync(new SdkOperationUpdate + // Virtual (NestingType.Flat) branches emit no CONTEXT checkpoint of their + // own — the parallel/map orchestrator records their outcome inline on the + // parent payload. Inner operations still checkpoint (re-parented to the + // non-virtual ancestor via the virtual child generator's reported + // ParentId), so a suspend inside a virtual branch is still recoverable. + if (!_isVirtual) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.START, - SubType = _config?.SubType, - Name = Name - }, cancellationToken); + // Sync-flush CONTEXT START before user code so the service has a record + // of the parent context if the inner func suspends (e.g. a Wait inside + // the child terminates the workflow before SUCCEED is reached). 
+ await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = _config?.SubType, + Name = Name + }, cancellationToken); + } return await ExecuteFunc(cancellationToken); } @@ -88,6 +106,14 @@ protected override Task ReplayAsync(Operation existing, CancellationToken can switch (existing.Status) { case OperationStatuses.Succeeded: + // Overflow: the result was too large to checkpoint inline + // (ReplayChildren=true, empty payload). Re-run the body to recover + // the value; the body's inner ops replay from their own + // checkpoints. Do NOT re-emit the (already terminal) SUCCEED. + if (existing.ContextDetails?.ReplayChildren == true) + { + return ExecuteFuncNoCheckpoint(cancellationToken); + } // Side-effecting code runs at most once: replay returns the // cached result without invoking the user func. return Task.FromResult(DeserializeResult(existing.ContextDetails?.Result)); @@ -110,11 +136,21 @@ protected override Task ReplayAsync(Operation existing, CancellationToken can } } + private Task ExecuteFuncNoCheckpoint(CancellationToken cancellationToken) + { + _suppressTerminalCheckpoint = true; + return ExecuteFunc(cancellationToken); + } + private async Task ExecuteFunc(CancellationToken cancellationToken) { cancellationToken.ThrowIfCancellationRequested(); - var childContext = _childContextFactory(OperationId); + // For a virtual (Flat) branch, inner operations report this branch's own + // ParentId — the non-virtual parallel/map ancestor — since the branch + // itself emits no CONTEXT checkpoint to reference. For a normal child + // context the reported parent is ignored (it roots at OperationId). 
+ var childContext = _childContextFactory(OperationId, ParentId, _isVirtual); T result; try @@ -144,16 +180,25 @@ private async Task ExecuteFunc(CancellationToken cancellationToken) } catch (Exception ex) { - await EnqueueAsync(new SdkOperationUpdate + // Virtual branches suppress the FAIL checkpoint but still propagate + // the exception — the orchestrator records the failure inline on the + // parent payload. Overflow-replay re-execution also suppresses it: the + // op is already terminal (SUCCEEDED) in the store, so re-emitting a + // FAIL would corrupt that record (mirrors ReplayChildrenAsync, which + // never re-checkpoints). The exception still propagates below. + if (!_isVirtual && !_suppressTerminalCheckpoint) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.FAIL, - SubType = _config?.SubType, - Name = Name, - Error = ToSdkError(ex) - }, cancellationToken); + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.FAIL, + SubType = _config?.SubType, + Name = Name, + Error = ToSdkError(ex) + }, cancellationToken); + } throw MapFailureException(new ChildContextException(ex.Message, ex) { @@ -163,16 +208,33 @@ await EnqueueAsync(new SdkOperationUpdate }); } - await EnqueueAsync(new SdkOperationUpdate + // Virtual branches suppress the SUCCEED checkpoint; the orchestrator + // serializes the result inline on the parent payload instead. + // _suppressTerminalCheckpoint is set on overflow replay re-execution: the + // child is already terminal in the store, so we re-run only to recover the + // in-memory value and must NOT re-emit a SUCCEED. 
+ if (!_isVirtual && !_suppressTerminalCheckpoint) { - Id = OperationId, - ParentId = ParentId, - Type = OperationTypes.Context, - Action = OperationAction.SUCCEED, - SubType = _config?.SubType, - Name = Name, - Payload = SerializeResult(result) - }, cancellationToken); + var serialized = SerializeResult(result); + // Overflow: result too large to checkpoint inline. Emit an empty + // payload + ReplayChildren so replay re-executes this body to recover + // the value (mirrors the concurrent-operation overflow strategy). + var overflow = Encoding.UTF8.GetByteCount(serialized) > DurableConstants.MaxOperationCheckpointBytes; + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + ParentId = ParentId, + Type = OperationTypes.Context, + Action = OperationAction.SUCCEED, + SubType = _config?.SubType, + Name = Name, + Payload = overflow ? string.Empty : serialized, + ContextOptions = overflow + ? new SdkContextOptions { ReplayChildren = true } + : null + }, cancellationToken); + } return result; } diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs new file mode 100644 index 000000000..83edc1632 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ConcurrentOperation.cs @@ -0,0 +1,906 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using System.IO; +using System.Text; +using System.Text.Json; +using Amazon.Lambda; +using Amazon.Lambda.Core; +using SdkContextOptions = Amazon.Lambda.Model.ContextOptions; +using SdkErrorObject = Amazon.Lambda.Model.ErrorObject; +using SdkOperationUpdate = Amazon.Lambda.Model.OperationUpdate; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Shared orchestration base for the concurrent durable operations +/// ( and ). 
+/// Runs N user-supplied units concurrently (each as a +/// ) under a shared +/// and concurrency limit, persisting the +/// aggregate result so subsequent invocations replay it without re-executing. +/// +/// +/// Subclasses supply only what differs between Parallel and Map — the unit count, +/// how to obtain a unit's (name, func), the parent/child sub-type labels, +/// and the failure-exception factory. All concurrency, completion, checkpoint, and +/// replay logic lives here. +/// +/// Fresh: no prior state → sync-flush parent CONTEXT START → +/// dispatch units respecting MaxConcurrency → wait for in-flight to +/// complete after CompletionConfig short-circuit → emit parent CONTEXT +/// SUCCEED with summary payload (). +/// SUCCEEDED: parent payload supplies the snapshot of per-unit +/// statuses + completion reason; per-unit results are deserialised from the +/// children's own CONTEXT checkpoints. +/// FAILED: same reconstruction; throws the subclass exception +/// carrying the rebuilt . +/// STARTED / PENDING: re-execute (children replay from their +/// own checkpoints). +/// +/// Per-unit errors do NOT abort the operation directly — the orchestrator catches +/// each unit's , records it as a failed +/// , and consults the +/// after every completion. Only when the completion config marks the run as +/// does it throw. +/// +internal abstract class ConcurrentOperation : DurableOperation> +{ + private readonly CompletionConfig _completionConfig; + private readonly int? _maxConcurrency; + + /// + /// True for : per-unit child contexts emit no + /// CONTEXT checkpoint, so their results/errors are recorded inline on this + /// parent operation's payload and read back from + /// there on replay. + /// + private readonly bool _isVirtual; + + /// Serializer used to deserialize per-unit child results on replay. + protected readonly ILambdaSerializer Serializer; + + /// Factory used to build each unit's inner child context. 
Takes + /// (operationId, reportedParentId, isVirtual). + protected readonly Func ChildContextFactory; + + protected ConcurrentOperation( + string operationId, + string? name, + string? parentId, + CompletionConfig completionConfig, + int? maxConcurrency, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null, + bool isVirtual = false) + : base(operationId, name, parentId, state, termination, durableExecutionArn, batcher) + { + _completionConfig = completionConfig; + _maxConcurrency = maxConcurrency; + Serializer = serializer; + ChildContextFactory = childContextFactory; + _isVirtual = isVirtual; + } + + protected override string OperationType => OperationTypes.Context; + + // ── Subclass hooks ────────────────────────────────────────────────── + + /// The number of units (branches or items) to execute. + protected abstract int UnitCount { get; } + + /// Parent CONTEXT sub-type label (e.g. Parallel / Map). + protected abstract string ParentSubType { get; } + + /// Per-unit child-context sub-type label (e.g. ParallelBranch / MapItem). + protected abstract string ChildSubType { get; } + + /// Singular operation noun used in messages (e.g. "Parallel" / "Map"). + protected abstract string OperationNoun { get; } + + /// Plural unit noun used in messages (e.g. "branches" / "items"). + protected abstract string UnitNounPlural { get; } + + /// + /// Resolves the unit at into its display name and the + /// function to run inside the unit's child context. + /// + protected abstract (string? Name, Func> Func) GetUnit(int index); + + /// + /// Builds the subclass-specific exception thrown when the operation resolves + /// with . 
+ /// + protected abstract DurableExecutionException CreateException(string message, IBatchResult result); + + // ── Orchestration ─────────────────────────────────────────────────── + + protected override async Task> StartAsync(CancellationToken cancellationToken) + { + // Sync-flush parent CONTEXT START. Mirrors ChildContextOperation: if a + // unit suspends (e.g., a Wait inside it), the service needs to know the + // parent existed. + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = OperationAction.START, + SubType = ParentSubType, + Name = Name + }, cancellationToken); + + return await ExecuteUnitsAsync(cancellationToken); + } + + protected override Task> ReplayAsync(Operation existing, CancellationToken cancellationToken) + { + // Overflow replay: the parent was checkpointed with a stripped summary and + // ReplayChildren=true because the inline results exceeded the checkpoint + // limit. Re-execute ONLY the units the frozen summary marks SUCCEEDED or + // FAILED to recover their stripped result VALUE / Error; units marked + // STARTED (short-circuited, never dispatched) are skipped. Per-unit status + // and completion reason stay authoritative from the frozen summary, and the + // parent — already terminal — is NOT re-checkpointed. + var replayChildren = existing.ContextDetails?.ReplayChildren == true + && (existing.Status == OperationStatuses.Succeeded + || existing.Status == OperationStatuses.Failed); + + switch (existing.Status) + { + case OperationStatuses.Succeeded when replayChildren: + case OperationStatuses.Failed when replayChildren: + return ReplayChildrenAsync(existing, cancellationToken); + + case OperationStatuses.Succeeded: + return Task.FromResult(ReconstructFromCheckpoints(existing, throwOnFailure: false)); + + case OperationStatuses.Failed: + // Reconstruct so the caller (and the exception's Result) sees the + // per-unit outcomes; then throw. 
+ var failed = ReconstructFromCheckpoints(existing, throwOnFailure: false); + throw BuildException(failed); + + case OperationStatuses.Started: + case OperationStatuses.Pending: + // Re-run: units replay from their own checkpoints. + return ExecuteUnitsAsync(cancellationToken); + + default: + throw new NonDeterministicExecutionException( + $"{OperationNoun} operation '{Name ?? OperationId}' has unexpected status '{existing.Status}' on replay."); + } + } + + private async Task> ExecuteUnitsAsync(CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var unitCount = UnitCount; + var slots = new UnitOutcome[unitCount]; + var dispatched = new bool[unitCount]; + + var maxConcurrency = _maxConcurrency ?? unitCount; + // Optimisation: when MaxConcurrency >= unitCount, skip the semaphore + // entirely. Behaviour is identical, allocations are lower. (Also covers + // the empty-collection case, where unitCount == 0 and no unit runs.) + var semaphore = (maxConcurrency >= unitCount || unitCount == 0) + ? null + : new SemaphoreSlim(maxConcurrency, maxConcurrency); + + var minSuccessful = _completionConfig.MinSuccessful; + var toleratedFailureCount = _completionConfig.ToleratedFailureCount; + var toleratedFailurePercentage = _completionConfig.ToleratedFailurePercentage; + + var succeeded = 0; + var failed = 0; + + var inFlight = new List(unitCount); + + // Units run with the parent's token so cooperative cancellation still + // propagates into user code, but we must NOT abandon already-dispatched + // units while they're still writing checkpoints — that would diverge + // between the original run and replay. The dispatch loop and + // Task.WhenAll below therefore await every in-flight task even when + // cancellation fires; the semaphore is disposed only after those units + // have settled (success, failure, or cooperative OCE). 
+ try + { + try + { + for (var i = 0; i < unitCount; i++) + { + // Volatile reads pair with the Interlocked.Increment writes + // in the onComplete callback. Reads are non-atomic across + // the two counters: at worst we observe slightly stale + // values and dispatch one extra unit before the next + // completion forces a re-check. That's acceptable — the + // post-loop ComputeCompletionReason is the source of truth. + var succSnap = Volatile.Read(ref succeeded); + var failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + break; + } + + if (semaphore != null) + { + await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false); + // Re-check after acquiring: the wait may have unblocked + // because earlier units finished and short-circuited the + // operation. + succSnap = Volatile.Read(ref succeeded); + failSnap = Volatile.Read(ref failed); + if (ShouldStopDispatching(succSnap, failSnap, unitCount, + minSuccessful, toleratedFailureCount, toleratedFailurePercentage)) + { + semaphore.Release(); + break; + } + } + + var index = i; + dispatched[index] = true; + inFlight.Add(RunUnitAsync(index, slots, semaphore, cancellationToken, + onComplete: outcome => + { + if (outcome.Status == BatchItemStatus.Succeeded) + Interlocked.Increment(ref succeeded); + else if (outcome.Status == BatchItemStatus.Failed) + Interlocked.Increment(ref failed); + })); + } + } + finally + { + // CRITICAL: wait for every dispatched unit — even on the + // exceptional path (parent-token cancellation mid-dispatch, or a + // synchronous throw out of the loop) — before the semaphore is + // disposed. Otherwise surviving units' Release() calls hit + // ObjectDisposedException, the tasks become unobserved, and they + // keep writing checkpoints out from under us. 
+ // + // We deliberately DO NOT cancel already-running units when a + // short-circuit fires — orphan units that continue writing + // checkpoints would diverge between the original run and replay. + // Letting them finish guarantees determinism: all dispatched units + // end up Succeeded or Failed. Only un-dispatched units surface as + // Started. + if (inFlight.Count > 0) + { + try + { + await Task.WhenAll(inFlight).ConfigureAwait(false); + } + catch + { + // Swallow here — Task.WhenAll only surfaces the first + // exception, but every unit task is now in a terminal + // state and we want to inspect each one individually below + // to decide whether to surface a workflow-level error. The + // Task objects themselves still carry their exceptions, so + // this swallow does not orphan them. + } + } + } + } + finally + { + semaphore?.Dispose(); + } + + // Surface any workflow-level exception (e.g. NonDeterministicExecutionException) + // raised inside a unit. RunUnitAsync re-throws DurableExecutionException + // (other than ChildContextException which is captured into the slot) so the + // task faults with that exception. Take the first such failure: these are + // structural errors, not "unit failed gracefully" outcomes. + foreach (var t in inFlight) + { + if (t.IsFaulted && t.Exception is { } agg) + { + foreach (var inner in agg.InnerExceptions) + { + if (inner is DurableExecutionException dex && inner is not ChildContextException) + { + throw dex; + } + } + } + } + + // Re-throw any pending parent-token cancellation now that units have + // settled and the semaphore has been disposed cleanly. + cancellationToken.ThrowIfCancellationRequested(); + + // Build BatchItems for every unit in original order. 
+ var items = new List>(unitCount); + for (var i = 0; i < unitCount; i++) + { + var (unitName, _) = GetUnit(i); + if (dispatched[i]) + { + var outcome = slots[i]; + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = outcome.Status, + Result = outcome.Status == BatchItemStatus.Succeeded ? outcome.Result : default, + Error = outcome.Status == BatchItemStatus.Failed ? outcome.Error : null + }); + } + else + { + items.Add(new BatchItem + { + Index = i, + Name = unitName, + Status = BatchItemStatus.Started, + Result = default, + Error = null + }); + } + } + + var completionReason = ComputeCompletionReason(items, unitCount); + var result = new BatchResult(items, completionReason); + + var failureException = completionReason == CompletionReason.FailureToleranceExceeded + ? BuildException(result) + : null; + + await CheckpointParentResultAsync(result, completionReason, failureException, cancellationToken); + + if (failureException != null) + { + throw failureException; + } + + return result; + } + + /// + /// Overflow-replay path. The parent was checkpointed with a stripped summary + /// (per-unit Index/Name/Status retained; Result/Error dropped) and + /// ReplayChildren=true. Re-executes ONLY the units the frozen summary + /// marks SUCCEEDED or FAILED — to recover their stripped result value / error + /// — and skips units marked STARTED so their bodies do not re-run. Per-unit + /// status and the completion reason come from the frozen summary (authoritative), + /// not from this run's outcomes; the parent is NOT re-checkpointed. 
+ /// + private async Task> ReplayChildrenAsync(Operation frozen, CancellationToken cancellationToken) + { + cancellationToken.ThrowIfCancellationRequested(); + + var summary = ParseSummary(frozen.ContextDetails?.Result); + var unitCount = UnitCount; + + var items = new List>(unitCount); + for (var i = 0; i < unitCount; i++) + { + var (unitName, _) = GetUnit(i); + var summaryEntry = summary?.Units.FirstOrDefault(b => b.Index == i); + + // Frozen per-unit status is authoritative. + var status = summaryEntry != null + ? DeserializeStatus(summaryEntry.Status) + : BatchItemStatus.Started; + + // Same unit-name drift check as ReconstructFromCheckpoints: code must + // not change the order or name of concurrent units between deployments. + var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && unitName != null && checkpointedName != unitName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for {OperationNoun.ToLowerInvariant()} unit {i} of operation " + + $"'{Name ?? OperationId}': expected name '{unitName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of concurrent " + + $"units between deployments."); + } + var resolvedName = checkpointedName ?? unitName; + + T? unitResult = default; + DurableExecutionException? unitError = null; + + // Re-execute only completed units to recover the stripped value/error. + // STARTED units were short-circuited (never dispatched) originally — + // do NOT run their bodies, so there are no spurious side effects. + if (status == BatchItemStatus.Succeeded || status == BatchItemStatus.Failed) + { + var outcome = await RunSingleUnitAsync(i, cancellationToken).ConfigureAwait(false); + if (status == BatchItemStatus.Succeeded) + { + unitResult = outcome.Result; + } + else + { + // Frozen status is authoritative. 
If a unit frozen as Failed + // re-executes to success here (non-deterministic body), it stays + // Failed but Error stays null — the original error was stripped on + // overflow and only returns if the body re-throws. Recovering a + // frozen-Succeeded unit's value is the common, supported case. + unitError = outcome.Error; + } + } + + items.Add(new BatchItem + { + Index = i, + Name = resolvedName, + Status = status, + Result = unitResult, + Error = unitError + }); + } + + // Completion reason is pinned from the frozen summary; fall back to + // recomputing only if the summary is absent/corrupt. + var completionReason = summary != null + ? DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, unitCount); + + var result = new BatchResult(items, completionReason); + + // No re-checkpoint: the parent is already terminal in state. + if (completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildException(result); + } + + return result; + } + + private async Task RunUnitAsync( + int index, + UnitOutcome[] slots, + SemaphoreSlim? semaphore, + CancellationToken cancellationToken, + Action onComplete) + { + try + { + slots[index] = await RunSingleUnitAsync(index, cancellationToken).ConfigureAwait(false); + onComplete(slots[index]); + } + finally + { + // Defensive: with this structure the semaphore is only disposed after + // Task.WhenAll(inFlight) has settled, so this Release should always + // succeed. ObjectDisposedException would indicate a bug elsewhere, but + // we tolerate it here so the task doesn't fault with a noise exception + // that masks the real one. + try + { + semaphore?.Release(); + } + catch (ObjectDisposedException) + { + } + } + } + + /// + /// Builds and runs a single unit's ChildContextOperation and + /// maps the result/exception to a UnitOutcome. Shared by the + /// concurrent dispatch loop (RunUnitAsync) and the overflow + /// ReplayChildren path (ReplayChildrenAsync).
Per-unit graceful + /// failures are captured as BatchItemStatus.Failed; workflow-level + /// and parent-token-cancellation exceptions propagate. + /// + private async Task RunSingleUnitAsync(int index, CancellationToken cancellationToken) + { + var (unitName, unitFunc) = GetUnit(index); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{index + 1}"); + + var childOp = new ChildContextOperation( + childOpId, + unitName, + OperationId, + unitFunc, + new ChildContextConfig { SubType = ChildSubType }, + Serializer, + ChildContextFactory, + State, + Termination, + DurableExecutionArn, + Batcher, + isVirtual: _isVirtual); + + try + { + var result = await childOp.ExecuteAsync(cancellationToken).ConfigureAwait(false); + return new UnitOutcome { Status = BatchItemStatus.Succeeded, Result = result }; + } + catch (ChildContextException ex) + { + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = ex }; + } + catch (DurableExecutionException) + { + // E.g. NonDeterministicExecutionException — these are not "unit + // failed gracefully" but workflow-level problems. Surface them: + // re-throw out of the operation (the orchestrator's outer flow + // handles it). + throw; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Parent-token cancellation: per cross-cutting decision Q10, OCE + // escapes unwrapped. Don't write a slot — Task.WhenAll observes + // this and the orchestrator re-throws after settling. + throw; + } + catch (OperationCanceledException ex) + { + // Unit-internal cancellation that is NOT tied to the parent token + // (e.g. the unit's own CancellationTokenSource fired). Treat it as + // a normal per-unit failure rather than killing the operation as + // cancelled.
+ var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + catch (Exception ex) + { + // Wrap unexpected exceptions as ChildContextException — they're + // per-unit failures from the user's POV. + var wrapped = new ChildContextException(ex.Message, ex) + { + SubType = ChildSubType, + ErrorType = ex.GetType().FullName + }; + return new UnitOutcome { Status = BatchItemStatus.Failed, Error = wrapped }; + } + } + + private static bool ShouldStopDispatching( + int succeeded, + int failed, + int totalUnits, + int? minSuccessful, + int? toleratedFailureCount, + double? toleratedFailurePercentage) + { + // Min-successful: short-circuit the moment we have enough wins. + if (minSuccessful is { } min && succeeded >= min) + return true; + + // Failure thresholds short-circuit on too many losses. + if (toleratedFailureCount is { } tfc && failed > tfc) + return true; + + if (toleratedFailurePercentage is { } tfp && totalUnits > 0) + { + var ratio = (double)failed / totalUnits; + if (ratio > tfp) return true; + } + + return false; + } + + private CompletionReason ComputeCompletionReason(IReadOnlyList> items, int totalCount) + { + var failed = 0; + var succeeded = 0; + var started = 0; + + foreach (var item in items) + { + switch (item.Status) + { + case BatchItemStatus.Succeeded: succeeded++; break; + case BatchItemStatus.Failed: failed++; break; + case BatchItemStatus.Started: started++; break; + } + } + + // Failure tolerance: only short-circuit-by-failure when at least one + // failure threshold is explicitly set. The factory CompletionConfig.AllSuccessful() + // sets ToleratedFailureCount = 0 to opt into fail-fast; an "empty" + // CompletionConfig (all properties null) is permissive. 
+ if (_completionConfig.ToleratedFailureCount is { } tfc && failed > tfc) + return CompletionReason.FailureToleranceExceeded; + + if (_completionConfig.ToleratedFailurePercentage is { } tfp && totalCount > 0) + { + var ratio = (double)failed / totalCount; + if (ratio > tfp) return CompletionReason.FailureToleranceExceeded; + } + + // Min-successful satisfied (and we didn't run all units): MinSuccessfulReached. + if (_completionConfig.MinSuccessful is { } min && succeeded >= min && started > 0) + { + return CompletionReason.MinSuccessfulReached; + } + + // Every dispatched unit finished one way or the other (or all-completed + // without any failure criteria). + return CompletionReason.AllCompleted; + } + + private DurableExecutionException BuildException(IBatchResult result) + { + var message = + $"{OperationNoun} operation failed: failure tolerance exceeded " + + $"({result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed)."; + return CreateException(message, result); + } + + private async Task CheckpointParentResultAsync( + BatchResult result, + CompletionReason completionReason, + DurableExecutionException? failureException, + CancellationToken cancellationToken) + { + // Local builder: includeInline=true writes per-unit Result/Error inline + // (Flat only); includeInline=false writes the minimal index/name/status + // map (the shape Nested always uses, and the Flat overflow fallback). 
+ BatchSummary BuildSummary(bool includeInline) + { + var s = new BatchSummary + { + CompletionReason = SerializeCompletionReason(completionReason), + Units = new List(result.All.Count) + }; + for (var i = 0; i < result.All.Count; i++) + { + var item = result.All[i]; + var unit = new BatchUnitSummary + { + Index = item.Index, + Name = item.Name, + Status = SerializeStatus(item.Status) + }; + if (includeInline && _isVirtual) + { + if (item.Status == BatchItemStatus.Succeeded) + unit.Result = SerializeResult(item.Result); + else if (item.Status == BatchItemStatus.Failed && item.Error != null) + unit.Error = ErrorObject.FromException(item.Error); + } + s.Units.Add(unit); + } + return s; + } + + var summary = BuildSummary(includeInline: true); + var payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + + // Flat overflow: the inline per-unit results pushed the summary over the + // checkpoint limit. Re-emit a stripped summary (statuses only) and flag + // ReplayChildren so replay reconstructs the values by re-executing units. + var overflow = _isVirtual + && Encoding.UTF8.GetByteCount(payload) > DurableConstants.MaxOperationCheckpointBytes; + if (overflow) + { + summary = BuildSummary(includeInline: false); + payload = JsonSerializer.Serialize(summary, BatchJsonContext.Default.BatchSummary); + } + + var failed = failureException != null; + + // On FAIL, Nested operations omit the payload because replay rebuilds + // per-unit outcomes from the children's own checkpoints. Flat operations + // have no child checkpoints, so the summary (carrying inline results and + // errors, or the stripped status map under overflow) must be persisted + // even on FAIL for replay to reconstruct it. + var payloadOnFail = _isVirtual; + + await EnqueueAsync(new SdkOperationUpdate + { + Id = OperationId, + Type = OperationTypes.Context, + Action = failed ? 
OperationAction.FAIL : OperationAction.SUCCEED, + SubType = ParentSubType, + Name = Name, + Payload = failed && !payloadOnFail ? null : payload, + Error = failed ? BuildAggregateError(result, failureException!) : null, + ContextOptions = overflow + ? new SdkContextOptions { ReplayChildren = true } + : null + }, cancellationToken); + } + + private IBatchResult ReconstructFromCheckpoints(Operation parent, bool throwOnFailure) + { + var summary = ParseSummary(parent.ContextDetails?.Result); + + var items = new List>(UnitCount); + for (var i = 0; i < UnitCount; i++) + { + var (unitName, _) = GetUnit(i); + var childOpId = OperationIdGenerator.HashOperationId($"{OperationId}-{i + 1}"); + var childOp = State.GetOperation(childOpId); + var summaryEntry = summary?.Units.FirstOrDefault(b => b.Index == i); + + BatchItemStatus status = summaryEntry != null + ? DeserializeStatus(summaryEntry.Status) + : InferStatusFromChildOp(childOp); + + // Prefer the name that was checkpointed at the moment the batch + // resolved. This is the only authoritative source for units reported + // as Started (no per-unit checkpoint exists to consult), and it lets + // us detect unit-name drift between deployments. + var checkpointedName = summaryEntry?.Name; + if (checkpointedName != null && unitName != null && checkpointedName != unitName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for {OperationNoun.ToLowerInvariant()} unit {i} of operation " + + $"'{Name ?? OperationId}': expected name '{unitName}' but found '{checkpointedName}' " + + $"from a previous invocation. Code must not change the order or name of concurrent " + + $"units between deployments."); + } + var resolvedName = checkpointedName ?? unitName; + + T? unitResult = default; + DurableExecutionException? unitError = null; + + // Flat (virtual) units have no child checkpoint — their result/error + // was recorded inline on this summary. 
Nested units read from the + // child's own CONTEXT checkpoint. A unit is "inline" when the summary + // entry carries a Result/Error, which only Flat writes. + if (_isVirtual && summaryEntry != null) + { + if (status == BatchItemStatus.Succeeded && summaryEntry.Result != null) + { + unitResult = DeserializeResult(summaryEntry.Result); + } + else if (status == BatchItemStatus.Failed && summaryEntry.Error != null) + { + var err = summaryEntry.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + } + else if (status == BatchItemStatus.Succeeded && childOp?.ContextDetails?.Result != null) + { + unitResult = DeserializeResult(childOp.ContextDetails.Result); + } + else if (status == BatchItemStatus.Failed && childOp?.ContextDetails?.Error != null) + { + var err = childOp.ContextDetails.Error; + unitError = new ChildContextException(err.ErrorMessage ?? "Unit failed") + { + SubType = childOp.SubType ?? ChildSubType, + ErrorType = err.ErrorType, + ErrorData = err.ErrorData, + OriginalStackTrace = err.StackTrace + }; + } + + items.Add(new BatchItem + { + Index = i, + Name = resolvedName, + Status = status, + Result = unitResult, + Error = unitError + }); + } + + var completionReason = summary != null + ? DeserializeCompletionReason(summary.CompletionReason) + : ComputeCompletionReason(items, UnitCount); + + var result = new BatchResult(items, completionReason); + + if (throwOnFailure && completionReason == CompletionReason.FailureToleranceExceeded) + { + throw BuildException(result); + } + + return result; + } + + private static BatchItemStatus InferStatusFromChildOp(Operation? 
childOp) + { + if (childOp == null) return BatchItemStatus.Started; + return childOp.Status switch + { + OperationStatuses.Succeeded => BatchItemStatus.Succeeded, + OperationStatuses.Failed => BatchItemStatus.Failed, + _ => BatchItemStatus.Started + }; + } + + private SdkErrorObject BuildAggregateError(IBatchResult result, DurableExecutionException failureException) + { + return new SdkErrorObject + { + ErrorType = failureException.GetType().FullName, + ErrorMessage = + $"{OperationNoun} operation failed: {result.FailureCount} of {result.TotalCount} {UnitNounPlural} failed." + }; + } + + private static BatchSummary? ParseSummary(string? payload) + { + if (string.IsNullOrEmpty(payload)) return null; + try + { + return JsonSerializer.Deserialize(payload, BatchJsonContext.Default.BatchSummary); + } + catch (JsonException) + { + // Tolerate older / corrupted payloads — fall back to inferring status + // from per-unit checkpoints. + return null; + } + } + + private static string SerializeStatus(BatchItemStatus status) => status switch + { + BatchItemStatus.Succeeded => "SUCCEEDED", + BatchItemStatus.Failed => "FAILED", + BatchItemStatus.Started => "STARTED", + _ => throw new ArgumentOutOfRangeException(nameof(status)) + }; + + private static BatchItemStatus DeserializeStatus(string? wire) => wire switch + { + "SUCCEEDED" => BatchItemStatus.Succeeded, + "FAILED" => BatchItemStatus.Failed, + "STARTED" => BatchItemStatus.Started, + _ => BatchItemStatus.Started + }; + + private static string SerializeCompletionReason(CompletionReason reason) => reason switch + { + CompletionReason.AllCompleted => "ALL_COMPLETED", + CompletionReason.MinSuccessfulReached => "MIN_SUCCESSFUL_REACHED", + CompletionReason.FailureToleranceExceeded => "FAILURE_TOLERANCE_EXCEEDED", + _ => throw new ArgumentOutOfRangeException(nameof(reason)) + }; + + private static CompletionReason DeserializeCompletionReason(string? 
wire) => wire switch + { + "ALL_COMPLETED" => CompletionReason.AllCompleted, + "MIN_SUCCESSFUL_REACHED" => CompletionReason.MinSuccessfulReached, + "FAILURE_TOLERANCE_EXCEEDED" => CompletionReason.FailureToleranceExceeded, + _ => CompletionReason.AllCompleted + }; + + private T DeserializeResult(string serialized) + { + var bytes = Encoding.UTF8.GetBytes(serialized); + using var ms = new MemoryStream(bytes); + return Serializer.Deserialize(ms); + } + + /// + /// Serializes a per-unit result for inline storage in the + /// BatchSummary (Flat units only). Mirrors the SUCCEED-payload + /// serialization a Nested unit's ChildContextOperation would + /// have written to its own checkpoint. + /// + private string SerializeResult(T? value) + { + using var ms = new MemoryStream(); + Serializer.Serialize(value!, ms); + return Encoding.UTF8.GetString(ms.ToArray()); + } + + /// + /// Internal scratch space tracking each unit's outcome as it lands in the + /// executor; copied into the user-facing BatchItem once every + /// dispatched unit has settled. + /// + private struct UnitOutcome + { + public BatchItemStatus Status; + public T? Result; + public DurableExecutionException? Error; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs new file mode 100644 index 000000000..0aed925b2 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/DurableConstants.cs @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Size limits for durable-execution payload overflow handling. These are the +/// SDK's chosen overflow *trigger* thresholds for cross-SDK parity (Python/Java +/// use the same 256 KB), not the AWSSDK.Lambda hard field caps (those are 6 MB).
+/// +internal static class DurableConstants +{ + /// + /// Serialized-payload byte length above which a concurrent/child-context + /// operation switches to the ReplayChildren overflow strategy: + /// strip the inline result from the checkpoint and reconstruct on replay by + /// re-executing the unit/child bodies. 256 KB (262,144 bytes). + /// + internal const int MaxOperationCheckpointBytes = 256 * 1024; + + /// + /// Serialized final-result byte length above which the orchestration response + /// must be checkpointed rather than returned inline (Lambda response limit, + /// minus a small envelope margin). Reserved for the final-response overflow + /// work (separate plan); defined here so all overflow limits live together. + /// + internal const int MaxLambdaResponseBytes = 6 * 1024 * 1024 - 50; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs index 989749d9b..7ff404675 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ExecutionState.cs @@ -1,8 +1,6 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -using System.Collections.Concurrent; - namespace Amazon.Lambda.DurableExecution.Internal; /// @@ -23,54 +21,74 @@ namespace Amazon.Lambda.DurableExecution.Internal; /// for the rest of the invocation. /// /// -/// is invoked from the 's -/// background worker (via the onNewOperations hook) while the workflow thread -/// concurrently reads via / — -/// e.g. the fire-and-forget StepOperation path where the workflow is not -/// awaiting the flush. _operations is therefore a . -/// The replay-tracking fields (_visitedOperations, _isReplaying, -/// _remainingReplayOps) are touched only on the workflow thread. +/// Thread safety: two paths reach this type concurrently. 
(1) The +/// background worker invokes +/// (via the onNewOperations hook) while the +/// workflow thread reads via / — +/// e.g. the fire-and-forget StepOperation path. (2) +/// dispatches N branches concurrently, each +/// running its own , so +/// , , +/// , and the +/// getter are reachable from multiple threads at once. +/// All read/write access to _operations, _visitedOperations, +/// _isReplaying and _remainingReplayOps is therefore guarded by a +/// single private lock. Every guarded path is an O(1) dictionary lookup, set +/// insert, or short iteration, so contention stays brief; we use a plain +/// lock rather than because +/// none of the guarded code paths are async, and rather than +/// ConcurrentDictionary because performs +/// a compound add-then-scan. /// /// internal sealed class ExecutionState { - private readonly ConcurrentDictionary _operations = new(); + private readonly object _lock = new(); + private readonly Dictionary _operations = new(); private readonly HashSet _visitedOperations = new(); private bool _isReplaying; private int _remainingReplayOps; - public int CheckpointedOperationCount => _operations.Count; + public int CheckpointedOperationCount + { + get { lock (_lock) return _operations.Count; } + } /// /// True when the workflow is re-deriving prior operations from checkpointed /// state. False when running fresh (not-yet-checkpointed) code. /// - public bool IsReplaying => _isReplaying; + public bool IsReplaying + { + get { lock (_lock) return _isReplaying; } + } public void LoadFromCheckpoint(InitialExecutionState? initialState) { - if (initialState?.Operations != null) + lock (_lock) { - AddOperations(initialState.Operations); + if (initialState?.Operations != null) + { + AddOperationsLocked(initialState.Operations); + } + + // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, + // CANCELLED, STOPPED) we need to re-derive before resuming live work. 
+ // The service-side EXECUTION op (input payload bookkeeping) is always + // present and doesn't count. If the only ops are in-progress + // (READY/PENDING/STARTED), there's nothing to re-derive — the next + // user call IS the next thing to run — so IsReplaying starts false. + var (_, terminalCount) = ScanReplayableLocked(); + _remainingReplayOps = terminalCount; + _isReplaying = terminalCount > 0; } - - // We're "replaying" when there are completed ops (SUCCEEDED, FAILED, - // CANCELLED, STOPPED) we need to re-derive before resuming live work. - // The service-side EXECUTION op (input payload bookkeeping) is always - // present and doesn't count. If the only ops are in-progress - // (READY/PENDING/STARTED), there's nothing to re-derive — the next - // user call IS the next thing to run — so IsReplaying starts false. - var (_, terminalCount) = ScanReplayable(); - _remainingReplayOps = terminalCount; - _isReplaying = terminalCount > 0; } public void AddOperations(IEnumerable operations) { - foreach (var op in operations) + lock (_lock) { - if (op.Id == null) continue; - _operations[op.Id] = op; + AddOperationsLocked(operations); } } @@ -81,11 +99,20 @@ public void AddOperations(IEnumerable operations) /// public Operation? GetOperation(string operationId) { - _operations.TryGetValue(operationId, out var op); - return op; + lock (_lock) + { + _operations.TryGetValue(operationId, out var op); + return op; + } } - public bool HasOperation(string operationId) => _operations.ContainsKey(operationId); + public bool HasOperation(string operationId) + { + lock (_lock) + { + return _operations.ContainsKey(operationId); + } + } /// /// Records that the workflow has reached . 
@@ -96,43 +123,58 @@ public void AddOperations(IEnumerable operations) /// public void TrackReplay(string operationId) { - if (!_isReplaying) return; - if (!_visitedOperations.Add(operationId)) return; - if (!_operations.TryGetValue(operationId, out var op)) return; - if (op.Type == OperationTypes.Execution) return; - if (!IsTerminalStatus(op.Status)) return; - - if (--_remainingReplayOps <= 0) - _isReplaying = false; + lock (_lock) + { + if (!_isReplaying) return; + if (!_visitedOperations.Add(operationId)) return; + if (!_operations.TryGetValue(operationId, out var op)) return; + if (op.Type == OperationTypes.Execution) return; + if (!IsTerminalStatus(op.Status)) return; + + if (--_remainingReplayOps <= 0) + _isReplaying = false; + } } public void ValidateReplayConsistency(string operationId, string expectedType, string? expectedName) { - // Independent of IsReplaying: as long as a checkpoint record exists - // for this id, its type/name must match what user code is asking for. - // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), - // IsReplaying is false but the records still exist and code drift can - // still produce a mismatch. - if (!_operations.TryGetValue(operationId, out var op)) return; - - if (op.Type != null && op.Type != expectedType) + lock (_lock) { - throw new NonDeterministicExecutionException( - $"Non-deterministic execution detected for operation '{operationId}': " + - $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. " + - $"Code must not change the order or type of durable operations between deployments."); + // Independent of IsReplaying: as long as a checkpoint record exists + // for this id, its type/name must match what user code is asking for. + // If the only checkpointed ops are in-progress (PENDING/READY/STARTED), + // IsReplaying is false but the records still exist and code drift can + // still produce a mismatch. 
+ if (!_operations.TryGetValue(operationId, out var op)) return; + + if (op.Type != null && op.Type != expectedType) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected type '{expectedType}' but found '{op.Type}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } + + if (expectedName != null && op.Name != null && op.Name != expectedName) + { + throw new NonDeterministicExecutionException( + $"Non-deterministic execution detected for operation '{operationId}': " + + $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. " + + $"Code must not change the order or type of durable operations between deployments."); + } } + } - if (expectedName != null && op.Name != null && op.Name != expectedName) + private void AddOperationsLocked(IEnumerable operations) + { + foreach (var op in operations) { - throw new NonDeterministicExecutionException( - $"Non-deterministic execution detected for operation '{operationId}': " + - $"expected name '{expectedName}' but found '{op.Name}' from a previous invocation. " + - $"Code must not change the order or type of durable operations between deployments."); + if (op.Id == null) continue; + _operations[op.Id] = op; } } - private (bool HasReplayable, int TerminalCount) ScanReplayable() + private (bool HasReplayable, int TerminalCount) ScanReplayableLocked() { var has = false; var count = 0; diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs new file mode 100644 index 000000000..ed23ba950 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/MapOperation.cs @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +using System.Globalization; +using Amazon.Lambda; +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable map operation. Processes a collection in parallel, running the +/// user-supplied function once per item — each as a +/// . All orchestration, completion, +/// checkpoint, and replay logic lives in ; +/// this subclass supplies only the map-specific bits: how to turn an item index +/// into a (name, func) pair (the per-item callback receives the item, its +/// index, and the full source list), the Map sub-type labels, and the +/// factory. +/// +internal sealed class MapOperation : ConcurrentOperation +{ + private readonly IReadOnlyList _items; + private readonly Func, Task> _func; + private readonly Func? _itemNamer; + + public MapOperation( + string operationId, + string? name, + string? parentId, + IReadOnlyList items, + Func, Task> func, + MapConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) + { + _items = items; + _func = func; + _itemNamer = config.ItemNamer; + } + + protected override int UnitCount => _items.Count; + protected override string ParentSubType => OperationSubTypes.Map; + protected override string ChildSubType => OperationSubTypes.MapItem; + protected override string OperationNoun => "Map"; + protected override string UnitNounPlural => "items"; + + protected override (string? Name, Func> Func) GetUnit(int index) + { + var item = _items[index]; + // Default name is the index — matches the unnamed-branch convention in + // ParallelAsync. 
A custom ItemNamer can derive a readable name from the + // item's content. Naming affects observability only, never replay + // correlation (child operation IDs are derived from the index). + var name = _itemNamer is not null + ? _itemNamer(item!, index) + : index.ToString(CultureInfo.InvariantCulture); + + return (name, ctx => _func(ctx, item, index, _items)); + } + + protected override DurableExecutionException CreateException(string message, IBatchResult result) + { + return new MapException(message) + { + Result = result, + CompletionReason = result.CompletionReason + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs index bbfd3c59d..bd74e6da5 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/OperationIdGenerator.cs @@ -35,10 +35,33 @@ public OperationIdGenerator() /// hash("<parentHash>-1"), hash("<parentHash>-2"), etc. /// public OperationIdGenerator(string? parentId) + : this(idPrefix: parentId, reportedParentId: parentId) + { + } + + /// + /// Creates a child generator that decouples the hash prefix used to derive + /// inner-operation IDs from the reported on those + /// operations' wire OperationUpdate.ParentId. + /// + /// + /// Prefix hashed into inner-operation IDs (hash("<idPrefix>-1"), ...). + /// Always the owning context's own operation ID, so two sibling branches + /// never collide on inner IDs. + /// + /// + /// The parent operation ID stamped on inner operations. For a normal + /// (non-virtual) context this equals . For a + /// branch — a "virtual" context that emits no + /// CONTEXT checkpoint of its own — this is the nearest non-virtual ancestor + /// (the parallel/map operation), so inner operations re-parent past the + /// branch to an operation that actually exists in the checkpoint store. 
+ /// + private OperationIdGenerator(string? idPrefix, string? reportedParentId) { _counter = 0; - ParentId = parentId; - _prefix = parentId != null ? parentId + "-" : string.Empty; + ParentId = reportedParentId; + _prefix = idPrefix != null ? idPrefix + "-" : string.Empty; } /// @@ -85,6 +108,19 @@ public OperationIdGenerator CreateChild(string operationId) return new OperationIdGenerator(operationId); } + /// + /// Creates a child generator for a branch — a + /// "virtual" context. Inner-operation IDs are still derived from + /// (so sibling branches don't collide), but + /// the IDs are reported under (the + /// nearest non-virtual ancestor) because the virtual branch emits no CONTEXT + /// checkpoint that inner operations could reference as their parent. + /// + public OperationIdGenerator CreateVirtualChild(string operationId, string? reportedParentId) + { + return new OperationIdGenerator(idPrefix: operationId, reportedParentId: reportedParentId); + } + /// /// Resets the counter (used for testing only). Not safe to call concurrently /// with ; tests must quiesce before resetting. diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs new file mode 100644 index 000000000..08b7d1781 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Internal/ParallelOperation.cs @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda; +using Amazon.Lambda.Core; + +namespace Amazon.Lambda.DurableExecution.Internal; + +/// +/// Durable parallel operation. Runs N user-supplied branches concurrently, +/// each as a . All orchestration, +/// completion, checkpoint, and replay logic lives in +/// ; this subclass supplies only the +/// branch-specific bits (unit count, per-branch (name, func), sub-type +/// labels, and the failure-exception factory). 
+/// +internal sealed class ParallelOperation : ConcurrentOperation +{ + private readonly IReadOnlyList> _branches; + + public ParallelOperation( + string operationId, + string? name, + string? parentId, + IReadOnlyList> branches, + ParallelConfig config, + ILambdaSerializer serializer, + Func childContextFactory, + ExecutionState state, + TerminationManager termination, + string durableExecutionArn, + CheckpointBatcher? batcher = null) + : base(operationId, name, parentId, config.CompletionConfig, config.MaxConcurrency, + serializer, childContextFactory, state, termination, durableExecutionArn, batcher, + isVirtual: config.NestingType == NestingType.Flat) + { + _branches = branches; + } + + protected override int UnitCount => _branches.Count; + protected override string ParentSubType => OperationSubTypes.Parallel; + protected override string ChildSubType => OperationSubTypes.ParallelBranch; + protected override string OperationNoun => "Parallel"; + protected override string UnitNounPlural => "branches"; + + protected override (string? Name, Func> Func) GetUnit(int index) + { + var branch = _branches[index]; + return (branch.Name, branch.Func); + } + + protected override DurableExecutionException CreateException(string message, IBatchResult result) + { + return new ParallelException(message) + { + Result = result, + CompletionReason = result.CompletionReason + }; + } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs new file mode 100644 index 000000000..5b7c76e5f --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/MapConfig.cs @@ -0,0 +1,75 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// . +/// +/// +/// Per-item checkpoint payloads are serialized via the +/// registered on +/// (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. 
+/// +public sealed class MapConfig +{ + private int? _maxConcurrency; + + /// + /// Maximum number of items processed concurrently. null (default) = + /// unlimited. Must be at least 1 when set. + /// + /// + /// Thrown by the setter if the value is less than or equal to 0. + /// + public int? MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// + /// When the map operation is considered complete. Defaults to + /// — every item runs regardless + /// of per-item failures, which are surfaced via + /// rather than thrown. + /// + /// + /// This permissive default matches the Python and Java SDKs' map operation. + /// It differs intentionally from , + /// which defaults to (fail-fast). + /// For fail-fast map behavior — any item failure surfaces a + /// when the result is awaited — set this to + /// , or call + /// on the result. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted(); + + /// + /// How item branches are represented in the checkpoint graph. Defaults to + /// . + /// + /// + /// Under each item runs in a virtual context + /// that emits no per-item CONTEXT checkpoint; per-item results and + /// errors are recorded inline on the map operation's payload instead. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; + + /// + /// Optional function to generate a custom name for each item's branch. + /// Receives the item and its zero-based index, and returns the branch name + /// surfaced in execution traces and on . + /// When null (default), branches are named by index ("0", + /// "1", ...), matching . + /// + public Func? 
ItemNamer { get; set; } +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs new file mode 100644 index 000000000..a36c793e7 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/NestingType.cs @@ -0,0 +1,36 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Controls how branches in a parallel/map operation are represented in the +/// checkpoint graph. +/// +/// +/// +/// is the default — each branch produces a full CONTEXT +/// operation visible in execution traces. +/// +/// +/// uses virtual contexts to reduce checkpoint volume (no +/// per-branch CONTEXT operation): each branch's result or error is +/// recorded inline on the parent parallel/map operation's payload instead. +/// +/// +public enum NestingType +{ + /// + /// Each branch creates a full isolated CONTEXT operation. Higher + /// observability in execution traces but more checkpoint operations + /// (default). + /// + Nested, + + /// + /// Branches run in virtual contexts that emit no CONTEXT checkpoint + /// of their own — per-branch results/errors are recorded inline on the + /// parent operation's payload. Reduces checkpoint cost at the expense of + /// less granular execution traces. Branch operations inside a flat branch + /// (steps, waits) still checkpoint, re-parented to the parallel/map + /// operation. + /// + Flat +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs index 3b55cfa86..c6fddcf92 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Operation.cs @@ -143,6 +143,14 @@ public sealed class ContextDetails /// Error from the child context, if any. [JsonPropertyName("Error")] public ErrorObject? 
Error { get; set; } + + /// + /// When true on a completed CONTEXT operation, the operation's result + /// was too large to checkpoint inline; per-unit/child state is reconstructed + /// on replay by re-executing the children rather than read from this payload. + /// + [JsonPropertyName("ReplayChildren")] + public bool? ReplayChildren { get; set; } } /// @@ -195,6 +203,18 @@ public static class OperationSubTypes /// Child-context sub-type. public const string Context = "Context"; + + /// Parallel parent sub-type. + public const string Parallel = "Parallel"; + + /// Parallel branch (per-branch child-context) sub-type. + public const string ParallelBranch = "ParallelBranch"; + + /// Map parent sub-type. + public const string Map = "Map"; + + /// Map item (per-item child-context) sub-type. + public const string MapItem = "MapItem"; } /// diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs new file mode 100644 index 000000000..bcc17f181 --- /dev/null +++ b/Libraries/src/Amazon.Lambda.DurableExecution/ParallelConfig.cs @@ -0,0 +1,58 @@ +namespace Amazon.Lambda.DurableExecution; + +/// +/// Configuration for +/// . +/// +/// +/// Per-branch checkpoint payloads are serialized via the +/// registered on +/// (typically +/// configured via LambdaBootstrapBuilder.Create(handler, serializer)); +/// this config does not expose a serializer slot. +/// +public sealed class ParallelConfig +{ + private int? _maxConcurrency; + + /// + /// Maximum number of branches running concurrently. null (default) = + /// unlimited. Must be at least 1 when set. + /// + /// + /// Thrown by the setter if the value is less than or equal to 0. + /// + public int? 
MaxConcurrency + { + get => _maxConcurrency; + set + { + if (value is { } v && v <= 0) + { + throw new ArgumentOutOfRangeException(nameof(value), v, + "MaxConcurrency must be at least 1, or null for unlimited."); + } + _maxConcurrency = value; + } + } + + /// + /// When the parallel operation is considered complete. Defaults to + /// — any single branch failure + /// surfaces as a when the parallel result + /// is awaited. + /// + public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful(); + + /// + /// How branches are represented in the checkpoint graph. Defaults to + /// . + /// + /// + /// Under each branch runs in a virtual + /// context that emits no per-branch CONTEXT checkpoint; per-branch + /// results and errors are recorded inline on the parallel operation's + /// payload instead. + /// + public NestingType NestingType { get; set; } = NestingType.Nested; +} diff --git a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs index a38dda31b..d787a529b 100644 --- a/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs +++ b/Libraries/src/Amazon.Lambda.DurableExecution/Services/LambdaDurableServiceClient.cs @@ -161,7 +161,8 @@ private static Operation MapFromSdkOperation(SdkOperation sdkOp) ContextDetails = sdkOp.ContextDetails != null ? new ContextDetails { Result = sdkOp.ContextDetails.Result, - Error = MapError(sdkOp.ContextDetails.Error) + Error = MapError(sdkOp.ContextDetails.Error), + ReplayChildren = sdkOp.ContextDetails.ReplayChildren } : null, CallbackDetails = sdkOp.CallbackDetails != null ? new CallbackDetails { @@ -177,6 +178,9 @@ private static Operation MapFromSdkOperation(SdkOperation sdkOp) }; } + /// Test-only access to . 
+ internal static Operation MapFromSdkOperationForTest(SdkOperation sdkOp) => MapFromSdkOperation(sdkOp); + /// /// Maps an SDK into the /// internal . Carries every field the wire object diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs new file mode 100644 index 000000000..06ab716c0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFailureToleranceTest.cs @@ -0,0 +1,69 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFailureToleranceTest +{ + private readonly ITestOutputHelper _output; + public MapFailureToleranceTest(ITestOutputHelper output) => _output = output; + + /// + /// Five items, two fail, ToleratedFailureCount=1. The map must surface a + /// with reason + /// ; the workflow must + /// terminate FAILED. Validates the failure-tolerance short-circuit and that + /// MapException (not ParallelException) propagates as the + /// workflow's terminal error. + /// + [Fact] + public async Task Map_FailureToleranceExceeded_FailsWorkflow() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFailureToleranceFunction"), + "mtol", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m3"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload to the Invoke caller — locate the + // execution by name to inspect its terminal status. 
+ var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + // MapException is the terminal error type the SDK throws when the + // failure-tolerance short-circuit fires. + var errorType = execution.Error.ErrorType ?? string.Empty; + var errorMessage = execution.Error.ErrorMessage ?? string.Empty; + Assert.True( + errorType.Contains("MapException", StringComparison.Ordinal) + || errorMessage.Contains("Map", StringComparison.OrdinalIgnoreCase), + $"Expected error to indicate MapException; got type='{errorType}' message='{errorMessage}'"); + + // History: parent CONTEXT and at least 2 failed item contexts visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3 + && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + Assert.True( + events.Count(e => e.EventType == EventType.ContextFailed) >= 2, + $"Expected >= 2 ContextFailed events; got {events.Count(e => e.EventType == EventType.ContextFailed)}"); + + // The parent context (named "tolerance") records the aggregate failure. 
+ var parentFailed = events.FirstOrDefault(e => + e.EventType == EventType.ContextFailed && e.Name == "tolerance"); + Assert.NotNull(parentFailed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs new file mode 100644 index 000000000..737e70a2f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFirstSuccessfulTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFirstSuccessfulTest +{ + private readonly ITestOutputHelper _output; + public MapFirstSuccessfulTest(ITestOutputHelper output) => _output = output; + + /// + /// Four items with staggered durable waits, FirstSuccessful: as soon + /// as one item completes, the map resolves. In-flight items remain in + /// rather than being cancelled. + /// Validates the cross-cutting decision: orphan units are NOT cancelled, and + /// short-circuit reports them as Started. + /// + [Fact] + public async Task Map_FirstSuccessful_ShortCircuitsOnFirstWin() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFirstSuccessfulFunction"), + "mfirst", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m4"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Wait timer = 8s, plus invocation overhead. Generous timeout for CI variance. 
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32(); + var winnerName = doc.RootElement.GetProperty("WinnerName").GetString(); + var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + + // At least one item succeeded — the workflow short-circuited as soon as + // the first win materialised. The fastest item is index 1 (1s wait). + Assert.True(successCount >= 1, $"Expected >= 1 successful item, got {successCount}"); + Assert.True(winnerIndex >= 0 && winnerIndex < 4, + $"WinnerIndex should be a valid item index, got {winnerIndex}"); + Assert.NotNull(winnerName); + Assert.NotEqual("FailureToleranceExceeded", completionReason); + + // Service-side: the parent CONTEXT and at least the winning item CONTEXT + // succeeded. Other items' final state is timing-dependent (the + // orchestrator does not cancel in-flight units on short-circuit). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + var parentSucceeded = events.FirstOrDefault(e => + e.EventType == EventType.ContextSucceeded && e.Name == "race"); + Assert.NotNull(parentSucceeded); + + // The winning item's CONTEXT SUCCEEDED is in the history. 
+ Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs new file mode 100644 index 000000000..b1c3f1e1a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapFlatNestingTest.cs @@ -0,0 +1,126 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapFlatNestingTest +{ + private readonly ITestOutputHelper _output; + public MapFlatNestingTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic operation ID the SDK assigns. Item op ids are + /// SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way under + /// the item op id. Reproduced locally because OperationIdGenerator is internal + /// to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end map: three items, each with a + /// step + a durable wait (the wait forces a suspend/resume cycle so the map + /// actually replays). Verifies the Flat-specific contract against the real + /// durable-execution service: + /// 1. NO per-item CONTEXT events are emitted — only the parent Map CONTEXT. + /// 2. Each item's inner step/wait ops RE-PARENT to the Map op (the nearest + /// non-virtual ancestor), since the virtual item emits no CONTEXT + /// checkpoint to reference as a parent. + /// 3. Inner-op ids are still derived from the item op id space. + /// 4. 
The per-item result survives replay (read back from the inline parent + /// payload, not a per-item checkpoint). + /// + [Fact] + public async Task Map_Flat_SuppressesItemContexts_AndReparentsInnerOps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapFlatNestingFunction"), + "mflat", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "mf1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The map parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var itemOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each item's "generate" step is the 1st inner op under that item's own + // id space: SHA256("-1"). + var expectedStepIds = itemOpIds.Select(i => HashOpId($"{i}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three items' inner step + // + wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Exactly ONE CONTEXT operation exists — the parent Map op. No per-item + // CONTEXT events under Flat. 
+ var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && itemOpIds.Contains(e.Id))); + + // 2. Each item's "generate" step re-parents to the Map op (NOT to its + // virtual item op). + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + // 3. ...but the step ids are still derived from the per-item id space, so + // the three items' first steps are distinct and match the expected + // SHA256("-1") values. + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened with no per-item checkpoint. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response carries the joined per-item results. 
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs new file mode 100644 index 000000000..6ee451049 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapHappyPathTest.cs @@ -0,0 +1,75 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapHappyPathTest +{ + private readonly ITestOutputHelper _output; + public MapHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy-path map: three items each processed in a step, and the + /// workflow returns the joined results. Validates the parent CONTEXT and + /// per-item CONTEXT checkpoints all land in the service-side history with the + /// correct (ItemNamer-derived) names and ordering. + /// + [Fact] + public async Task Map_AllItemsSucceed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapHappyPathFunction"), + "mhappy", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m1"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The user-visible payload contains all three item outputs in index + // order (the SDK preserves index order even when items race). 
+ Assert.Contains("order-1-m1", responsePayload); + Assert.Contains("order-2-m1", responsePayload); + Assert.Contains("order-3-m1", responsePayload); + + // History is eventually consistent — wait until the parent CONTEXT and + // all three item CONTEXT checkpoints are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + // Parent + 3 items = 4 ContextStarted, 4 ContextSucceeded. + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded)); + + // The three items show up by their ItemNamer name on their own + // ContextStarted events. + var startedNames = events + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Name) + .ToList(); + Assert.Contains("process_all", startedNames); + Assert.Contains("item-order-1", startedNames); + Assert.Contains("item-order-2", startedNames); + Assert.Contains("item-order-3", startedNames); + + // Each item ran one step => 3 StepSucceeded. + Assert.Equal(3, events.Count(e => e.EventType == EventType.StepSucceeded)); + + // No item failed.
+ Assert.Empty(events.Where(e => e.EventType == EventType.ContextFailed)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs new file mode 100644 index 000000000..7c55418e7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapMaxConcurrencyTest.cs @@ -0,0 +1,69 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapMaxConcurrencyTest +{ + private readonly ITestOutputHelper _output; + public MapMaxConcurrencyTest(ITestOutputHelper output) => _output = output; + + /// + /// 6 items, each with a 2-second durable wait, MaxConcurrency = 2. Validates + /// the semaphore actually throttles dispatch: timestamps must cluster into + /// waves rather than all six firing simultaneously. Timing tolerance is + /// intentionally generous to avoid CI flakiness; the load-bearing assertion + /// is "not all 6 ran at once". + /// + [Fact] + public async Task Map_MaxConcurrency_ThrottlesItemDispatch() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapMaxConcurrencyFunction"), + "mmaxc", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m5"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 waves x 2s waits + invocation overhead. Allow generous headroom. 
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + Assert.Equal(6, successCount); + + var timestamps = doc.RootElement.GetProperty("Timestamps") + .EnumerateArray().Select(t => t.GetInt64()).ToList(); + Assert.Equal(6, timestamps.Count); + + var sorted = timestamps.OrderBy(t => t).ToList(); + var minTs = sorted[0]; + var relative = sorted.Select(t => t - minTs).ToList(); + _output.WriteLine($"Relative timestamps (ms): {string.Join(", ", relative)}"); + + // Tolerant clustering: with MaxConcurrency=2 and 2s waits, the first wave + // should hold ~2 items. Strict 3-wave clustering can be flaky under + // service jitter, so we assert the weaker (still meaningful) property: + // at most 3 of the 6 items (the limit of 2 plus one of slack) start within the first 1500 ms. + var firstWave = relative.Count(r => r < 1500); + Assert.True(firstWave <= 3, + $"Expected MaxConcurrency=2 to limit the first wave to ~2 items; got {firstWave} within 1500ms of start. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + + // The full set must span at least one wave-gap (~2s) — proving items did + // NOT all run at once. + var total = sorted[^1] - sorted[0]; + Assert.True(total >= 1500, + $"Expected items to span >= 1500ms (proves throttling); got {total}ms.
" + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs new file mode 100644 index 000000000..6a29c18df --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapPartialFailureTest.cs @@ -0,0 +1,75 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapPartialFailureTest +{ + private readonly ITestOutputHelper _output; + public MapPartialFailureTest(ITestOutputHelper output) => _output = output; + + /// + /// Three items, one throws, two succeed — with NO config supplied. Map's + /// default CompletionConfig is AllCompleted() (permissive), + /// unlike Parallel's AllSuccessful(). This validates the headline + /// Map-vs-Parallel behavioral difference end-to-end: a partial failure does + /// NOT fail the workflow; it surfaces success/failure counts and per-item + /// errors through the service round-trip and back into the rebuilt + /// . 
+ /// + [Fact] + public async Task Map_PartialFailure_DefaultIsPermissive_ReportsCounts() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapPartialFailureFunction"), + "mpartial", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m2"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + // Permissive default means partial failure is NOT a workflow failure — + // the workflow accepted the failure and returned a result. + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32(); + var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString(); + + Assert.Equal(2, successCount); + Assert.Equal(1, failureCount); + Assert.NotNull(errorSummary); + Assert.Contains("intentional partial failure", errorSummary); + + // History: 1 parent + 3 items = 4 ContextStarted; 3 ContextSucceeded + // (parent + 2 ok items); 1 ContextFailed (the boom item). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false) + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List<Event>(); + + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded)); + Assert.Equal(1, events.Count(e => e.EventType == EventType.ContextFailed)); + + // The failing item's checkpoint preserves the exception message. Its + // branch name is the default index ("1", the middle item). + var failedEvent = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed); + Assert.NotNull(failedEvent); + Assert.Equal("1", failedEvent!.Name); + Assert.Contains("intentional partial failure", + failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs new file mode 100644 index 000000000..02b867958 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/MapReplayDeterminismTest.cs @@ -0,0 +1,114 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class MapReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public MapReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + /// + /// Each item's operation ID must equal SHA-256(parentOpId + "-" + (index+1)) + /// (matching OperationIdGenerator's CreateChild contract). Reproduced locally + /// because OperationIdGenerator is internal to the SDK.
+ /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// Three map items, each containing a step + a durable wait (the wait forces + /// a suspend/resume cycle so the map actually replays). Verifies: + /// 1. The item operation IDs match the deterministic + /// SHA256("<parentId>-<n>") formula (the same one used by + /// OperationIdGenerator.CreateChild and the reference Java/JS/Python SDKs). + /// 2. Each item's user-visible step result is preserved across replay (the + /// GUID generated inside generate survives suspend/resume). + /// + [Fact] + public async Task Map_ItemOperationIds_AreDeterministic_AcrossReplay() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("MapReplayDeterminismFunction"), + "mreplay", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "m6"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The map parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var expectedItemIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + + // Wait until each item's CONTEXT SUCCEEDED is visible AND each item's + // step/wait events are visible (they live under the item operation IDs). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List<Event>(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List<Event>(); + + // 1. Item operation IDs match the deterministic hash. + var itemStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedItemIds = itemStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedItemIds.Count); + foreach (var expected in expectedItemIds) + { + Assert.Contains(expected, observedItemIds); + } + + // 2. Each item's CONTEXT succeeded (parent named "fanout" excluded). + var itemSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, itemSucceededEvents.Count); + + // 3. Each item's "generate" step succeeded exactly once — proving replay + // returned the cached step result rather than re-executing. + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened. + var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response contains the per-item step results + // (proving they survived replay).
+ Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs new file mode 100644 index 000000000..77305ebef --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFailureToleranceTest.cs @@ -0,0 +1,70 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFailureToleranceTest +{ + private readonly ITestOutputHelper _output; + public ParallelFailureToleranceTest(ITestOutputHelper output) => _output = output; + + /// + /// Five branches, two fail, ToleratedFailureCount=1. The parallel must surface a + /// ParallelException with reason + /// FailureToleranceExceeded; the workflow must + /// terminate FAILED. Validates the failure-tolerance short-circuit and that + /// ParallelException propagates as the workflow's terminal error. + /// + [Fact] + public async Task Parallel_FailureToleranceExceeded_FailsWorkflow() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFailureToleranceFunction"), + "ptol", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p3"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + // Failed workflows return null payload to the Invoke caller — locate the + // execution by name to inspect its terminal status.
+ var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("FAILED", status, ignoreCase: true); + + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Error); + // ParallelException is the terminal error type the SDK throws when the + // failure-tolerance short-circuit fires. + var errorType = execution.Error.ErrorType ?? string.Empty; + var errorMessage = execution.Error.ErrorMessage ?? string.Empty; + Assert.True( + errorType.Contains("ParallelException", StringComparison.Ordinal) + || errorMessage.Contains("Parallel", StringComparison.OrdinalIgnoreCase), + $"Expected error to indicate ParallelException; got type='{errorType}' message='{errorMessage}'"); + + // History: wait until >= 2 ContextFailed events AND the parent's ("tolerance") failure are visible, since both are asserted below. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 3 + && (h.Events?.Count(e => e.EventType == EventType.ContextFailed) ?? 0) >= 2 && (h.Events?.Any(e => e.EventType == EventType.ContextFailed && e.Name == "tolerance") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List<Event>(); + + // At least 2 branches failed (the third may or may not have been + // dispatched depending on race; the parent CONTEXT itself also fails). + Assert.True( + events.Count(e => e.EventType == EventType.ContextFailed) >= 2, + $"Expected >= 2 ContextFailed events; got {events.Count(e => e.EventType == EventType.ContextFailed)}"); + + // The parent context (named "tolerance") records the aggregate failure.
+ var parentFailed = events.FirstOrDefault(e => + e.EventType == EventType.ContextFailed && e.Name == "tolerance"); + Assert.NotNull(parentFailed); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs new file mode 100644 index 000000000..fedc538fb --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFirstSuccessfulTest.cs @@ -0,0 +1,81 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFirstSuccessfulTest +{ + private readonly ITestOutputHelper _output; + public ParallelFirstSuccessfulTest(ITestOutputHelper output) => _output = output; + + /// + /// Four branches with staggered durable waits, FirstSuccessful: as + /// soon as one branch completes, the parallel resolves. In-flight branches + /// are left running (status Started) rather than being + /// cancelled. In the service history only the parent and the winning branch + /// are asserted; the other branches' final state is timing-dependent. + /// + [Fact] + public async Task Parallel_FirstSuccessful_ShortCircuitsOnFirstWin() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFirstSuccessfulFunction"), + "pfirst", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p4"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // Wait timer = 8s, plus invocation overhead. Generous timeout for + // CI variance.
+ var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The workflow's response payload reports the winning branch. + using var doc = JsonDocument.Parse(responsePayload); + var winnerIndex = doc.RootElement.GetProperty("WinnerIndex").GetInt32(); + var winnerName = doc.RootElement.GetProperty("WinnerName").GetString(); + var completionReason = doc.RootElement.GetProperty("CompletionReason").GetString(); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + + // At least one branch succeeded — the workflow short-circuited as soon + // as the first win materialised. + Assert.True(successCount >= 1, $"Expected >= 1 successful branch, got {successCount}"); + Assert.True(winnerIndex >= 0 && winnerIndex < 4, + $"WinnerIndex should be a valid branch index, got {winnerIndex}"); + Assert.NotNull(winnerName); + + // CompletionReason is MinSuccessfulReached only if some branch was left + // un-dispatched at the time the threshold was met. With unbounded + // concurrency every branch dispatches immediately, so the reason is + // AllCompleted (all dispatched branches finished). Either reason is + // acceptable — just ensure it isn't FailureToleranceExceeded. + Assert.NotEqual("FailureToleranceExceeded", completionReason); + + // Service-side: the parent CONTEXT and at least one branch CONTEXT + // succeeded. Other branches' final state is timing-dependent — they + // could be Started (left in flight) or Succeeded (completed before + // the parent's CONTEXT SUCCEED was flushed). The orchestrator + // deliberately does not cancel in-flight branches once the + // short-circuit fires. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Any(e => e.EventType == EventType.ContextSucceeded && e.Name == "race") ?? false), + TimeSpan.FromSeconds(60)); + var events = history.Events ?? 
new List<Event>(); + + var parentSucceeded = events.FirstOrDefault(e => + e.EventType == EventType.ContextSucceeded && e.Name == "race"); + Assert.NotNull(parentSucceeded); + + // The winning branch's CONTEXT SUCCEEDED is in the history. + Assert.Contains(events, e => e.EventType == EventType.ContextSucceeded && e.Name == winnerName); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs new file mode 100644 index 000000000..0f3450aa2 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatNestingTest.cs @@ -0,0 +1,135 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFlatNestingTest +{ + private readonly ITestOutputHelper _output; + public ParallelFlatNestingTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic operation ID the SDK assigns. Branch op ids + /// are SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way + /// under the branch op id. Reproduced locally because OperationIdGenerator is + /// internal to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end NestingType.Flat parallel: three branches, each + /// with a step + a durable wait (the wait forces a suspend/resume cycle so the + /// parallel actually replays). Verifies the Flat-specific contract against the + /// real durable-execution service: + /// 1. NO per-branch CONTEXT events are emitted — only the parent Parallel + /// CONTEXT.
(Under Nested there would be 4 ContextStarted; under Flat, + /// exactly 1.) + /// 2. Each branch's inner step/wait ops RE-PARENT to the Parallel op (the + /// nearest non-virtual ancestor), since the virtual branch emits no + /// CONTEXT checkpoint to reference as a parent. + /// 3. Inner-op ids are still derived from the branch op id (so the three + /// branches' first steps don't collide), even though they report the + /// Parallel op as parent. + /// 4. The per-branch result survives replay (the GUID generated inside + /// generate is preserved across suspend/resume — read back from the + /// inline parent payload, not a per-branch checkpoint). + /// + [Fact] + public async Task Parallel_Flat_SuppressesBranchContexts_AndReparentsInnerOps() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFlatNestingFunction"), + "pflat", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "pf1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var branchOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each branch's "generate" step is the 1st inner op under that branch's + // own id space: SHA256("<branchOpId>-1"). + var expectedStepIds = branchOpIds.Select(b => HashOpId($"{b}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three branches' inner + // step + wait events are visible.
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List<Event>(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List<Event>(); + + // 1. Exactly ONE CONTEXT operation exists — the parent Parallel op. No + // per-branch CONTEXT events under Flat. + var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && branchOpIds.Contains(e.Id))); + + // 2. Each branch's "generate" step re-parents to the Parallel op (NOT to + // its virtual branch op). + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + // 3. ...but the step ids are still derived from the per-branch id space, + // so the three branches' first steps are distinct and match the expected + // SHA256("<branchOpId>-1") values. + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. The "generate" step succeeded exactly once per branch — proving + // replay returned the cached result rather than re-executing (one StepSucceeded event per distinct step id). + Assert.Equal(observedStepIds.Count, generateSteps.Count); + + // 5. The wait events span at least 2 invocations (suspend + resume), + // proving replay actually happened with no per-branch checkpoint.
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 6. The user-visible response carries the joined per-branch results. + Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs new file mode 100644 index 000000000..21db02c6f --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelFlatOverflowTest.cs @@ -0,0 +1,161 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelFlatOverflowTest +{ + private readonly ITestOutputHelper _output; + public ParallelFlatOverflowTest(ITestOutputHelper output) => _output = output; + + /// + /// Reproduces the deterministic operation ID the SDK assigns. Branch op ids + /// are SHA-256(parentOpId + "-" + (index+1)); inner-op ids nest the same way + /// under the branch op id. Reproduced locally because OperationIdGenerator is + /// internal to the SDK. + /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// End-to-end exercise of the LARGE-PAYLOAD OVERFLOW + ReplayChildren replay path + /// for a parallel. 
+ /// + /// Three branches each return a deterministic ~150 KB string (~450 KB aggregate), + /// which exceeds the 256 KB checkpoint threshold, so the parallel OVERFLOWS: the SDK + /// checkpoints a STRIPPED summary (no inline results) and sets + /// ContextOptions.ReplayChildren=true on the parent CONTEXT op. + /// + /// The workflow is shaped to actually drive the RECOVERY path (ReplayChildrenAsync): + /// - invoke 1: branches suspend on their in-branch waits -> PENDING. + /// - invoke 2: the parallel re-runs the branches, overflow-checkpoints the parent + /// as SUCCEEDED + ReplayChildren, then suspends on the post-parallel + /// "post-overflow" wait (so the parallel does NOT also return in this invoke). + /// - invoke 3: re-enters the already-terminal SUCCEEDED + ReplayChildren parallel, + /// routing through ReplayChildrenAsync to RE-EXECUTE the branch bodies and + /// recover the stripped values (reading per-unit Status/CompletionReason from the + /// frozen summary, never re-checkpointing). The final result is computed from + /// those recovered values. + /// + /// This test proves the whole path works against the real durable-execution service: + /// 1. The execution SUCCEEDED — proving the overflow checkpoint was accepted AND + /// ReplayChildrenAsync correctly reconstructed the aggregate result. (If the + /// ReplayChildren recovery path were broken, reconstruction would fail and the + /// execution would FAIL/TIME_OUT.) + /// 2. Exactly ONE parent CONTEXT op exists — Flat emits no per-branch CONTEXT. + /// 3. The three "generate" steps succeeded and re-parent to the Parallel op. + /// 4. There were >= 3 InvocationCompleted events (initial PENDING + the resume that + /// overflow-checkpoints the parallel + the post-overflow resume that runs + /// ReplayChildrenAsync) — proving the parallel was re-entered while terminal, so + /// the ReplayChildren recovery path really ran. + /// 5. 
The FINAL execution result (read via GetExecutionAsync after SUCCEEDED, not + /// the first PENDING invoke response) reports the recovered per-branch lengths + /// ("153600" x3) and first chars ("abc") — proving the large deterministic + /// values were recovered EXACTLY by ReplayChildrenAsync, not lost or defaulted. + /// + [Fact] + public async Task Parallel_Flat_Overflow_ReplaysChildren_AndRecoversLargeResults() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelFlatOverflowFunction"), + "pflow", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "po1"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // SUCCEEDED alone proves the >256 KB overflow checkpoint was accepted and that + // ReplayChildrenAsync (re-entered on the post-overflow resume) reconstructed the + // result. A broken overflow recovery would FAIL or TIME_OUT here. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var branchOpIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + // Each branch's "generate" step is the 1st inner op under that branch's own id + // space: SHA256("<branchOpId>-1"). + var expectedStepIds = branchOpIds.Select(b => HashOpId($"{b}-1")).ToList(); + + // Wait until the parent CONTEXT succeeded and all three branches' inner step + + // wait events are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ??
new List(); + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 1) return false; + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 2. Exactly ONE CONTEXT operation exists — the parent Parallel op. No + // per-branch CONTEXT events under Flat (even on the overflow path). + var contextStartedIds = allEvents + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Id) + .Distinct() + .ToList(); + Assert.Equal(new[] { parentOpId }, contextStartedIds); + Assert.Empty(allEvents.Where(e => + e.EventType == EventType.ContextStarted && branchOpIds.Contains(e.Id))); + + // 3. Each branch's "generate" step re-parents to the Parallel op (NOT to its + // virtual branch op), and the three step ids match the per-branch id space. + var generateSteps = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, generateSteps.Count); + Assert.All(generateSteps, e => Assert.Equal(parentOpId, e.ParentId)); + + var observedStepIds = generateSteps.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedStepIds.Count); + foreach (var expected in expectedStepIds) + { + Assert.Contains(expected, observedStepIds); + } + + // 4. There are at least 3 invocations: the initial PENDING, the resume that + // overflow-checkpoints the parallel and suspends on the post-overflow wait, and + // the post-overflow resume that re-enters the already-terminal parallel and runs + // ReplayChildrenAsync. >= 3 proves the parallel was re-entered while terminal, so + // the ReplayChildren recovery path really ran (>= 2 alone would only prove a + // single suspend/resume cycle). 
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 3, + $"Expected >= 3 InvocationCompleted events (initial + overflow-checkpoint resume + post-overflow ReplayChildren resume), got {invocations.Count}"); + + // 5. The FINAL execution result (NOT the first invoke response, which is PENDING + // because the branch waits suspend it) reports the recovered per-branch metadata. + // Each branch produced a 150 KB (153600-byte) string built from its branch char, + // so a correct ReplayChildrenAsync recovery yields lengths "153600,153600,153600" + // and first chars "abc". This proves the large values were recovered EXACTLY by + // the ReplayChildren path, not lost or defaulted. + var execution = await deployment.GetExecutionAsync(arn!); + Assert.NotNull(execution.Result); + Assert.Contains("\"Lengths\":\"153600,153600,153600\"", execution.Result, StringComparison.OrdinalIgnoreCase); + Assert.Contains("\"FirstChars\":\"abc\"", execution.Result, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs new file mode 100644 index 000000000..0895f8796 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelHappyPathTest.cs @@ -0,0 +1,72 @@ +using System.Linq; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelHappyPathTest +{ + private readonly ITestOutputHelper _output; + public ParallelHappyPathTest(ITestOutputHelper output) => _output = output; + + /// + /// End-to-end happy-path parallel: three branches run concurrently, each + /// produces a string, and the workflow returns the joined results. 
Validates + /// the parent CONTEXT and per-branch CONTEXT checkpoints all land in the + /// service-side history with the correct names and ordering. + /// + [Fact] + public async Task Parallel_AllBranchesSucceed() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelHappyPathFunction"), + "phappy", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p1"}"""); + Assert.Equal(200, invokeResponse.StatusCode); + + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The user-visible payload contains all three branch outputs in + // declaration order (the SDK preserves index order even when branches + // race). + Assert.Contains("alpha-p1", responsePayload); + Assert.Contains("beta-p1", responsePayload); + Assert.Contains("gamma-p1", responsePayload); + + // History is eventually consistent — wait until the parent CONTEXT and + // all three child CONTEXT checkpoints are visible. + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 0) >= 4, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + // Parent + 3 branches = 4 ContextStarted, 4 ContextSucceeded. + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextSucceeded)); + + // The three branches show up by name on their own ContextStarted events. 
+ var startedNames = events + .Where(e => e.EventType == EventType.ContextStarted) + .Select(e => e.Name) + .ToList(); + Assert.Contains("fanout", startedNames); + Assert.Contains("alpha", startedNames); + Assert.Contains("beta", startedNames); + Assert.Contains("gamma", startedNames); + + // No branch failed. + Assert.Empty(events.Where(e => e.EventType == EventType.ContextFailed)); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs new file mode 100644 index 000000000..e228cdc22 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelMaxConcurrencyTest.cs @@ -0,0 +1,76 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelMaxConcurrencyTest +{ + private readonly ITestOutputHelper _output; + public ParallelMaxConcurrencyTest(ITestOutputHelper output) => _output = output; + + /// + /// 6 branches, each with a 2-second durable wait, MaxConcurrency = 2. + /// Validates the semaphore actually throttles dispatch: timestamps must + /// cluster into 3 waves of 2 (not all six firing simultaneously). Timing + /// tolerance is intentionally generous (±2s per wave gap) to avoid CI + /// flakiness; if the wave-clustering proves flaky, fall back to + /// "all 6 succeeded". 
+ /// + [Fact] + public async Task Parallel_MaxConcurrency_ThrottlesBranchDispatch() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelMaxConcurrencyFunction"), + "pmaxc", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p5"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + // 3 waves x 2s waits + invocation overhead. Allow generous headroom + // for service scheduling latency. + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(180)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + Assert.Equal(6, successCount); + + var timestamps = doc.RootElement.GetProperty("Timestamps") + .EnumerateArray().Select(t => t.GetInt64()).ToList(); + Assert.Equal(6, timestamps.Count); + + // Sort timestamps and check whether they cluster into 3 groups of 2. + // Wave-N timestamps should be roughly 2s apart from wave-(N-1). + // Use generous tolerance (±1500ms within a wave; >= 800ms gap between + // waves) — service-driven invocations have observable jitter. + var sorted = timestamps.OrderBy(t => t).ToList(); + var minTs = sorted[0]; + var relative = sorted.Select(t => t - minTs).ToList(); + _output.WriteLine($"Relative timestamps (ms): {string.Join(", ", relative)}"); + + // Tolerant clustering: split timestamps by 1500ms gaps. With + // MaxConcurrency=2 and 2s waits, we expect at least 2 distinct waves. 
+ // Strict 3-wave clustering can be flaky due to service jitter, so we + // assert the weaker (but still meaningful) property: not all 6 + // branches fired in the same wave. + var firstWave = relative.Where(r => r < 1500).Count(); + Assert.True(firstWave <= 3, + $"Expected MaxConcurrency=2 to limit the first wave to ~2 branches; got {firstWave} within 1500ms of start. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + + // The full set must span at least one wave-gap (~2s) — i.e., total + // elapsed must exceed ~2s, proving branches did NOT all run at once. + var total = sorted[^1] - sorted[0]; + Assert.True(total >= 1500, + $"Expected branches to span >= 1500ms (proves throttling); got {total}ms. " + + $"Relative timestamps: [{string.Join(", ", relative)}]"); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs new file mode 100644 index 000000000..28adf7549 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelPartialFailureTest.cs @@ -0,0 +1,74 @@ +using System.Linq; +using System.Text; +using System.Text.Json; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelPartialFailureTest +{ + private readonly ITestOutputHelper _output; + public ParallelPartialFailureTest(ITestOutputHelper output) => _output = output; + + /// + /// Three branches, one throws, two succeed. With CompletionConfig.AllCompleted() + /// the parallel does NOT throw — it surfaces success/failure counts and the + /// per-branch errors. Validates per-branch error preservation through the + /// service round-trip and back into the rebuilt <see cref="IBatchResult{T}"/>. 
+ /// + [Fact] + public async Task Parallel_PartialFailure_AllCompleted_ReportsCounts() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelPartialFailureFunction"), + "ppartial", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p2"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(60)); + // AllCompleted means partial failure is NOT a workflow failure — the + // user accepted the failure and returned a result. + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // Decode the workflow result payload and verify the counts surface correctly. + using var doc = JsonDocument.Parse(responsePayload); + var successCount = doc.RootElement.GetProperty("SuccessCount").GetInt32(); + var failureCount = doc.RootElement.GetProperty("FailureCount").GetInt32(); + var errorSummary = doc.RootElement.GetProperty("ErrorSummary").GetString(); + + Assert.Equal(2, successCount); + Assert.Equal(1, failureCount); + Assert.NotNull(errorSummary); + // The originating exception type is captured on the rebuilt + // ChildContextException when reconstructing the batch. + Assert.Contains("intentional partial failure", errorSummary); + + // History: 1 parent + 3 branches = 4 ContextStarted; 3 ContextSucceeded + // (parent + 2 ok branches); 1 ContextFailed (the boom branch). + var history = await deployment.WaitForHistoryAsync( + arn!, + h => (h.Events?.Count(e => e.EventType == EventType.ContextStarted) ?? 0) >= 4 + && (h.Events?.Any(e => e.EventType == EventType.ContextFailed) ?? false) + && (h.Events?.Count(e => e.EventType == EventType.ContextSucceeded) ?? 
0) >= 3, + TimeSpan.FromSeconds(60)); + var events = history.Events ?? new List(); + + Assert.Equal(4, events.Count(e => e.EventType == EventType.ContextStarted)); + Assert.Equal(3, events.Count(e => e.EventType == EventType.ContextSucceeded)); + Assert.Equal(1, events.Count(e => e.EventType == EventType.ContextFailed)); + + // The failing branch's checkpoint preserves the exception message. + var failedEvent = events.SingleOrDefault(e => e.EventType == EventType.ContextFailed); + Assert.NotNull(failedEvent); + Assert.Equal("boom", failedEvent!.Name); + Assert.Contains("intentional partial failure", + failedEvent.ContextFailedDetails?.Error?.Payload?.ErrorMessage ?? string.Empty); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs new file mode 100644 index 000000000..1ad44790a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/ParallelReplayDeterminismTest.cs @@ -0,0 +1,122 @@ +using System.Linq; +using System.Security.Cryptography; +using System.Text; +using Amazon.Lambda.Model; +using Xunit; +using Xunit.Abstractions; + +namespace Amazon.Lambda.DurableExecution.IntegrationTests; + +public class ParallelReplayDeterminismTest +{ + private readonly ITestOutputHelper _output; + public ParallelReplayDeterminismTest(ITestOutputHelper output) => _output = output; + + /// + /// Each branch's operation ID must equal SHA-256(parentOpId + "-" + (index+1)) + /// (matching the OperationIdGenerator's CreateChild contract). Reproduced + /// locally because OperationIdGenerator is internal to the SDK. 
+ /// + private static string HashOpId(string raw) + { + var bytes = Encoding.UTF8.GetBytes(raw); + var hash = SHA256.HashData(bytes); + var sb = new StringBuilder(hash.Length * 2); + foreach (var b in hash) sb.Append(b.ToString("x2")); + return sb.ToString(); + } + + /// + /// Three parallel branches, each containing a step + a durable wait + /// (the wait forces a suspend/resume cycle so the parallel actually + /// replays). Verifies: + /// 1. The branch operation IDs match the deterministic + /// SHA256("<parentId>-<n>") formula (the same one used + /// by OperationIdGenerator.CreateChild and the reference Java/JS/Python SDKs). + /// 2. Each branch's user-visible step result is preserved across replay + /// (the GUID generated inside generate survives suspend/resume). + /// + [Fact] + public async Task Parallel_BranchOperationIds_AreDeterministic_AcrossReplay() + { + await using var deployment = await DurableFunctionDeployment.CreateAsync( + DurableFunctionDeployment.FindTestFunctionDir("ParallelReplayDeterminismFunction"), + "preplay", _output); + + var (invokeResponse, executionName) = await deployment.InvokeAsync("""{"orderId": "p6"}"""); + var responsePayload = Encoding.UTF8.GetString(invokeResponse.Payload.ToArray()); + _output.WriteLine($"Response: {responsePayload}"); + + var arn = await deployment.FindDurableExecutionArnByNameAsync(executionName, TimeSpan.FromSeconds(60)); + Assert.NotNull(arn); + + var status = await deployment.PollForCompletionAsync(arn!, TimeSpan.FromSeconds(120)); + Assert.Equal("SUCCEEDED", status, ignoreCase: true); + + // The parallel parent is the first root-level operation -> SHA256("1"). + var parentOpId = HashOpId("1"); + var expectedBranchIds = new[] + { + HashOpId($"{parentOpId}-1"), + HashOpId($"{parentOpId}-2"), + HashOpId($"{parentOpId}-3"), + }; + + // Wait until each branch's CONTEXT SUCCEEDED is visible AND each + // branch's step/wait events are visible (they live under the branch + // operation IDs). 
+ var history = await deployment.WaitForHistoryAsync( + arn!, + h => + { + var events = h.Events ?? new List(); + // Parent + 3 branch CONTEXTs all succeeded. + if (events.Count(e => e.EventType == EventType.ContextSucceeded) < 4) return false; + // Each branch ran one step and one wait => 3 step succeeds + 3 wait succeeds. + if (events.Count(e => e.EventType == EventType.StepSucceeded) < 3) return false; + if (events.Count(e => e.EventType == EventType.WaitSucceeded) < 3) return false; + return true; + }, + TimeSpan.FromSeconds(60)); + var allEvents = history.Events ?? new List(); + + // 1. Branch operation IDs match the deterministic hash. + var branchStartedEvents = allEvents + .Where(e => e.EventType == EventType.ContextStarted && e.Id != null && e.Id != parentOpId) + .ToList(); + var observedBranchIds = branchStartedEvents.Select(e => e.Id).Distinct().ToList(); + Assert.Equal(3, observedBranchIds.Count); + foreach (var expected in expectedBranchIds) + { + Assert.Contains(expected, observedBranchIds); + } + + // 2. Exactly three CONTEXT operations other than the one named "fanout" + // succeeded; step-to-branch ParentId linkage is not asserted here. + var branchSucceededEvents = allEvents + .Where(e => e.EventType == EventType.ContextSucceeded && e.Name != "fanout") + .ToList(); + Assert.Equal(3, branchSucceededEvents.Count); + + // 3. Each branch's "generate" step succeeded exactly once — proving + // replay returned the cached step result rather than re-executing. + // (Re-execution would manifest as duplicate StepSucceeded events for + // the same operation ID.) + var stepSucceededEvents = allEvents + .Where(e => e.EventType == EventType.StepSucceeded && e.Name == "generate") + .ToList(); + Assert.Equal(3, stepSucceededEvents.Count); + + // 4. The wait events span at least 2 invocations: one to schedule each + // wait, and at least one to resume after the timer fires. This proves + // replay actually happened. 
+ var invocations = allEvents.Where(e => e.InvocationCompletedDetails != null).ToList(); + Assert.True( + invocations.Count >= 2, + $"Expected >= 2 InvocationCompleted events (suspend + resume), got {invocations.Count}"); + + // 5. The user-visible response includes a "data" field; only its presence is + // checked here (the GUID values themselves are not validated). + Assert.Contains("\"data\"", responsePayload, StringComparison.OrdinalIgnoreCase); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..62712b6a4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/Function.cs @@ -0,0 +1,55 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new 
LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Five items, two throw. ToleratedFailureCount = 1 means a second failure + // exceeds tolerance and the map surfaces a MapException — terminating the + // workflow FAILED. + var items = new[] { "ok1", "bad1", "ok2", "bad2", "ok3" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item.StartsWith("bad")) + throw new InvalidOperationException($"{item} boom"); + return item; + }, + name: "tolerance", + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Should not reach here — the map must throw MapException. + return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFailureToleranceFunction/MapFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..d083a054b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new 
Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Four items, each waits a different (durable) duration. The shortest + // wait should win and short-circuit the map via FirstSuccessful. Wait + // durations are at least 1s (service timer granularity). The item value + // IS the wait-seconds; the result is the item's index. + var waitSeconds = new[] { 8, 1, 5, 6 }; + + var batch = await context.MapAsync( + waitSeconds, + async (ctx, seconds, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(seconds), name: $"wait_{index}"); + return index; + }, + name: "race", + config: new MapConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + var winner = batch.Succeeded.FirstOrDefault(); + return new TestResult + { + Status = "completed", + WinnerIndex = winner?.Index ?? -1, + WinnerName = winner?.Name, + CompletionReason = batch.CompletionReason.ToString(), + SuccessCount = batch.SuccessCount, + StartedCount = batch.StartedCount + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? 
CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFirstSuccessfulFunction/MapFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs new file mode 100644 index 000000000..9cd54aaba --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { 
+ var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items run under NestingType.Flat. Each item generates a fresh + // GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-item result — and under Flat that result lives inline on the parent + // Map payload, not on a per-item CONTEXT checkpoint (none are emitted). + // If Flat replay is broken, the GUID would change between the original + // execution and replay, or the inner step/wait ops would reference a + // non-existent item parent. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout", + config: new MapConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapFlatNestingFunction/MapFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs new file mode 100644 index 000000000..14da119f8 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/Function.cs @@ -0,0 +1,45 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = 
HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var orders = new[] { "order-1", "order-2", "order-3" }; + + // Each item is processed inside a step so the per-item child context + // owns a leaf operation. ItemNamer gives each item a readable branch + // name in the service-side history. + var batch = await context.MapAsync( + orders, + async (ctx, orderId, index, all) => + await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return $"{orderId}-{input.OrderId}"; }, + name: "process"), + name: "process_all", + config: new MapConfig { ItemNamer = (item, index) => $"item-{item}" }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapHappyPathFunction/MapHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs new file mode 100644 index 000000000..0499a7a93 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var 
handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 items, MaxConcurrency = 2. Each item does a 2-second durable wait + // then captures the post-wait wall-clock as a unix-ms timestamp. The + // expected outcome is 3 waves of 2 items; total elapsed ~6s. Use + // IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT durable + // and would skew this measurement under replay. + var items = new[] { 0, 1, 2, 3, 4, 5 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{index}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }, + name: "throttled", + config: new MapConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public long[]? 
Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapMaxConcurrencyFunction/MapMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs new file mode 100644 index 000000000..39676c3ed --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/Function.cs @@ -0,0 +1,63 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new 
DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items, the middle one throws. Map's DEFAULT CompletionConfig is + // AllCompleted() (permissive) — unlike Parallel's AllSuccessful() — so NO + // config is supplied here and the map must still drive every item to a + // terminal state without throwing. This is the key Map-vs-Parallel + // behavioral difference, validated end-to-end. + var items = new[] { "ok1", "boom", "ok2" }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.CompletedTask; + if (item == "boom") + throw new InvalidOperationException("intentional partial failure"); + return item; + }, + name: "partial"); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapPartialFailureFunction/MapPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..9a75cbd5e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/Function.cs @@ -0,0 +1,53 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three items. Each item generates a fresh GUID inside a step, then does + // a durable wait. The wait forces a suspend/resume cycle, so the second + // invocation MUST replay the cached GUID rather than re-running the step. + // If replay determinism is broken, the GUID would change between the + // original execution and replay. + var items = new[] { 0, 1, 2 }; + + var batch = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the map. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/MapReplayDeterminismFunction/MapReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs new file mode 100644 index 000000000..9c697710d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/Function.cs @@ -0,0 +1,60 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new 
Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Five branches, two throw. ToleratedFailureCount = 1 means a second + // failure exceeds tolerance and the parallel surfaces a ParallelException. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "1"; }), + new DurableBranch("bad1", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("bad1 boom"); + }), + new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "2"; }), + new DurableBranch("bad2", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("bad2 boom"); + }), + new DurableBranch("ok3", async (_) => { await Task.CompletedTask; return "3"; }), + }, + name: "tolerance", + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Should not reach here — the parallel must throw ParallelException. + return new TestResult { Status = "should_not_reach", SuccessCount = batch.SuccessCount }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int SuccessCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFailureToleranceFunction/ParallelFailureToleranceFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs new file mode 100644 index 000000000..2fa932dd7 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/Function.cs @@ -0,0 +1,79 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public 
static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Four branches with different durable wait durations. The shortest + // wait should win and short-circuit the parallel via FirstSuccessful. + // Wait durations are at least 1s (service timer granularity). + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("slowest", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(8), name: "wait_3"); + return 3; + }), + new DurableBranch("fastest", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(1), name: "wait_0"); + return 0; + }), + new DurableBranch("mid1", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(5), name: "wait_1"); + return 1; + }), + new DurableBranch("mid2", async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(6), name: "wait_2"); + return 2; + }), + }, + name: "race", + config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + // The winner is whichever branch came back first. Surface the index + + // its name so the test can assert one branch won. + var winner = batch.Succeeded.FirstOrDefault(); + return new TestResult + { + Status = "completed", + WinnerIndex = winner?.Index ?? -1, + WinnerName = winner?.Name, + CompletionReason = batch.CompletionReason.ToString(), + SuccessCount = batch.SuccessCount, + StartedCount = batch.StartedCount + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? 
Status { get; set; } + public int WinnerIndex { get; set; } + public string? WinnerName { get; set; } + public string? CompletionReason { get; set; } + public int SuccessCount { get; set; } + public int StartedCount { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFirstSuccessfulFunction/ParallelFirstSuccessfulFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs new file mode 100644 index 000000000..dfbd6a345 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using 
Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches run under NestingType.Flat. Each branch generates a + // fresh GUID inside a step, then does a durable wait. The wait forces a + // suspend/resume cycle, so the second invocation MUST replay the cached + // per-branch result — and under Flat that result lives inline on the + // parent Parallel payload, not on a per-branch CONTEXT checkpoint (none + // are emitted). If Flat replay is broken, the GUID would change between + // the original execution and replay, or the inner step/wait ops would + // reference a non-existent branch parent. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. 
+ await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatNestingFunction/ParallelFlatNestingFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs new file mode 100644 index 000000000..77b8e7b4d --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/Function.cs @@ -0,0 +1,103 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using 
Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + // Each branch produces a ~150 KB string. Three branches => ~450 KB of inline + // results, comfortably over the 256 KB checkpoint threshold. This forces the + // FLAT parallel aggregate to OVERFLOW: the SDK checkpoints a stripped summary + // (no inline results) and sets ContextOptions.ReplayChildren=true on the parent + // CONTEXT op, keeping the full result in memory for the current invoke. + private const int BranchPayloadSize = 150 * 1024; // 153600 bytes + + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches run under NestingType.Flat. Each branch generates a LARGE + // (~150 KB) string inside a step, then does an in-branch durable wait. The + // combined ~450 KB aggregate exceeds the 256 KB threshold, so the parallel + // OVERFLOWS: the SDK checkpoints a stripped summary (no inline per-branch + // results) + ReplayChildren=true on the parent CONTEXT op. + // + // To actually exercise the RECOVERY path (ReplayChildrenAsync), the + // already-overflowed parallel must be re-entered on a FRESH invoke while it is + // already terminal (SUCCEEDED + ReplayChildren). 
The in-branch waits alone are + // NOT enough: the resume invoke that overflow-checkpoints the parallel also + // immediately returns SUCCEEDED, so the parallel goes STARTED -> SUCCEEDED in a + // single invoke and ReplayChildrenAsync is never hit. So we add a durable wait + // AFTER ParallelAsync returns (the "post-overflow" wait below): the overflow + // invoke suspends on that wait, and the NEXT invoke re-enters the already- + // terminal overflowed parallel and routes through ReplayChildrenAsync to + // RE-EXECUTE the branch bodies and recover the stripped values (reading per-unit + // Status/CompletionReason from the frozen summary, never re-checkpointing). + // + // The branch values are built DETERMINISTICALLY from the branch character + // (NOT Guid/random/DateTime). This is critical: the value produced on the + // original execution must be IDENTICAL to the value produced on replay + // re-execution, so the test can prove the large values were recovered exactly + // rather than lost or defaulted. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", ctx => BranchAsync(ctx, 'a')), + new DurableBranch("b", ctx => BranchAsync(ctx, 'b')), + new DurableBranch("c", ctx => BranchAsync(ctx, 'c')), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // Force another invocation so the already-overflowed parallel is re-entered + // (already SUCCEEDED + ReplayChildren) and replayed via ReplayChildrenAsync, + // which re-executes the branch bodies to recover the stripped >256 KB results. + await context.WaitAsync(TimeSpan.FromSeconds(1), name: "post-overflow"); + + // Compute the verifiable metadata AFTER the post-overflow wait: on the final + // invoke these results come from ReplayChildrenAsync's re-execution, which is + // exactly the recovery we want to prove survives. 
+ var results = batch.GetResults().ToList(); + + // Keep the returned payload SMALL (well under the 6 MB Lambda response + // limit): do NOT echo the ~450 KB back. Instead return verifiable metadata + // proving the large values were recovered on replay: + // - Lengths: comma-joined per-branch result LENGTHS (e.g. "153600,153600,153600") + // - FirstChars: the first character of each recovered branch result, in order + // (e.g. "abc") — confirms each branch's deterministic content survived. + var lengths = string.Join(",", results.Select(r => r.Length)); + var firstChars = string.Concat(results.Select(r => r.Length > 0 ? r[0] : '?')); + + return new TestResult { Status = "completed", Lengths = lengths, FirstChars = firstChars }; + } + + private static async Task BranchAsync(IDurableContext ctx, char branchChar) + { + // Deterministic large payload: same branchChar => same string on original + // execution and on replay re-execution. ~150 KB per branch. + var large = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return new string(branchChar, BranchPayloadSize); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the (overflowed) parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return large; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? Lengths { get; set; } public string? 
FirstChars { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj new file mode 100644 index 000000000..f8bf7fd0c --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelFlatOverflowFunction/ParallelFlatOverflowFunction.csproj @@ -0,0 +1,18 @@ + + + + net10.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs new file mode 100644 index 000000000..b6b027f9b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/Function.cs @@ -0,0 +1,40 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = new 
DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("alpha", async (_) => { await Task.CompletedTask; return $"alpha-{input.OrderId}"; }), + new DurableBranch("beta", async (_) => { await Task.CompletedTask; return $"beta-{input.OrderId}"; }), + new DurableBranch("gamma", async (_) => { await Task.CompletedTask; return $"gamma-{input.OrderId}"; }), + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelHappyPathFunction/ParallelHappyPathFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs new file mode 100644 index 000000000..72f69913a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/Function.cs @@ -0,0 +1,67 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler = new Function(); + var serializer = 
new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // 6 branches, MaxConcurrency = 2. Each branch does a 2-second durable + // wait then captures the post-wait wall-clock as a unix-ms timestamp. + // The expected outcome is 3 waves of 2 branches; total elapsed ~6s. + // Use IDurableContext.WaitAsync (not Task.Delay) — Task.Delay is NOT + // durable and would skew this measurement under replay. + var branches = new DurableBranch[6]; + for (var i = 0; i < 6; i++) + { + var localIndex = i; + branches[i] = new DurableBranch( + $"b{localIndex}", + async (ctx) => + { + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: $"wait_{localIndex}"); + return DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + }); + } + + var batch = await context.ParallelAsync( + branches, + name: "throttled", + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + Timestamps = batch.GetResults().ToArray() + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public long[]? 
Timestamps { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelMaxConcurrencyFunction/ParallelMaxConcurrencyFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs new file mode 100644 index 000000000..51b35f19b --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/Function.cs @@ -0,0 +1,61 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] args) + { + var handler 
= new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("ok1", async (_) => { await Task.CompletedTask; return "first"; }), + new DurableBranch("boom", async (_) => + { + await Task.CompletedTask; + throw new InvalidOperationException("intentional partial failure"); + }), + new DurableBranch("ok2", async (_) => { await Task.CompletedTask; return "third"; }), + }, + name: "partial", + // AllCompleted: drive every branch to terminal state regardless of failure. + // Without this, the default AllSuccessful() would throw on the first failure. + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var errors = batch.GetErrors(); + var errorSummary = string.Join("|", errors.Select(e => $"{e.GetType().Name}:{e.Message}")); + + return new TestResult + { + Status = "completed", + SuccessCount = batch.SuccessCount, + FailureCount = batch.FailureCount, + ErrorSummary = errorSummary + }; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult +{ + public string? Status { get; set; } + public int SuccessCount { get; set; } + public int FailureCount { get; set; } + public string? 
ErrorSummary { get; set; } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelPartialFailureFunction/ParallelPartialFailureFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile new file mode 100644 index 000000000..c1913d56a --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Dockerfile @@ -0,0 +1,7 @@ +FROM public.ecr.aws/lambda/provided:al2023 + +RUN dnf install -y libicu + +COPY bin/publish/ ${LAMBDA_TASK_ROOT} + +ENTRYPOINT ["/var/task/bootstrap"] diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs new file mode 100644 index 000000000..195c9b497 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/Function.cs @@ -0,0 +1,57 @@ +using Amazon.Lambda.Core; +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.RuntimeSupport; +using Amazon.Lambda.Serialization.SystemTextJson; + +namespace DurableExecutionTestFunction; + +public class Function +{ + public static async Task Main(string[] 
args) + { + var handler = new Function(); + var serializer = new DefaultLambdaJsonSerializer(); + using var handlerWrapper = HandlerWrapper.GetHandlerWrapper(handler.Handler, serializer); + using var bootstrap = new LambdaBootstrap(handlerWrapper); + await bootstrap.RunAsync(); + } + + public Task Handler( + DurableExecutionInvocationInput input, ILambdaContext context) + => DurableFunction.WrapAsync(Workflow, input, context); + + private async Task Workflow(TestEvent input, IDurableContext context) + { + // Three branches. Each branch generates a fresh GUID inside a step, + // then does a durable wait. The wait forces a suspend/resume cycle, + // so the second invocation MUST replay the cached GUID rather than + // re-running the step. If replay determinism is broken, the GUID + // would change between the original execution and replay. + var batch = await context.ParallelAsync( + new[] + { + new DurableBranch("a", BranchAsync), + new DurableBranch("b", BranchAsync), + new DurableBranch("c", BranchAsync), + }, + name: "fanout"); + + var joined = string.Join(",", batch.GetResults()); + return new TestResult { Status = "completed", Data = joined }; + } + + private static async Task BranchAsync(IDurableContext ctx) + { + var generatedId = await ctx.StepAsync( + async (_) => { await Task.CompletedTask; return Guid.NewGuid().ToString(); }, + name: "generate"); + + // Force a suspend/resume cycle to trigger replay of the parallel. + await ctx.WaitAsync(TimeSpan.FromSeconds(2), name: "boundary"); + + return generatedId; + } +} + +public class TestEvent { public string? OrderId { get; set; } } +public class TestResult { public string? Status { get; set; } public string? 
Data { get; set; } } diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj new file mode 100644 index 000000000..6f5f657e4 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/TestFunctions/ParallelReplayDeterminismFunction/ParallelReplayDeterminismFunction.csproj @@ -0,0 +1,18 @@ + + + + net8.0 + Exe + true + bootstrap + enable + enable + + + + + + + + + diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs index effeb5804..d5e91ec37 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CheckpointBatcherTests.cs @@ -172,6 +172,58 @@ public async Task EnqueueAsync_AfterDispose_Throws() await Assert.ThrowsAnyAsync(() => batcher.EnqueueAsync(Update("0-step"))); } + private static SdkOperationUpdate UpdateWithPayload(string id, int payloadBytes) => new() + { + Id = id, + Type = "CONTEXT", + Action = "SUCCEED", + Payload = new string('p', payloadBytes) + }; + + [Fact] + public async Task EnqueueAsync_ByteCap_SplitsBatchesByBytes() + { + var batchByteTotals = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => + { + long sum = 0; + foreach (var o in ops) sum += o.Payload?.Length ?? 0; + batchByteTotals.Add(sum); + return Task.FromResult(token); + }, + new CheckpointBatcherConfig + { + MaxBatchBytes = 10 * 1024, + FlushInterval = TimeSpan.FromMilliseconds(100) + }); + + // Three 6 KB payloads: at most one fits per 10 KB batch with overhead. 
+ var tasks = Enumerable.Range(0, 3) + .Select(i => batcher.EnqueueAsync(UpdateWithPayload($"{i}", 6 * 1024))) + .ToArray(); + await Task.WhenAll(tasks); + await batcher.DrainAsync(); + + Assert.True(batchByteTotals.Count >= 2, "expected the byte cap to split into multiple batches"); + Assert.All(batchByteTotals, total => Assert.True(total <= 10 * 1024)); + } + + [Fact] + public async Task EnqueueAsync_SingleOversizedItem_SentAloneNoLoop() + { + var batches = new List(); + var batcher = new CheckpointBatcher("token-0", + (token, ops, ct) => { batches.Add(ops.Count); return Task.FromResult(token); }, + new CheckpointBatcherConfig { MaxBatchBytes = 4 * 1024 }); + + await batcher.EnqueueAsync(UpdateWithPayload("huge", 50 * 1024)); + await batcher.DrainAsync(); + + Assert.Single(batches); + Assert.Equal(1, batches[0]); + } + [Fact] public async Task CheckpointToken_UpdatesAfterEachFlush() { diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs index 3aa182248..137f83ad8 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -501,6 +501,94 @@ await Assert.ThrowsAsync(() => name: "phase")); } + [Fact] + public async Task RunInChildContextAsync_ResultOverThreshold_EmitsEmptyPayloadAndReplayChildren() + { + var (context, recorder, _, _) = CreateContext(); + var big = new string('y', 300 * 1024); + + var result = await context.RunInChildContextAsync( + async (_) => { await Task.Yield(); return big; }, + name: "phase"); + + Assert.Equal(big, result); // in-memory value intact for this invoke + + await recorder.Batcher.DrainAsync(); + + var succeed = recorder.Flushed.Single(o => + o.Type == "CONTEXT" && o.Action == "SUCCEED"); + Assert.Equal(string.Empty, succeed.Payload); + Assert.NotNull(succeed.ContextOptions); + 
Assert.True(succeed.ContextOptions.ReplayChildren); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayChildren_ReExecutesBodyWithoutRecheckpoint() + { + var childOpId = IdAt(1); // first root-level op + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = childOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "phase", + // Result == "" matches the overflow emission (string.Empty). + ContextDetails = new ContextDetails { Result = "", ReplayChildren = true } + } + } + }); + + var executed = false; + var result = await context.RunInChildContextAsync( + async (_) => { executed = true; await Task.Yield(); return "rebuilt"; }, + name: "phase"); + + Assert.True(executed); + Assert.Equal("rebuilt", result); + + await recorder.Batcher.DrainAsync(); + // Already-terminal child must not be re-checkpointed. + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "SUCCEED"); + } + + [Fact] + public async Task RunInChildContextAsync_ReplayChildren_BodyThrows_DoesNotEmitFailCheckpoint() + { + var childOpId = IdAt(1); // first root-level op + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = childOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = "phase", + // Result == "" matches the overflow emission (string.Empty). + ContextDetails = new ContextDetails { Result = "", ReplayChildren = true } + } + } + }); + + // The op is already terminal (SUCCEEDED). If the overflow re-run body + // throws, the recovery path must NOT re-checkpoint a CONTEXT FAIL over + // the already-SUCCEEDED record — but the exception still propagates. 
+ await Assert.ThrowsAsync(() => + context.RunInChildContextAsync( + async (_) => { await Task.Yield(); throw new InvalidOperationException("nondeterministic re-run"); }, + name: "phase")); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => o.Type == "CONTEXT" && o.Action == "FAIL"); + } + [Fact] public async Task RunInChildContextAsync_SubTypeAndName_PropagateToCheckpoint() { diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs index ab649f150..a49b8488e 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/LambdaDurableServiceClientTests.cs @@ -382,6 +382,26 @@ public async Task GetExecutionStateAsync_MapFromSdkOperation_RoundTripsAllErrorF Assert.Equal(new[] { "at Frame.One()", "at Frame.Two()" }, invError.StackTrace!); } + [Fact] + public void MapFromSdkOperation_CopiesReplayChildren() + { + var sdkOp = new Amazon.Lambda.Model.Operation + { + Id = "ctx-1", + Type = "CONTEXT", + Status = "SUCCEEDED", + ContextDetails = new Amazon.Lambda.Model.ContextDetails + { + Result = "{}", + ReplayChildren = true + } + }; + + var mapped = LambdaDurableServiceClient.MapFromSdkOperationForTest(sdkOp); + + Assert.True(mapped.ContextDetails!.ReplayChildren); + } + [Fact] public async Task CheckpointAsync_ReturnsNewToken() { diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs new file mode 100644 index 000000000..0e796e7a0 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/MapOperationTests.cs @@ -0,0 +1,777 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using 
Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class MapOperationTests +{ + /// Reproduces the Id that emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under . + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. + var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FreshExecution_AllItemsSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var items = new[] { 10, 20, 30 }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => { await Task.Yield(); return item * 2; }, + name: "double_all"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + 
Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 20, 40, 60 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 item CONTEXT STARTs + 3 item CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Map:START", contextActions[0]); + Assert.Equal("Map:SUCCEED", contextActions[^1]); + } + + [Fact] + public async Task MapAsync_PassesItemIndexAndFullList_ToCallback() + { + var (context, _, _, _) = CreateContext(); + + var items = new[] { "a", "b", "c" }; + + var result = await context.MapAsync( + items, + async (ctx, item, index, all) => + { + await Task.Yield(); + // Confirm the callback sees the item, its index, and the whole list. + Assert.Same(items, all); + Assert.Equal(items[index], item); + return $"{index}:{item}:{all.Count}"; + }); + + Assert.Equal(new[] { "0:a:3", "1:b:3", "2:c:3" }, result.GetResults()); + } + + [Fact] + public async Task MapAsync_PreservesIndexOrder_EvenWhenItemsCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 40, 10, 20 }, + async (ctx, delay, index, all) => { await Task.Delay(delay); return index + 1; }); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task MapAsync_ItemOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { "a", "b" }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstItemId = ChildIdAt(parentOpId, 1); + var secondItemId = ChildIdAt(parentOpId, 2); + + var itemStarts = recorder.Flushed + .Where(o 
=> o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .ToArray(); + Assert.Equal(2, itemStarts.Length); + Assert.Contains(itemStarts, o => o.Id == firstItemId); + Assert.Contains(itemStarts, o => o.Id == secondItemId); + } + + [Fact] + public async Task MapAsync_DefaultNaming_UsesIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task MapAsync_ItemNamer_PropagatesNameToCheckpointAndItem() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { "order-1", "order-2" }, + async (ctx, item, index, all) => { await Task.Yield(); return item.Length; }, + name: "process_orders", + config: new MapConfig { ItemNamer = (item, index) => $"Order-{item}" }); + + Assert.Equal("Order-order-1", result.All[0].Name); + Assert.Equal("Order-order-2", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var itemSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-1"); + Assert.Contains(itemSucceeds, o => o.Name == "Order-order-2"); + } + + [Fact] + public async Task MapAsync_EmptyCollection_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + Array.Empty(), + async (ctx, item, index, all) => { await Task.Yield(); return item; }); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. 
+ var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Map:START", "Map:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — Map's permissive default vs fail-fast opt-in + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_AllCompletedDefault_PartialFailureDoesNotThrow() + { + // Map's default CompletionConfig is AllCompleted() (permissive), unlike + // Parallel's AllSuccessful(). A single item failure is captured rather + // than thrown. + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("oops"); + return item; + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task MapAsync_AllSuccessfulOptIn_OneFailureThrowsMapException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("item boom"); + return item; + }, + config: new MapConfig { CompletionConfig = CompletionConfig.AllSuccessful() })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + 
public async Task MapAsync_ThrowIfError_ThrowsUnderPermissiveDefault() + { + // The permissive default does not auto-throw; ThrowIfError is the + // explicit strict-success check. + var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item == 2) throw new InvalidOperationException("boom"); + return item; + }); + + Assert.True(result.HasFailure); + var thrown = Assert.ThrowsAny(() => result.ThrowIfError()); + Assert.Contains("boom", thrown.Message); + } + + [Fact] + public async Task MapAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => + { + await Task.Yield(); + if (item != 3) throw new InvalidOperationException($"fail-{item}"); + return item; + }, + config: new MapConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first/min-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so dispatch order is deterministic: item 0 fires + // first and succeeds; items 1 and 2 are never dispatched and remain + // BatchItemStatus.Started. 
+ var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var result = await context.MapAsync( + new[] { 1, 2, 3, 4, 5 }, + async (ctx, item, index, all) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return item; + }, + config: new MapConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + } + + [Fact] + public async Task MapAsync_MaxConcurrencyAtLeastItemCount_RunsWithoutSemaphore() + { + // MaxConcurrency >= item count exercises the no-semaphore optimization + // path; behavior must be identical (all items still run). 
+ var (context, _, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }, + config: new MapConfig { MaxConcurrency = 10 }); + + Assert.Equal(3, result.SuccessCount); + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + } + + [Fact] + public void MapConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new MapConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + [Fact] + public void MapConfig_DefaultCompletionConfig_IsAllCompleted() + { + // Guards the intentional divergence from ParallelConfig (AllSuccessful). + var config = new MapConfig(); + // AllCompleted() == empty CompletionConfig (no failure thresholds). + Assert.Null(config.CompletionConfig.ToleratedFailureCount); + Assert.Null(config.CompletionConfig.MinSuccessful); + Assert.Null(config.CompletionConfig.ToleratedFailurePercentage); + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NestingTypeFlat_SuppressesPerItemContextOps() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item * 10; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Map CONTEXT ops still emitted; no per-item CONTEXT ops under Flat. 
+ var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Map") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "MapItem")); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_InnerOpsReparentToMapOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => + await ctx.StepAsync(async (_) => { await Task.Yield(); return item * 10; }), + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var item0Id = ChildIdAt(parentOpId, 1); + var item1Id = ChildIdAt(parentOpId, 2); + var step0Id = ChildIdAt(item0Id, 1); + var step1Id = ChildIdAt(item1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. + var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // Inner steps re-parent to the MAP op (nearest non-virtual ancestor). 
+ Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task MapAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"10"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"20"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "doubler", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { executed = true; await Task.Yield(); return item * 999; }, + name: "doubler", + config: new MapConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_NullItems_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync( + null!, + async (ctx, item, index, all) => { await Task.Yield(); return item; })); + } + + [Fact] + public async Task MapAsync_NullFunc_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.MapAsync(new[] { 1 }, (Func, Task>)null!)); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // 
────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "double_all", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "double_all"); + + // Cached results returned without re-executing the callback. 
+ Assert.Equal(0, calls); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + var i1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = i1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Item 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { calls++; await Task.Yield(); return 999; }, + name: "m"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task MapAsync_ReplayFailed_RebuildsResultAndThrows() + { + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.MapItem, + Name = "0", + ContextDetails = new ContextDetails + { + Error = new ErrorObject { ErrorMessage = "stored failure", ErrorType = "System.InvalidOperationException" } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + } + + [Fact] + public async Task 
MapAsync_ReplayWithDriftedItemName_ThrowsNonDeterministic() + { + // A checkpointed item name that differs from the current ItemNamer output + // indicates the item set was reordered/renamed between deployments. + var parentOpId = IdAt(1); + var i0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Map, + Name = "m", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = i0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.MapItem, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.MapAsync( + new[] { 1 }, + async (ctx, item, index, all) => { await Task.Yield(); return 999; }, + name: "m", + // Namer now yields "renamed" instead of the checkpointed "alpha". + config: new MapConfig { ItemNamer = (item, index) => "renamed" })); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task MapAsync_TwoFreshRuns_ProduceIdenticalItemOperationIds() + { + // Item operation IDs are derived from the parent op ID + index, so two + // independent fresh runs of the same workflow shape must emit the same + // child IDs (the foundation of replay correctness). 
+ string[] IdsFromRun() + { + var (context, recorder, _, _) = CreateContext(); + context.MapAsync( + new[] { 1, 2, 3 }, + async (ctx, item, index, all) => { await Task.Yield(); return item; }).GetAwaiter().GetResult(); + recorder.Batcher.DrainAsync().GetAwaiter().GetResult(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "MapItem" && o.Action == "START") + .Select(o => o.Id) + .OrderBy(id => id) + .ToArray(); + } + + var first = IdsFromRun(); + var second = IdsFromRun(); + + Assert.Equal(3, first.Length); + Assert.Equal(first, second); + } +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs new file mode 100644 index 000000000..d2b902a3e --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ParallelOperationTests.cs @@ -0,0 +1,1576 @@ +using Amazon.Lambda.DurableExecution; +using Amazon.Lambda.DurableExecution.Internal; +using Amazon.Lambda.Serialization.SystemTextJson; +using Amazon.Lambda.TestUtilities; +using Xunit; + +namespace Amazon.Lambda.DurableExecution.Tests; + +public class ParallelOperationTests +{ + /// Reproduces the Id that emits for the n-th root-level operation. + private static string IdAt(int position) => OperationIdGenerator.HashOperationId(position.ToString()); + + /// The hashed ID of the n-th child operation under . + private static string ChildIdAt(string parentOpId, int position) => + OperationIdGenerator.HashOperationId($"{parentOpId}-{position}"); + + private static (DurableContext context, RecordingBatcher recorder, TerminationManager tm, ExecutionState state) + CreateContext(InitialExecutionState? initialState = null) + { + var state = new ExecutionState(); + state.LoadFromCheckpoint(initialState); + var tm = new TerminationManager(); + var idGen = new OperationIdGenerator(); +#pragma warning disable AWSLAMBDA001 // TestLambdaContext.Serializer is experimental. 
+ var lambdaContext = new TestLambdaContext { Serializer = new DefaultLambdaJsonSerializer() }; +#pragma warning restore AWSLAMBDA001 + var recorder = new RecordingBatcher(); + var context = new DurableContext(state, tm, idGen, "arn:test", lambdaContext, recorder.Batcher); + return (context, recorder, tm, state); + } + + // ────────────────────────────────────────────────────────────────────── + // Public surface — basic happy paths + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FreshExecution_AllBranchesSucceed() + { + var (context, recorder, tm, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Yield(); return 10; }, + async (ctx) => { await Task.Yield(); return 20; }, + async (ctx) => { await Task.Yield(); return 30; }, + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.False(tm.IsTerminated); + Assert.Equal(3, result.TotalCount); + Assert.Equal(3, result.SuccessCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(0, result.StartedCount); + Assert.False(result.HasFailure); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Parent CONTEXT START + 3 child CONTEXT STARTs + 3 child CONTEXT SUCCEEDs + Parent CONTEXT SUCCEED + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(8, contextActions.Length); + Assert.Equal("Parallel:START", contextActions[0]); + Assert.Equal("Parallel:SUCCEED", contextActions[^1]); + } + + [Fact] + public async Task ParallelAsync_PreservesIndexOrder_EvenWhenBranchesCompleteOutOfOrder() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (ctx) => { await Task.Delay(40); return 1; }, + async (ctx) => { await Task.Delay(10); return 2; }, 
+ async (ctx) => { await Task.Delay(20); return 3; }, + }; + + var result = await context.ParallelAsync(branches); + + Assert.Equal(new[] { 1, 2, 3 }, result.GetResults()); + for (var i = 0; i < result.All.Count; i++) + { + Assert.Equal(i, result.All[i].Index); + } + } + + [Fact] + public async Task ParallelAsync_BranchOperationIds_AreDeterministic() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return "a"; }, + async (_) => { await Task.Yield(); return "b"; }, + }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var firstBranchId = ChildIdAt(parentOpId, 1); + var secondBranchId = ChildIdAt(parentOpId, 2); + + // Each branch's CONTEXT START should hit the deterministic child ID. + var branchStarts = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .ToArray(); + Assert.Equal(2, branchStarts.Length); + Assert.Contains(branchStarts, o => o.Id == firstBranchId); + Assert.Contains(branchStarts, o => o.Id == secondBranchId); + } + + [Fact] + public async Task ParallelAsync_NamedBranches_PropagateNameToCheckpointAndItem() + { + var (context, recorder, _, _) = CreateContext(); + + var branches = new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 1; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 2; }), + }; + + var result = await context.ParallelAsync(branches, name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + + await recorder.Batcher.DrainAsync(); + + var branchSucceeds = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "SUCCEED") + .ToArray(); + Assert.Contains(branchSucceeds, o => o.Name == "alpha"); + Assert.Contains(branchSucceeds, o => o.Name == "beta"); + } + + [Fact] + public async Task 
ParallelAsync_UnnamedOverload_DefaultsToIndexAsName() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }); + + Assert.Equal("0", result.All[0].Name); + Assert.Equal("1", result.All[1].Name); + } + + [Fact] + public async Task ParallelAsync_EmptyBranches_ReturnsEmptyResultWithAllCompleted() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync(Array.Empty>>()); + + Assert.Equal(0, result.TotalCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Even the empty case still flushes parent START + parent SUCCEED. + var contextActions = recorder.Flushed.Where(o => o.Type == "CONTEXT") + .Select(o => $"{o.SubType}:{o.Action}").ToArray(); + Assert.Equal(new[] { "Parallel:START", "Parallel:SUCCEED" }, contextActions); + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — failure tolerance + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_AllSuccessfulDefault_OneFailureThrowsParallelException() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync(new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("branch boom"); }, + async (_) => { await Task.Yield(); return 3; }, + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + var typed = Assert.IsAssignableFrom>(ex.Result); + Assert.Equal(1, typed.FailureCount); + Assert.Equal(2, typed.SuccessCount); + } + + [Fact] + public async Task ParallelAsync_AllCompleted_PartialFailureDoesNotThrow() + { + var (context, _, _, _) = 
CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("oops"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + + var errors = result.GetErrors(); + Assert.Single(errors); + Assert.Contains("oops", errors[0].Message); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_AllowsUpToThreshold() + { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 2 fail; tolerated = 2 (>= failures), so resolves without + // throwing. + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 2 } + }); + + Assert.Equal(2, result.FailureCount); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailureCount_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("fail-2"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + 
config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public async Task ParallelAsync_ToleratedFailurePercentage_ExceededThrows() + { + var (context, _, _, _) = CreateContext(); + + // 4 branches, 3 fail (75%) > 0.5 (50%) → exceeded. + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("f1"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f2"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("f3"); }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + CompletionConfig = new CompletionConfig { ToleratedFailurePercentage = 0.5 } + })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + } + + [Fact] + public void CompletionConfig_ToleratedFailurePercentage_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailurePercentage = 1.5); + Assert.Throws(() => config.ToleratedFailurePercentage = -0.1); + // boundary values are accepted + config.ToleratedFailurePercentage = 0.0; + config.ToleratedFailurePercentage = 1.0; + config.ToleratedFailurePercentage = null; + } + + [Fact] + public void CompletionConfig_MinSuccessful_OutOfRange_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.MinSuccessful = 0); + Assert.Throws(() => config.MinSuccessful = -1); + // 1 is the minimum meaningful value; null clears the criterion. 
+ config.MinSuccessful = 1; + config.MinSuccessful = null; + } + + [Fact] + public void CompletionConfig_ToleratedFailureCount_Negative_Throws() + { + var config = new CompletionConfig(); + Assert.Throws(() => config.ToleratedFailureCount = -1); + // zero (fail-fast) and positive counts are valid; null clears the criterion. + config.ToleratedFailureCount = 0; + config.ToleratedFailureCount = 5; + config.ToleratedFailureCount = null; + } + + // ────────────────────────────────────────────────────────────────────── + // CompletionConfig — first-successful short-circuit + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_FirstSuccessful_ResolvesAfterFirstSuccess() + { + var (context, _, _, _) = CreateContext(); + + // MaxConcurrency = 1 so we know the dispatch order is deterministic: + // branch 0 fires first and succeeds; branches 1 and 2 are never + // dispatched at all, so they remain in BatchItemStatus.Started. + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = CompletionConfig.FirstSuccessful() + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(1, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + Assert.Equal(0, result.FailureCount); + Assert.Equal(3, result.TotalCount); + + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + } + + [Fact] + public async Task ParallelAsync_MinSuccessful_ResolvesWhenTargetReached() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await 
Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + async (_) => { await Task.Yield(); return 4; }, + }, + config: new ParallelConfig + { + MaxConcurrency = 1, + CompletionConfig = new CompletionConfig { MinSuccessful = 2 } + }); + + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(2, result.StartedCount); + } + + // ────────────────────────────────────────────────────────────────────── + // MaxConcurrency + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_MaxConcurrency_LimitsInFlight() + { + var (context, _, _, _) = CreateContext(); + + var inFlight = 0; + var maxObserved = 0; + var lockObj = new object(); + + var branches = new Func>[] + { + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + MakeBranch(), + }; + + var result = await context.ParallelAsync(branches, config: new ParallelConfig { MaxConcurrency = 2 }); + + Assert.Equal(5, result.SuccessCount); + Assert.True(maxObserved <= 2, $"Observed concurrency {maxObserved} exceeded MaxConcurrency = 2"); + + Func> MakeBranch() + { + return async (_) => + { + lock (lockObj) + { + inFlight++; + if (inFlight > maxObserved) maxObserved = inFlight; + } + await Task.Delay(20); + lock (lockObj) inFlight--; + return 1; + }; + } + } + + [Fact] + public void ParallelConfig_MaxConcurrency_OutOfRange_Throws() + { + var config = new ParallelConfig(); + Assert.Throws(() => config.MaxConcurrency = 0); + Assert.Throws(() => config.MaxConcurrency = -1); + config.MaxConcurrency = 1; + config.MaxConcurrency = null; + } + + // ────────────────────────────────────────────────────────────────────── + // NestingType + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_SuppressesPerBranchContextOps() + { + var 
(context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); return 20; }, + async (_) => { await Task.Yield(); return 30; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.Equal(new[] { 10, 20, 30 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // Parent Parallel CONTEXT ops are still emitted (the parent is never + // virtual)... + var parentActions = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "Parallel") + .Select(o => $"{o.Action}").ToArray(); + Assert.Equal(new[] { "START", "SUCCEED" }, parentActions); + + // ...but NO per-branch CONTEXT ops are emitted under Flat. + var branchOps = recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch") + .ToArray(); + Assert.Empty(branchOps); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_InnerOpsReparentToParallelOp() + { + var (context, recorder, _, _) = CreateContext(); + + await context.ParallelAsync( + new Func>[] + { + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 1; }), + async (ctx) => await ctx.StepAsync(async (_) => { await Task.Yield(); return 2; }), + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + await recorder.Batcher.DrainAsync(); + + var parentOpId = IdAt(1); + var branch0Id = ChildIdAt(parentOpId, 1); + var branch1Id = ChildIdAt(parentOpId, 2); + + // Each branch's inner STEP is ID-derived from the branch op id (so the + // two branches' first steps don't collide)... + var step0Id = ChildIdAt(branch0Id, 1); + var step1Id = ChildIdAt(branch1Id, 1); + + // A step emits both START and SUCCEED under the same Id; scope to START + // so we assert on exactly one record per step. 
+ var steps = recorder.Flushed + .Where(o => o.Type == "STEP" && $"{o.Action}" == "START").ToArray(); + var step0 = Assert.Single(steps, o => o.Id == step0Id); + var step1 = Assert.Single(steps, o => o.Id == step1Id); + + // ...but each inner step re-parents to the PARALLEL op (the nearest + // non-virtual ancestor), NOT to the virtual branch (which emitted no + // checkpoint to reference). + Assert.Equal(parentOpId, step0.ParentId); + Assert.Equal(parentOpId, step1.ParentId); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_PartialFailure_SurfacesInlineErrors() + { + var (context, recorder, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("flat boom"); }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = CompletionConfig.AllCompleted() + }); + + Assert.True(result.HasFailure); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.FailureCount); + Assert.Equal(new[] { 1, 3 }, result.GetResults()); + Assert.Contains("flat boom", result.GetErrors()[0].Message); + + await recorder.Batcher.DrainAsync(); + + // The parent SUCCEED payload carries the inline per-unit results/errors; + // no per-branch FAIL op was emitted. + Assert.Empty(recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "ParallelBranch")); + } + + [Fact] + public async Task ParallelAsync_Flat_ResultOverThreshold_StripsInlineResultsAndSetsReplayChildren() + { + var (context, recorder, _, _) = CreateContext(); + + // Each branch returns a ~200 KB string; the summary with both inline + // exceeds the 256 KB checkpoint threshold. 
+ var big = new string('x', 200 * 1024); + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return big; }, + async (_) => { await Task.Yield(); return big; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // In-memory result for the current invoke still carries the full values. + Assert.Equal(2, result.SuccessCount); + Assert.All(result.GetResults(), r => Assert.Equal(big, r)); + + await recorder.Batcher.DrainAsync(); + + var parentSucceed = recorder.Flushed.Single(o => + o.Type == "CONTEXT" && o.SubType == "Parallel" && o.Action == "SUCCEED"); + + // Overflow: ReplayChildren flag set, payload stripped under the threshold. + Assert.NotNull(parentSucceed.ContextOptions); + Assert.True(parentSucceed.ContextOptions.ReplayChildren); + Assert.True(System.Text.Encoding.UTF8.GetByteCount(parentSucceed.Payload) + <= Amazon.Lambda.DurableExecution.Internal.DurableConstants.MaxOperationCheckpointBytes); + // Stripped summary keeps statuses but not the big inline results. + Assert.DoesNotContain(big, parentSucceed.Payload); + Assert.Contains("SUCCEEDED", parentSucceed.Payload); + } + + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_ReExecutesBodiesWithoutRecheckpointing() + { + var parentOpId = IdAt(1); + + // Stripped summary: statuses present, NO inline Result values. 
+ var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 200; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + // Bodies re-executed (values recovered), statuses/reason from frozen summary. + Assert.Equal(2, executions); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + + // The parent is already terminal in state — replay must NOT re-emit a + // parent CONTEXT SUCCEED/FAIL. + Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_SkipsStartedUnits_ReExecutesCompletedOnly() + { + var parentOpId = IdAt(1); + + // Stripped summary: two units short-circuited the run with MinSuccessful=2 + // (SUCCEEDED, SUCCEEDED), the third was never dispatched (STARTED). On + // overflow replay only the two completed units re-execute; the started + // unit's body must NOT run. 
+ var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var startedBodyRan = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 200; }, + async (_) => { startedBodyRan = true; Interlocked.Increment(ref executions); await Task.Yield(); return 300; }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = new CompletionConfig { MinSuccessful = 2 } + }); + + // Only the two SUCCEEDED unit bodies re-execute; the STARTED unit is skipped. + Assert.Equal(2, executions); + Assert.False(startedBodyRan); + + // Per-item statuses come from the frozen summary. + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + + // Recovered values for the two succeeded units. 
+ Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + + [Fact] + public async Task ParallelAsync_Flat_ReplayChildren_ReExecutesFailedUnit_RecoversError() + { + var parentOpId = IdAt(1); + + // Stripped summary: one SUCCEEDED, one FAILED. Errors were stripped on + // overflow, so re-execution recovers them. Tolerated-failure config keeps + // the run from throwing. + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"FAILED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails + { + Result = summaryJson, + ReplayChildren = true + } + } + } + }); + + var executions = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); return 100; }, + async (_) => { Interlocked.Increment(ref executions); await Task.Yield(); throw new InvalidOperationException("flat boom"); }, + }, + name: "fanout", + config: new ParallelConfig + { + NestingType = NestingType.Flat, + CompletionConfig = new CompletionConfig { ToleratedFailureCount = 1 } + }); + + // Both bodies re-execute to recover the value and the error. 
+ Assert.Equal(2, executions); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Failed, result.All[1].Status); + Assert.Equal(100, result.All[0].Result); + Assert.NotNull(result.All[1].Error); + Assert.Contains("flat boom", result.All[1].Error!.Message); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.DoesNotContain(recorder.Flushed, o => + o.Type == "CONTEXT" && o.SubType == "Parallel"); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplaySucceeded_RebuildsFromInlinePayload() + { + var parentOpId = IdAt(1); + + // Flat replay reads per-unit results from the inline summary payload — + // there are NO per-branch child CONTEXT ops in state. + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","Result":"100"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat }); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_NestingTypeFlat_ReplayFailed_ThrowsWithInlineError() + { + 
var parentOpId = IdAt(1); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED","Error":{"ErrorType":"System.InvalidOperationException","ErrorMessage":"flat branch 0 failed"}}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","Result":"200"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout", + config: new ParallelConfig { NestingType = NestingType.Flat })); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + var typed = (IBatchResult)ex.Result!; + Assert.Equal(1, typed.FailureCount); + Assert.Contains("flat branch 0 failed", typed.GetErrors()[0].Message); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplaySucceeded_RebuildsResultFromCheckpoints() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED","OperationId":"placeholder1"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + 
SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "100" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "200" } + } + } + }); + + var executed = false; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { executed = true; await Task.Yield(); return 999; }, + async (_) => { executed = true; await Task.Yield(); return 999; }, + }, + name: "fanout"); + + Assert.False(executed); + Assert.Equal(new[] { 100, 200 }, result.GetResults()); + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_ReplayFailed_ThrowsParallelException() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"FAILURE_TOLERANCE_EXCEEDED","Units":[ + {"Index":0,"Name":"0","Status":"FAILED","OperationId":"placeholder0"}, + {"Index":1,"Name":"1","Status":"FAILED","OperationId":"placeholder1"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new 
ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 0 failed" + } + } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Failed, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails + { + Error = new ErrorObject + { + ErrorType = "System.InvalidOperationException", + ErrorMessage = "branch 1 failed" + } + } + } + } + }); + + var ex = await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + }, + name: "fanout")); + + Assert.Equal(CompletionReason.FailureToleranceExceeded, ex.CompletionReason); + Assert.NotNull(ex.Result); + + var typed = (IBatchResult)ex.Result!; + Assert.Equal(2, typed.FailureCount); + Assert.Contains("branch 0 failed", typed.GetErrors()[0].Message); + } + + [Fact] + public async Task ParallelAsync_ReplayStarted_ReExecutesBranches() + { + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Started, + SubType = OperationSubTypes.Parallel, + Name = "fanout" + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "11" } + } + } + }); + + var calls = new int[2]; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls[0]++; await Task.Yield(); return 99; }, + async (_) => { calls[1]++; await Task.Yield(); return 22; }, + }, + name: "fanout"); + + // Branch 0 replays cached value (not re-executed); branch 1 runs fresh. 
+ Assert.Equal(0, calls[0]); + Assert.Equal(1, calls[1]); + Assert.Equal(new[] { 11, 22 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + + // Critical: do NOT re-checkpoint parent CONTEXT START (the original + // STARTED record is still authoritative). + var parentStarts = recorder.Flushed.Where(o => + o.Type == "CONTEXT" && o.SubType == "Parallel" && o.Action == "START").ToArray(); + Assert.Empty(parentStarts); + } + + [Fact] + public async Task ParallelAsync_ReplayUnknownStatus_ThrowsNonDeterministic() + { + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = IdAt(1), + Type = OperationTypes.Context, + Status = "BOGUS", + SubType = OperationSubTypes.Parallel, + Name = "fanout" + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new Func>[] { async (_) => { await Task.Yield(); return 1; } }, + name: "fanout")); + } + + // ────────────────────────────────────────────────────────────────────── + // IBatchResult helpers + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task BatchResult_ThrowIfError_ThrowsFirstError() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("kaboom"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("kaboom", ex.Message); + } + + [Fact] + public async Task BatchResult_GetResults_SkipsFailedAndStartedItems() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 10; }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("ouch"); }, + async (_) => { await Task.Yield(); 
return 30; }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 10, 30 }, result.GetResults()); + } + + [Fact] + public async Task BatchResult_AllSucceededFailedStarted_AreInOriginalIndexOrder() + { + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, // index 0 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-1"); }, // index 1 fail + async (_) => { await Task.Yield(); return 3; }, // index 2 succeed + async (_) => { await Task.Yield(); throw new InvalidOperationException("bad-3"); }, // index 3 fail + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.AllCompleted() }); + + Assert.Equal(new[] { 0, 2 }, result.Succeeded.Select(i => i.Index).ToArray()); + Assert.Equal(new[] { 1, 3 }, result.Failed.Select(i => i.Index).ToArray()); + Assert.Empty(result.Started); + } + + // ────────────────────────────────────────────────────────────────────── + // Argument validation + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_NullBranches_Throws() + { + var (context, _, _, _) = CreateContext(); + + await Assert.ThrowsAsync(() => + context.ParallelAsync((IReadOnlyList>>)null!)); + } + + [Fact] + public async Task ParallelAsync_NullBranchInList_Throws() + { + var (context, _, _, _) = CreateContext(); + + var branches = new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + null!, + }; + + await Assert.ThrowsAsync(() => context.ParallelAsync(branches)); + } + + // ────────────────────────────────────────────────────────────────────── + // Concurrency / cancellation regressions (Critical 1, Critical 2) + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task 
ParallelAsync_CancelMidDispatch_AllBranchesSettleAndNoObjectDisposed() + { + // Regression for orphan-branch bug: dispatch 5 branches with + // MaxConcurrency=2; cancel parent CancellationToken right after the + // first batch starts so the dispatcher's semaphore.WaitAsync trips + // OperationCanceledException mid-loop. With the old code branches in + // flight at cancellation time would Release on a disposed semaphore + // and fault as ObjectDisposedException. With the fix the semaphore + // dispose is gated on Task.WhenAll over inFlight, so every dispatched + // task settles cleanly first. + var (context, _, _, _) = CreateContext(); + + using var cts = new CancellationTokenSource(); + var dispatchedReady = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + var dispatchedCount = 0; + var lockObj = new object(); + var capturedExceptions = new List(); + var unobservedCount = 0; + + EventHandler handler = (_, args) => + { + lock (lockObj) + { + Interlocked.Increment(ref unobservedCount); + capturedExceptions.Add(args.Exception); + } + }; + TaskScheduler.UnobservedTaskException += handler; + + try + { + var branches = new Func>[5]; + for (var i = 0; i < 5; i++) + { + branches[i] = async (_) => + { + int n; + lock (lockObj) n = ++dispatchedCount; + if (n == 2) dispatchedReady.TrySetResult(); + // Hold the branch long enough that cancellation arrives + // while we're in flight. + try { await Task.Delay(200, cts.Token).ConfigureAwait(false); } + catch (OperationCanceledException) { /* cooperatively stop */ } + return n; + }; + } + + var run = context.ParallelAsync( + branches, + config: new ParallelConfig + { + MaxConcurrency = 2, + CompletionConfig = CompletionConfig.AllCompleted() + }, + cancellationToken: cts.Token); + + // Wait until 2 branches are running, then cancel — this trips + // the dispatcher on its next semaphore.WaitAsync call. 
+ await dispatchedReady.Task.WaitAsync(TimeSpan.FromSeconds(5)); + cts.Cancel(); + + // The orchestrator should surface OperationCanceledException + // cleanly (NOT ObjectDisposedException) once the in-flight + // branches settle. + var ex = await Assert.ThrowsAnyAsync(() => run); + Assert.IsNotType(ex); + + // Force GC + finalizers so any unobserved exceptions surface. + GC.Collect(); + GC.WaitForPendingFinalizers(); + GC.Collect(); + + Assert.Equal(0, Volatile.Read(ref unobservedCount)); + foreach (var captured in capturedExceptions) + { + Assert.IsNotType(captured); + } + } + finally + { + TaskScheduler.UnobservedTaskException -= handler; + } + } + + [Fact] + public void ExecutionState_ConcurrentTrackReplayAndValidate_NoExceptionsAndConsistent() + { + // Regression for ExecutionState race: 16 tasks call TrackReplay / + // ValidateReplayConsistency / GetOperation concurrently. With the + // unguarded Dictionary/HashSet collections this would either throw + // InvalidOperationException (concurrent enumeration) or produce + // torn reads. Under the lock the ops are serialized and consistent. 
+ var state = new ExecutionState(); + var ops = new List<Operation>(); + var ids = new List<string>(); + for (var i = 0; i < 50; i++) + { + var id = $"op-{i}"; + ids.Add(id); + ops.Add(new Operation + { + Id = id, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + Name = $"name-{i}" + }); + } + state.LoadFromCheckpoint(new InitialExecutionState { Operations = ops }); + + var caught = new List<Exception>(); + var caughtLock = new object(); + var tasks = new Task[16]; + for (var t = 0; t < 16; t++) + { + var seed = t; + tasks[t] = Task.Run(() => + { + try + { + var rng = new Random(seed); + for (var iter = 0; iter < 200; iter++) + { + var id = ids[rng.Next(ids.Count)]; + state.TrackReplay(id); + state.ValidateReplayConsistency(id, OperationTypes.Context, $"name-{id.Substring(3)}"); + _ = state.GetOperation(id); + _ = state.HasOperation(id); + _ = state.IsReplaying; + } + } + catch (Exception ex) + { + lock (caughtLock) caught.Add(ex); + } + }); + } + + // WaitAll returns false on timeout; assert it so a hung or deadlocked worker fails the test here. + Assert.True(Task.WaitAll(tasks, TimeSpan.FromSeconds(30)), "Worker tasks did not complete within 30 seconds."); + Assert.Empty(caught); + + // Once every terminal op has been visited, IsReplaying must be false. + Assert.False(state.IsReplaying); + } + + // ────────────────────────────────────────────────────────────────────── + // Replay determinism / failure modes / mixed-status replay + // ────────────────────────────────────────────────────────────────────── + + [Fact] + public async Task ParallelAsync_ReplayDeterminism_SameWorkflowProducesSameBranchIds() + { + // Run the same workflow shape twice from scratch and assert the + // branch CONTEXT START IDs are byte-identical. This pins the + // determinism contract: the n-th branch's hashed ID is a pure + // function of (root counter position, branch index).
+ async Task RunOnce() + { + var (context, recorder, _, _) = CreateContext(); + await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); return 1; }, + async (_) => { await Task.Yield(); return 2; }, + async (_) => { await Task.Yield(); return 3; }, + }, + name: "fanout"); + await recorder.Batcher.DrainAsync(); + return recorder.Flushed + .Where(o => o.Type == "CONTEXT" && o.SubType == "ParallelBranch" && o.Action == "START") + .Select(o => o.Id!) + .OrderBy(s => s) + .ToArray(); + } + + var run1Ids = await RunOnce(); + var run2Ids = await RunOnce(); + + Assert.Equal(3, run1Ids.Length); + Assert.Equal(run1Ids, run2Ids); + } + + [Fact] + public async Task ParallelAsync_FirstSuccessful_AllFail_AggregatesAsParallelException() + { + // FirstSuccessful() aliases MinSuccessful=1 with no explicit failure + // tolerance. When every branch fails, MinSuccessful is unreachable + // AND there is no failure-tolerance threshold, so the run completes + // as AllCompleted with HasFailure=true. Calling ThrowIfError surfaces + // the first failure; without explicit failure tolerance the parallel + // does NOT throw on its own (matches Python). + var (context, _, _, _) = CreateContext(); + + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { await Task.Yield(); throw new InvalidOperationException("a"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("b"); }, + async (_) => { await Task.Yield(); throw new InvalidOperationException("c"); }, + }, + config: new ParallelConfig { CompletionConfig = CompletionConfig.FirstSuccessful() }); + + Assert.Equal(CompletionReason.AllCompleted, result.CompletionReason); + Assert.Equal(0, result.SuccessCount); + Assert.Equal(3, result.FailureCount); + Assert.True(result.HasFailure); + + // Caller-driven aggregation: ThrowIfError surfaces the first failure. 
+ var ex = Assert.Throws(() => result.ThrowIfError()); + Assert.Contains("a", ex.Message); + } + + [Fact] + public async Task ParallelAsync_ReplayMixedStatus_PreservesStartedShortCircuited() + { + // Parent SUCCEEDED with MinSuccessful short-circuit: branch 0 + // SUCCEEDED, branch 1 SUCCEEDED, branch 2 was never dispatched + // (still STARTED in the summary). Replay must reproduce the original + // BatchResult shape — including the un-dispatched STARTED entry — + // without re-executing any branch. + var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + var b1 = ChildIdAt(parentOpId, 2); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"0","Status":"SUCCEEDED"}, + {"Index":1,"Name":"1","Status":"SUCCEEDED"}, + {"Index":2,"Name":"2","Status":"STARTED"} + ]} + """; + + var (context, recorder, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "0", + ContextDetails = new ContextDetails { Result = "10" } + }, + new() + { + Id = b1, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "1", + ContextDetails = new ContextDetails { Result = "20" } + } + // Branch 2 has no checkpoint at all — it was never dispatched. 
+ } + }); + + var calls = 0; + var result = await context.ParallelAsync( + new Func>[] + { + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + async (_) => { calls++; await Task.Yield(); return 999; }, + }, + name: "fanout"); + + Assert.Equal(0, calls); + Assert.Equal(CompletionReason.MinSuccessfulReached, result.CompletionReason); + Assert.Equal(2, result.SuccessCount); + Assert.Equal(1, result.StartedCount); + Assert.Equal(BatchItemStatus.Succeeded, result.All[0].Status); + Assert.Equal(BatchItemStatus.Succeeded, result.All[1].Status); + Assert.Equal(BatchItemStatus.Started, result.All[2].Status); + Assert.Equal(new[] { 10, 20 }, result.GetResults()); + + await recorder.Batcher.DrainAsync(); + Assert.Empty(recorder.Flushed); + } + + [Fact] + public async Task ParallelAsync_ReplayUsesCheckpointedBranchName_NotCurrentName() + { + // The checkpointed name is authoritative on replay. Even when a branch + // has no per-branch checkpoint (STARTED / never dispatched), the name + // from the parent summary must flow through to the reconstructed item. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"MIN_SUCCESSFUL_REACHED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"}, + {"Index":1,"Name":"beta","Status":"STARTED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + var result = await context.ParallelAsync( + new[] + { + new DurableBranch("alpha", async (_) => { await Task.Yield(); return 999; }), + new DurableBranch("beta", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout"); + + Assert.Equal("alpha", result.All[0].Name); + Assert.Equal("beta", result.All[1].Name); + Assert.Equal(BatchItemStatus.Started, result.All[1].Status); + } + + [Fact] + public async Task ParallelAsync_ReplayWithDriftedBranchName_ThrowsNonDeterministic() + { + // A branch name that differs between the checkpoint and the current + // code indicates the branch set was reordered/renamed between + // deployments — surface it rather than silently reconstructing. 
+ var parentOpId = IdAt(1); + var b0 = ChildIdAt(parentOpId, 1); + + var summaryJson = """ + {"CompletionReason":"ALL_COMPLETED","Units":[ + {"Index":0,"Name":"alpha","Status":"SUCCEEDED"} + ]} + """; + + var (context, _, _, _) = CreateContext(new InitialExecutionState + { + Operations = new List + { + new() + { + Id = parentOpId, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.Parallel, + Name = "fanout", + ContextDetails = new ContextDetails { Result = summaryJson } + }, + new() + { + Id = b0, + Type = OperationTypes.Context, + Status = OperationStatuses.Succeeded, + SubType = OperationSubTypes.ParallelBranch, + Name = "alpha", + ContextDetails = new ContextDetails { Result = "10" } + } + } + }); + + await Assert.ThrowsAsync(() => + context.ParallelAsync( + new[] + { + // Renamed from "alpha" → "renamed" since the checkpoint. + new DurableBranch("renamed", async (_) => { await Task.Yield(); return 999; }), + }, + name: "fanout")); + } + +} diff --git a/MAP-IMPLEMENTATION-PLAN.md b/MAP-IMPLEMENTATION-PLAN.md new file mode 100644 index 000000000..ab6d6e915 --- /dev/null +++ b/MAP-IMPLEMENTATION-PLAN.md @@ -0,0 +1,234 @@ +# MapAsync Implementation Plan (.NET Durable Execution SDK) — Wave 2 + +Tracking: follow-up to `ParallelAsync` (DOTNET-8662). This document is the +agreed plan before any code is written. It captures the cross-SDK research, +the locked-in decisions, and the concrete file-by-file changes. + +--- + +## 1. Background & research summary + +`MapAsync` processes a collection in parallel with configurable concurrency. +It is the sibling of the already-shipped `ParallelAsync`. The design doc +(`Docs/durable-execution-design.md`) specifies the public surface: + +```csharp +Task> MapAsync( + IReadOnlyList items, + Func, Task> func, + string? name = null, + MapConfig? 
config = null, + CancellationToken cancellationToken = default); +``` + +### Cross-SDK findings (Python / JavaScript / Java) + +| Aspect | Python | JavaScript | Java | Conclusion for .NET | +|--------|--------|------------|------|---------------------| +| Map vs Parallel | Siblings over shared `ConcurrentExecutor` | Siblings over shared `executeItemsConcurrently` engine | Siblings over shared `ConcurrencyOperation` | **Extract a shared base; Map & Parallel are thin subclasses.** | +| Per-item callback | `(ctx, item, index, items)` | `(ctx, item, index, array)` | `(item, index, ctx)` | Our design doc uses **`(ctx, item, index, allItems)`** — matches Python/JS (context-first). ✅ | +| Item → branch | 1 item = 1 child context | 1 item = 1 child context | 1 item = 1 child context | Same — reuse `ChildContextOperation` per item. | +| `ItemBatcher` | Config dataclass, **never wired into execution** | **Does not exist** | **Does not exist** | **Remove entirely** (decision below). | +| Default `CompletionConfig` | `all_successful()` (Parallel), permissive (Map) | fail-fast (both) | `allCompleted()` (both) | **Map default = `AllCompleted()`** (Python/Java majority); Parallel stays `AllSuccessful()`. | +| `toleratedFailurePercentage` units | 0–100 | 0–1 | 0–1 | Ours is **0.0–1.0** (already validated in `CompletionConfig`). ✅ | +| Subtypes | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | `MAP` / `MAP_ITERATION` | Add `Map` / `MapItem` constants. | +| Naming | `map-item-{i}` or `item_namer(item,i)` | `map-item-{i}` or `itemNamer(item,i)` | `{name}-iteration-{i}` | `ItemNamer(item, index)`; default = index string (consistent with Parallel's branch naming). | +| Empty collection | empty result, `ALL_COMPLETED` | empty result, `ALL_COMPLETED` | empty result (not replayable) | Empty → empty `BatchResult`, `AllCompleted`. | + +### Locked-in decisions (from user) + +1. **Extract a shared `ConcurrentOperation` base class.** Parallel and Map + become thin subclasses. 
(All three reference SDKs do this.) +2. **Remove `ItemBatcher` entirely** — no reference SDK implements it. Strip it + from `MapConfig` AND from the design doc. +3. **`MapConfig.CompletionConfig` defaults to `AllCompleted()`** (permissive), + matching Python + Java Map. Parallel's `AllSuccessful()` default is correct + and stays as-is (matches Python + JS Parallel). + +### Decisions NOT revisited + +- **Parallel default `AllSuccessful()`** — confirmed correct (Python + JS + majority). Not changing. +- **Empty `CompletionConfig` = permissive in .NET** (vs JS's empty = fail-fast). + Deliberate per DESIGN-QUESTIONS.md Q3 / REVIEW.md. Our model uses explicit + named factories (`AllSuccessful()` = `{ToleratedFailureCount=0}`, + `AllCompleted()` = empty). Map's permissive default is the explicit + `AllCompleted()` factory, so it never depends on the empty-config edge case. +- **One `MapAsync` overload** (not the 4 in the stale DESIGN-QUESTIONS.md). The + shipped serializer model pulls `ILambdaSerializer` from + `ILambdaContext.Serializer` via `LambdaSerializerHelper.GetRequired`, so the + `ICheckpointSerializer` AOT overloads do not apply. The design doc's single + signature is authoritative. + +--- + +## 2. 
Reuse map (what Map borrows from Parallel) + +| Component | Action | +|-----------|--------| +| `DurableOperation` base | Reuse unchanged | +| `ExecutionState` (thread-safe, `_lock`-guarded) | Reuse unchanged — REVIEW.md race already fixed | +| `OperationIdGenerator` / `HashOperationId` | Reuse unchanged — child IDs derived as `Hash($"{OperationId}-{index+1}")` in the base | +| `ChildContextOperation` | Reuse unchanged — each item runs as one child context | +| `BatchResult` / `BatchItem` | Reuse unchanged | +| `IBatchResult` / `IBatchItem` / `BatchItemStatus` | Reuse unchanged | +| `CompletionConfig` / `CompletionReason` / `NestingType` | Reuse unchanged | +| `ParallelSummary` / `ParallelJsonContext` | Generalize into a shared `BatchSummary` (see Step 3) | + +--- + +## 3. Implementation steps (ordered) + +### Step 1 — Extract `ConcurrentOperation` base class +**New file:** `Internal/ConcurrentOperation.cs` + +Move the reusable core out of `Internal/ParallelOperation.cs` (currently +lines 70–637) into an abstract base `ConcurrentOperation<T> : DurableOperation<IBatchResult<T>>`: + +- `StartAsync` — sync-flush parent CONTEXT START (using `ParentSubType`), then `ExecuteItemsAsync`. +- `ReplayAsync` — the 4-way status dispatch (Succeeded → reconstruct; Failed → reconstruct + throw via `BuildException`; Started/Pending → re-execute; else `NonDeterministicExecutionException`). +- `ExecuteItemsAsync` — the full dispatch loop: `SemaphoreSlim` concurrency, the + orphan-task-safe `try/finally` that awaits all in-flight tasks before disposing + the semaphore, short-circuit checks, completion-reason computation, parent + checkpoint, throw-on-tolerance-exceeded. +- `RunUnitAsync(index, ...)` — wraps one unit in a `ChildContextOperation` + (child ID = `Hash($"{OperationId}-{index+1}")`, subtype = `ChildSubType`), + with the existing per-branch exception capture (ChildContextException → Failed + slot; structural DurableExecutionException → rethrow; OCE handling).
+- `ShouldStopDispatching`, `ComputeCompletionReason`, `BranchOutcome` struct, + wire (de)serialization helpers, `DeserializeResult`, `CheckpointParentResultAsync`, + `ReconstructFromCheckpoints` — all move down. + +**Abstract/virtual hooks subclasses implement:** +```csharp +protected abstract int UnitCount; +protected abstract string ParentSubType; // OperationSubTypes.Parallel / .Map +protected abstract string ChildSubType; // .ParallelBranch / .MapItem +protected abstract (string? name, Func<IDurableContext, Task<T>> func) GetUnit(int index); +protected abstract DurableExecutionException BuildException(IBatchResult result); +``` + +`ParallelOperation` then shrinks to: store `branches`, return +`OperationSubTypes.Parallel`/`ParallelBranch`, `GetUnit(i)` → `(branches[i].Name, branches[i].Func)`, +`BuildException` → `ParallelException`. **Existing 193 tests are the regression net.** + +### Step 2 — Operation subtype constants +**Edit:** `Operation.cs` → add to `OperationSubTypes`: +```csharp +public const string Map = "Map"; +public const string MapItem = "MapItem"; +``` + +### Step 3 — Generalize the checkpoint summary +**Edit:** `Internal/ParallelSummary.cs` → rename to shared `BatchSummary` / +`BatchUnitSummary` (or keep names, just broaden the doc comment). The shape +(`CompletionReason` + `[{Index, Name, Status}]`) is identical for both. +**Edit:** `Internal/ParallelJsonContext.cs` → rename to `BatchJsonContext` (one +shared source-gen context). Both subclasses use it via the base. Keeps a single +wire format and avoids drift. + +> Note: REVIEW.md issue #3 — `ParallelBranchSummary.OperationId` is dead. While +> generalizing, drop that field (smaller checkpoints) since reconstruction +> recomputes the ID by index. Confirm it isn't present before removing. + +### Step 4 — `MapConfig` + `MapException` +**New file:** `MapConfig.cs` — mirrors `ParallelConfig`: +- `int? MaxConcurrency` with `<= 0` rejection (same setter as ParallelConfig).
+- `CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted();` ← **the key difference**. +- `NestingType NestingType { get; set; } = NestingType.Nested;` (Flat throws `NotSupportedException` at run). +- `Func<TItem, int, string>? ItemNamer { get; set; }` — receives `(item, index)`, returns the item's branch name. **No `ItemBatcher`.** +- XML doc frames the permissive default as Python/Java parity: "Map collects all results by default; pass `CompletionConfig.AllSuccessful()` for fail-fast." + +**Edit:** `DurableExecutionException.cs` — add `MapException : DurableExecutionException` +mirroring `ParallelException` (carries `IBatchResult? Result`, `CompletionReason`, +the three ctors). Lets `catch` distinguish Map from Parallel failures. + +### Step 5 — `MapOperation` +**New file:** `Internal/MapOperation.cs` — `: ConcurrentOperation<TResult>`. +Holds `items`, `func` (`Func<IDurableContext, TItem, int, IReadOnlyList<TItem>, Task<TResult>>`), +and `ItemNamer`. Implements the hooks: +- `UnitCount => items.Count` +- `ParentSubType => OperationSubTypes.Map`, `ChildSubType => OperationSubTypes.MapItem` +- `GetUnit(i)` → name = `ItemNamer?.Invoke(items[i]!, i) ?? i.ToString(InvariantCulture)`; + func = `ctx => func(ctx, items[i], i, items)` +- `BuildException` → `MapException` + +~60 lines. + +### Step 6 — Wire into the context +**Edit:** `IDurableContext.cs` — add the single `MapAsync` overload +(exact design-doc signature) with XML docs mirroring the `ParallelAsync` style. + +**Edit:** `DurableContext.cs` — add `MapAsync` + private `RunMap` +(mirrors `RunParallel`, lines 206–240): null/empty-arg validation, `Flat` guard, +serializer fetch, construct `MapOperation`, `ExecuteAsync`. Empty `items` → empty +`BatchResult` with `AllCompleted` (handled naturally by the base when `UnitCount == 0`). + +### Step 7 — Tests +**New file:** `test/.../MapOperationTests.cs` — mirror `ParallelOperationTests.cs` +(same `CreateContext` harness with `TestLambdaContext` + `DefaultLambdaJsonSerializer` ++ `RecordingBatcher`).
Cover: +- Happy path (all items succeed, results in index order). +- Per-item failure capture under default `AllCompleted()` → **no throw**, failure in `IBatchResult.Failed`. +- `AllSuccessful()` override → one failure throws `MapException`. +- `ItemNamer` produces expected `IBatchItem.Name`; default naming = index. +- Empty collection → empty result, `AllCompleted`, no parent throw. +- `MaxConcurrency` (incl. the `>= count` no-semaphore optimization). +- `FirstSuccessful()` / `MinSuccessful` short-circuit → unfinished items = `Started`. +- Replay determinism: two fresh runs → identical item operation IDs. +- Replay from parent=SUCCEEDED → reconstruct results from child checkpoints. +- Mixed-status replay (some SUCCEEDED, some STARTED in summary). + +**New (DONE):** integration `test/.../IntegrationTests/TestFunctions/Map*` + +matching `Map*Test.cs`, mirroring the `Parallel*` set: HappyPath, PartialFailure +(permissive-default, the headline Map-vs-Parallel difference), FailureTolerance +(asserts `MapException`), FirstSuccessful, MaxConcurrency, ReplayDeterminism. All +6 function projects and the IntegrationTests assembly compile; the tests deploy +real Lambdas and require live AWS credentials to run. + +Re-run the **full suite on net8.0 + net10.0** to confirm the Step 1 base +extraction did not regress Parallel. + +### Step 8 — Documentation cleanup +**Edit:** `Docs/durable-execution-design.md`: +- Remove all `ItemBatcher` / `Batcher` references: the `MapConfig` block + (~lines 1369–1399), the cross-SDK "Item batching" row (~line 2132), and any + pipeline example using a batcher. +- Correct the `MapConfig.CompletionConfig` default in the doc to `AllCompleted()`. +- Note the (intentional) Parallel `AllSuccessful` vs Map `AllCompleted` default split. + +**Edit (optional):** annotate `DESIGN-QUESTIONS.md` stale bits (the +`ICheckpointSerializer` 4-overload section and any `ItemBatcher` mention) so the +record stays accurate. + +--- + +## 4. 
Intentional divergences (documented, not bugs) + +1. **Map default `AllCompleted()` vs Parallel default `AllSuccessful()`** — each + follows its own reference-SDK majority (Map: Python+Java; Parallel: Python+JS). +2. **One `MapAsync` overload** — superseded the stale 4-overload AOT design. +3. **`MapException`** is its own type (not reused `ParallelException`) so callers + can pattern-match the operation that failed. +4. **No `ItemBatcher`** — does not exist in JS/Java; inert in Python. + +--- + +## 5. File change checklist + +**New:** +- `Internal/ConcurrentOperation.cs` +- `Internal/MapOperation.cs` +- `MapConfig.cs` +- `test/.../MapOperationTests.cs` +- `test/.../IntegrationTests/TestFunctions/Map*` (×~6) + +**Edited:** +- `Internal/ParallelOperation.cs` (slimmed to subclass) +- `Internal/ParallelSummary.cs` → shared `BatchSummary` +- `Internal/ParallelJsonContext.cs` → shared `BatchJsonContext` +- `Operation.cs` (+2 subtype constants) +- `DurableExecutionException.cs` (+`MapException`) +- `IDurableContext.cs` (+`MapAsync` overload + docs) +- `DurableContext.cs` (+`MapAsync` + `RunMap`) +- `Docs/durable-execution-design.md` (remove ItemBatcher, fix default)