aws · GarrettBeatty · May 14, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
@@ -0,0 +1,11 @@
+{
+  "Projects": [
+    {
+      "Name": "Amazon.Lambda.DurableExecution",
+      "Type": "Minor",
+      "ChangelogMessages": [
+        "Implement NestingType.Flat for ParallelAsync and MapAsync (previously threw NotSupportedException). Under Flat, each branch/item runs in a virtual context that emits no per-branch CONTEXT checkpoint; per-branch results and errors are recorded inline on the parent operation's payload, reducing checkpoint volume. Operations inside a flat branch (steps, waits) still checkpoint, re-parented to the parallel/map operation. NestingType.Nested remains the default."
+      ]
+    }
+  ]
+}
@@ -0,0 +1,11 @@
+{
+  "Projects": [
+    {
+      "Name": "Amazon.Lambda.DurableExecution",
+      "Type": "Patch",
+      "ChangelogMessages": [
+        "Add `MapAsync` to `IDurableContext` for processing a collection in parallel with one child context per item and automatic checkpointing. Supports configurable max concurrency, completion policy, and per-item naming via `MapConfig`, returning an `IBatchResult<T>`."
+      ]
+    }
+  ]
+}
@@ -0,0 +1,11 @@
+{
+  "Projects": [
+    {
+      "Name": "Amazon.Lambda.DurableExecution",
+      "Type": "Patch",
+      "ChangelogMessages": [
+        "Add `ParallelAsync` to `IDurableContext` for running multiple workflow branches concurrently with automatic checkpointing. Supports configurable max concurrency, failure tolerance, and first-successful completion via `ParallelConfig`, returning an `IBatchResult<T>`."
+      ]
+    }
+  ]
+}
@@ -559,7 +559,7 @@ For better observability, you can name individual branches (matching the JS SDK
 ```csharp
 // Named branches for easier debugging and testing
 var results = await context.ParallelAsync(
-    new NamedBranch<object>[]
+    new DurableBranch<object>[]
     {
         new("fetch_user", async (ctx) => await ctx.StepAsync(async (step) => await FetchUserData(userId))),
         new("fetch_orders", async (ctx) => await ctx.StepAsync(async (step) => await FetchOrderHistory(userId))),
@@ -1357,22 +1357,21 @@ public class MapConfig
     public int? MaxConcurrency { get; set; }
 
     /// <summary>
-    /// When to consider the operation complete.
+    /// When to consider the operation complete. Defaults to AllCompleted() —
+    /// every item runs regardless of per-item failures, which surface via
+    /// IBatchResult&lt;T&gt;.Failed rather than throwing. This permissive default
+    /// matches the Python and Java SDKs' map operation. It differs intentionally
+    /// from ParallelConfig.CompletionConfig, which defaults to AllSuccessful()
+    /// (fail-fast). For fail-fast map behavior, set this to
+    /// CompletionConfig.AllSuccessful() or call IBatchResult&lt;T&gt;.ThrowIfError().
     /// </summary>
-    public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllSuccessful();
+    public CompletionConfig CompletionConfig { get; set; } = CompletionConfig.AllCompleted();
 
     /// <summary>
     /// How item branches are represented in the checkpoint graph.
     /// </summary>
     public NestingType NestingType { get; set; } = NestingType.Nested;
 
-    /// <summary>
-    /// Optional batching configuration for grouping items before processing.
-    /// When set, items are grouped into batches and each batch is processed as a unit.
-    /// Reduces checkpoint overhead for large collections.
-    /// </summary>
-    public ItemBatcher? Batcher { get; set; }
-
     /// <summary>
     /// Optional function to generate a custom name for each item's branch.
     /// Improves observability in execution traces. Receives the item and its index.
@@ -1381,30 +1380,20 @@ public class MapConfig
     public Func<object, int, string>? ItemNamer { get; set; }
 }
 
-/// <summary>
-/// Groups items into batches for map operations to reduce checkpoint overhead.
-/// At least one of MaxItemsPerBatch or MaxBytesPerBatch must be set.
-/// </summary>
-public class ItemBatcher
-{
-    /// <summary>
-    /// Maximum number of items per batch. Null = no count limit.
-    /// </summary>
-    public int? MaxItemsPerBatch { get; set; }
-
-    /// <summary>
-    /// Maximum serialized size (bytes) per batch. Null = no size limit.
-    /// </summary>
-    public int? MaxBytesPerBatch { get; set; }
-}
-
 /// <summary>
 /// Defines completion criteria for parallel/map operations.
 /// </summary>
 public class CompletionConfig
 {
     public int? MinSuccessful { get; set; }
     public int? ToleratedFailureCount { get; set; }
+    /// <summary>
+    /// Maximum tolerated failure ratio, expressed as a value in the range
+    /// <c>0.0</c> to <c>1.0</c> (inclusive). For example, <c>0.25</c> means
+    /// "tolerate up to 25% failures; fail when the failure ratio strictly
+    /// exceeds 25%". <c>null</c> = no ratio-based threshold. Validated by the
+    /// setter; out-of-range values throw <see cref="ArgumentOutOfRangeException"/>.
+    /// </summary>
     public double? ToleratedFailurePercentage { get; set; }
 
     public static CompletionConfig AllSuccessful() => new() { ToleratedFailureCount = 0 };
@@ -2122,7 +2111,6 @@ All four SDKs expose the same core operations. The differences are naming conven
 | Jitter strategy | `JitterStrategy` enum on `Exponential()` | `jitter_strategy` on `RetryStrategyConfig` | `jitter` on `createRetryStrategy()` |
 | Retry presets | `RetryStrategy.None/Default/Transient` | `RetryPresets.none()/default()/transient()` | `retryPresets.default/linear/noRetry` |
 | Nesting type | `NestingType` on `ParallelConfig`/`MapConfig` | `NestingType` on parallel/map config | `NestingType` on parallel/map config |
-| Item batching | `ItemBatcher` on `MapConfig` | `ItemBatcher` on `MapConfig` | *(checkpoint manager handles batching)* |
 | Item namer | `ItemNamer` on `MapConfig` | Item naming function on `MapConfig` | `itemNamer` on `MapConfig` |
 | Error mapping | `ErrorMapping` on `ChildContextConfig` | *(typed exception wrapping)* | `errorMapping` on child context config |
 | Message-based retry filter | `retryableMessagePatterns` (regex) | `retryable_errors` (regex) | `retryableErrors` (RegExp[]) |

@@ -0,0 +1,31 @@
+namespace Amazon.Lambda.DurableExecution;
+
+/// <summary>
+/// Status recorded for a single branch/item in an <see cref="IBatchResult{T}"/>.
+/// </summary>
+/// <remarks>
+/// Mirrors the wire-state of the per-branch checkpoint at the moment the batch
+/// resolved. Branches that finished report <see cref="Succeeded"/> or
+/// <see cref="Failed"/>; branches that were not dispatched because a
+/// <see cref="CompletionConfig"/> short-circuit fired report <see cref="Started"/>.
+/// Members carry no explicit values, so <c>default(BatchItemStatus)</c> is <see cref="Succeeded"/>.
+/// </remarks>
+public enum BatchItemStatus
+{
+    /// <summary>
+    /// The branch ran to completion and produced a result.
+    /// </summary>
+    Succeeded,
+
+    /// <summary>
+    /// The branch ran and ended by throwing an exception.
+    /// </summary>
+    Failed,
+
+    /// <summary>
+    /// The branch was not dispatched before the batch's <see cref="CompletionConfig"/>
+    /// resolved (e.g., <see cref="CompletionConfig.FirstSuccessful"/> short-circuited
+    /// before this branch was started), or no per-branch checkpoint exists on replay.
+    /// </summary>
+    Started
+}
@@ -0,0 +1,151 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What this is
+
+`Amazon.Lambda.DurableExecution` is the .NET SDK (preview, 0.x) for resilient, long-running AWS Lambda
+workflows that checkpoint progress after each step and resume after failures or waits. A workflow can run
+for up to ~1 year (the WAIT cap is 31,622,400 seconds) and is only billed for active compute. The SDK is
+client-side glue: the *durable execution service* (part of Lambda) owns the checkpoint store, fires timers,
+and re-invokes the function; this library re-derives in-memory workflow position from the checkpoint history
+the service sends on each invocation. See sibling SDKs (Python/JS/Java) listed in `README.md` for the shared
+model — this SDK deliberately mirrors their semantics.
+
+## Build & test
+
+Targets `net8.0;net10.0` (`DefaultPackageTargets` in `buildtools/common.props`). `TreatWarningsAsErrors` is on
+everywhere, and the main library is `IsTrimmable` with the trim analyzer enabled — keep new code AOT/trim-clean.
+
+```bash
+# Build the library (run from this directory)
+dotnet build
+
+# Unit tests (fast, no AWS). Project: Libraries/test/Amazon.Lambda.DurableExecution.Tests
+dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj
+
+# A single test
+dotnet test ../../test/Amazon.Lambda.DurableExecution.Tests/Amazon.Lambda.DurableExecution.Tests.csproj \
+  --filter "FullyQualifiedName~StepOperationTests"
+
+# Coverage report (requires reportgenerator tool)
+../../test/Amazon.Lambda.DurableExecution.Tests/coverage.sh
+```
+
+Unit tests reach `internal` types via `InternalsVisibleTo` (declared in the `.csproj`). They use
+`Amazon.Lambda.TestUtilities` (`TestLambdaContext`) and the real `SourceGeneratorLambdaJsonSerializer` —
+set `TestLambdaContext.Serializer` so `LambdaSerializerHelper.GetRequired` finds one.
+
+### Integration tests (expensive, real AWS)
+
+`Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests` deploys real Lambdas. Each test builds a
+`TestFunctions/<X>/` project into a container image via **`dotnet publish` + `docker build`**, pushes to ECR,
+creates an IAM role + Lambda (`DurableFunctionDeployment`), invokes it, and tears everything down on dispose.
+Requires Docker, AWS creds (us-east-1), and is slow. Every behavior in `docs/` should have a paired
+integration test under that project. Prefix AWS commands with `unset AWS_PROFILE` to use `[default]` creds.
+
+**Run integration tests against `net10.0`.** The project multi-targets `net8.0;net10.0`; `dotnet test`
+without a framework spins up one testhost per TFM and runs them concurrently, which races two processes on
+the same `TestFunctions/<X>/` build dir. Pin the framework:
+
+```bash
+dotnet test ../../test/Amazon.Lambda.DurableExecution.IntegrationTests/Amazon.Lambda.DurableExecution.IntegrationTests.csproj \
+  -f net10.0 --filter "FullyQualifiedName~MultipleStepsTest"
+```
+
+## Architecture: the replay model
+
+This is the part you must understand before changing anything. Read these together:
+`DurableFunction.cs`, `DurableExecutionHandler.cs`, `DurableContext.cs`, `Internal/DurableOperation.cs`,
+`Internal/ExecutionState.cs`, `Internal/OperationIdGenerator.cs`, `Internal/TerminationManager.cs`.
+
+**Entry point.** The user's Lambda handler delegates to `DurableFunction.WrapAsync<TInput,TOutput>`, which:
+hydrates `ExecutionState` from `invocationInput.InitialExecutionState` (paging the service via `NextMarker`),
+extracts the user payload from the `EXECUTION`-type op, builds a `CheckpointBatcher` + `DurableContext`, runs
+the workflow through `DurableExecutionHandler.RunAsync`, drains checkpoints, and maps the result to a
+`DurableExecutionInvocationOutput` with status **Succeeded / Failed / Pending**.
+
+**Each invocation re-runs the same workflow code.** There is no persisted program counter.
+On re-invocation the user function executes from the top again; each durable call (`StepAsync`, `WaitAsync`,
+etc.) looks up its own checkpoint and either replays the cached result or runs fresh. This is why workflow
+code **must be deterministic** — same operations, same order, same names across deployments.
+
+**Deterministic operation IDs** (`OperationIdGenerator`). Each durable call gets an ID = SHA-256 of
+`"<parentPrefix>-<counter>"`, where the counter is per-context and pre-incremented. The same workflow position
+yields the same opaque ID across replays, so a checkpoint correlates to a call by *position*, not by name —
+renaming a step does **not** change its ID (the human name rides separately on `OperationUpdate.Name`).
+Reordering or adding/removing calls *does* change IDs and breaks replay. `ValidateReplayConsistency` throws
+`NonDeterministicExecutionException` on type/name drift, so a renamed step can still fail replay — keep names
+stable too.
+
+**Suspension is implemented by never completing a Task** (`TerminationManager` + `DurableExecutionHandler`).
+When an op must suspend (wait timer, scheduled retry, pending callback/invoke) it calls
+`Termination.SuspendAndAwait<T>()`, which trips a one-shot signal and returns a Task that *never resolves*.
+`RunAsync` runs the user code via `Task.Run` and races it against `TerminationTask` with `Task.WhenAny`:
+- user task wins → **Succeeded** (or **Failed** if it threw)
+- termination wins → **Pending**; the abandoned user task is GC'd, checkpoints flush, the service fires the
+  timer and re-invokes. On replay the suspended op sees its now-terminal checkpoint and returns normally.
+
+**Operation classes** (`Internal/*Operation.cs`) all extend `DurableOperation<TResult>`. The base's
+`ExecuteAsync` does: `ValidateReplayConsistency` → `TrackReplay` → look up checkpoint → dispatch to
+`StartAsync` (no prior checkpoint) or `ReplayAsync` (checkpoint exists). `StepOperation` is the canonical
+example — read its class doc comment for the full status decision table (Succeeded→cached, Failed→rethrow,
+Pending→re-suspend if retry timer hasn't fired, Started→crash-recovery under `AtMostOncePerRetry`,
+Ready→run next attempt). `DurableContext` is a thin dispatcher: it allocates the op ID, pulls the serializer
+off `ILambdaContext.Serializer`, constructs the right `*Operation`, and calls `ExecuteAsync`.
+
+**Checkpointing** (`CheckpointBatcher`). Outbound `OperationUpdate`s (START/SUCCEED/FAIL/RETRY) are enqueued
+to a background channel worker that batches and flushes them via `LambdaDurableServiceClient` (which wraps
+the `AWSSDK.Lambda` `Checkpoint`/`GetExecutionState` calls). `EnqueueAsync` awaits its batch's flush
+(sync semantics); fire-and-forget callers (e.g. the START checkpoint under the default
+`AtLeastOncePerRetry`) don't await but must observe the Task's exception. Flush errors become a terminal
+error rethrown by the next `EnqueueAsync`/`DrainAsync`. `DurableFunction.IsTerminalCheckpointError`
+classifies SDK errors on the final drain: 4xx (except 429 and stale-token) → **Failed** envelope; 429/5xx/
+network → let it escape so Lambda retries the whole invocation.
+
+**Replay-mode tracking** (`ExecutionState`). `IsReplaying` starts true iff any completed non-`EXECUTION` op
+exists; `TrackReplay` decrements as each is visited and flips to false once the workflow catches up to the
+frontier. `ReplayAwareLogger` uses this to suppress log lines emitted during replay so a 30-step workflow
+re-invoked 30 times logs each line once — **always use `ctx.Logger`**, never `Console.WriteLine`.
+`ExecutionState` is lock-guarded because the batcher worker thread and concurrent parallel/map branches all
+touch it.
+
+### Operations surface (`IDurableContext`)
+
+`StepAsync` (checkpointed code + retries), `WaitAsync` (1s–~1yr timer), `RunInChildContextAsync` (isolated
+sub-workflow checkpointed as one `CONTEXT` op), `CreateCallbackAsync` / `WaitForCallbackAsync` (external
+events; `WaitForCallback` is *composed* from child-context + callback + submitter step — see
+`DurableContext.RunWaitForCallback`), `InvokeAsync` (durable-to-durable chained invoke, qualified ARN
+required), and `ParallelAsync` / `MapAsync` (concurrent branches → `IBatchResult<T>`).
+
+**Nesting (`NestingType`)** matters for parallel/map. `Nested` (default) gives each branch a full `CONTEXT`
+checkpoint. `Flat` runs branches in *virtual* contexts that emit no `CONTEXT` op — inner ops re-parent to the
+parallel/map op via `OperationIdGenerator.CreateVirtualChild(operationId, reportedParentId)`, trading trace
+granularity for fewer checkpoints. The `idPrefix` vs `reportedParentId` split is the subtle part: inner IDs
+always derive from the branch's own op ID (so siblings never collide), but are *reported* under the nearest
+non-virtual ancestor (so they reference a parent that actually exists in the checkpoint store).
+
+### Wire format (`Operation.cs`)
+
+`Operation` and its `*Details` types mirror the service envelope JSON exactly (`[JsonPropertyName]`).
+String constants live in `OperationTypes` (STEP/WAIT/CALLBACK/CHAINED_INVOKE/CONTEXT/EXECUTION),
+`OperationStatuses` (STARTED/SUCCEEDED/FAILED/PENDING/READY/CANCELLED/STOPPED/TIMED_OUT), and
+`OperationSubTypes` (PascalCase finer classifier). Plural type names (`OperationTypes`, not `OperationType`)
+intentionally avoid collision with `AWSSDK.Lambda` model enums.
+
+## Conventions
+
+- **Programming model:** preview supports only the *executable* model — `Main` builds a `LambdaBootstrap`
+  with a handler wrapper and an `ILambdaSerializer`. The serializer is read off `ILambdaContext.Serializer`
+  (a preview API; the project-wide `AWSLAMBDA001` suppression in the `.csproj` is intentional for that
+  reason). All step/result/payload (de)serialization flows through that one registered serializer, so AOT
+  and reflection callers share a single code path — there is no per-call `JsonSerializerContext` argument.
+- **Errors:** durable exceptions carry `ErrorType`/`ErrorData`/`OriginalStackTrace` so a failure can be
+  reconstructed on replay when the live exception object is gone. `StepException`, `ChildContextException`,
+  `CallbackFailedException`/`CallbackTimeoutException`/`CallbackSubmitterException`, `ParallelException`,
+  `MapException`, and `NonDeterministicExecutionException` all derive from `DurableExecutionException`.
+  When adding error-mapping logic, handle *both* the fresh path (`InnerException` is the live exception) and
+  the replay path (`InnerException` is null, `ErrorType` carries the type string) — see
+  `DurableContext.MapWaitForCallbackException` for the pattern.
+- **Public config types** (`StepConfig`, `WaitForCallbackConfig`, `ParallelConfig`, `MapConfig`,
+  `CompletionConfig`, etc.) are nullable optional args; resolve to an effective config inside the dispatcher.
+- Inclusive language is enforced repo-wide (see the user's global rules): no master/slave, whitelist/blacklist.